diff --git a/rtd/.readthedocs.yaml b/rtd/.readthedocs.yaml
new file mode 100644
index 000000000..26efcab11
--- /dev/null
+++ b/rtd/.readthedocs.yaml
@@ -0,0 +1,22 @@
+# .readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Set the version of Python and other tools you might need
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.11"
+
+# Build documentation in the docs/ directory with Sphinx
+sphinx:
+  configuration: rtd/docs/source/conf.py
+
+# We recommend specifying your dependencies to enable reproducible builds:
+# https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
+python:
+  install:
+    - requirements: rtd/docs/requirements.txt
diff --git a/rtd/README.rst b/rtd/README.rst
new file mode 100644
index 000000000..80b6a640f
--- /dev/null
+++ b/rtd/README.rst
@@ -0,0 +1,46 @@
+OVIS-HPC LDMS Documentation
+###########################
+
+This repository hosts all LDMS-related documentation, such as how-to tutorials, getting started with LDMS, Docker Hub links, APIs and much more. The documentation webpage can be found at the `LDMS readthedocs webpage `_.
+
+Contributing to ReadTheDocs
+############################
+Instructions and documentation on how to use ReadTheDocs can be found here:
+`readthedocs Help Guide `_
+
+* Clone the repository:
+
+.. code-block:: RST
+
+   > git clone git@github.com:/ovis-docs.git
+
+* Add any existing file name(s) you will be editing to paper.lock:
+
+.. code-block:: RST
+
+   > vi paper.lock
+
+    | mm/dd |
+
+* Make the necessary changes, update the paper.lock file and push to the repo:
+
+.. code-block:: RST
+
+   > vi paper.lock
+
+   ## remove line
+   > git add 
+   > git commit -m "add message"
+   > git push
+
+Adding A New File
+******************
+For any new RST files created, please include them in rtd/docs/source/index.rst under their corresponding sections. Any RST file not included in index.rst will not appear on the official webpage (i.e., readthedocs).
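+For example, a minimal sketch of a toctree entry (``asf/my-new-file`` is a hypothetical placeholder for your actual file, listed without the ``.rst`` extension under the section it belongs to):
+
+.. code-block:: RST
+
+   .. toctree::
+      :maxdepth: 2
+
+      asf/index
+      asf/my-new-file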
+
+Paper Lock
+************
+This is for claiming any sections you are working on so there is no overlap.
+Please USE paper.lock to indicate if you are editing an existing RST file.
+
diff --git a/rtd/docs/make.bat b/rtd/docs/make.bat
new file mode 100644
index 000000000..9534b0181
--- /dev/null
+++ b/rtd/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/rtd/docs/requirements.txt b/rtd/docs/requirements.txt
new file mode 100644
index 000000000..8893b57d6
--- /dev/null
+++ b/rtd/docs/requirements.txt
@@ -0,0 +1,2 @@
+# compatible with the newest version of Sphinx (v7.2.1)
+sphinx_rtd_theme==1.3.0rc1
diff --git a/rtd/docs/source/asf/asf-quickstart.rst b/rtd/docs/source/asf/asf-quickstart.rst
new file mode 100644
index 000000000..4c96fd6e9
--- /dev/null
+++ b/rtd/docs/source/asf/asf-quickstart.rst
@@ -0,0 +1,130 @@
+AppSysFusion Quick Start
+========================
+
+Create A Simple Analysis
+------------------------
+To start, please create a folder called ``graf_analysis`` in your home directory and copy the following contents into a python file called ``dsosTemplate.py``:
+
+* This is a python analysis that queries the DSOS database and returns a DataFrame of the ``meminfo`` schema metrics along with the ``timestamp``, ``component_id`` and ``job_id``.
+
+dsosTemplate.py:
+
+.. code-block :: python
+
+   import os, sys, traceback
+   import datetime as dt
+   from graf_analysis.grafanaAnalysis import Analysis
+   from sosdb import Sos
+   import pandas as pd
+   import numpy as np
+
+   class dsosTemplate(Analysis):
+       def __init__(self, cont, start, end, schema='meminfo', maxDataPoints=4096):
+           super().__init__(cont, start, end, schema, 1000000)
+
+       def get_data(self, metrics, filters=[], params=None):
+           try:
+               sel = f'select {",".join(metrics)} from {self.schema}'
+               where_clause = self.get_where(filters)
+               order = 'time_job_comp'
+               orderby = 'order_by ' + order
+               self.query.select(f'{sel} {where_clause} {orderby}')
+               res = self.get_all_data(self.query)
+               # Fun stuff here!
+               print(res.head())
+               return res
+           except Exception as e:
+               a, b, c = sys.exc_info()
+               print(str(e)+' '+str(c.tb_lineno))
+
+.. note::
+
+   If you want to use this analysis module in a Grafana dashboard, you will need to ask your administrator to copy your new analysis module(s) into the directory that Grafana points to. This is because Grafana is set up to query analysis modules from one specific directory.
+
+Test Analysis via Terminal Window
+----------------------------------
+You can easily test your module without the Grafana interface by creating a python script that mimics the Grafana query and formats the returned JSON into a timeseries DataFrame or table.
+
+First, create the following file in the same directory as your python analysis (e.g. ``/user/home/graf_analysis/``) and label it ``testDSOSanalysis.py``.
+
+* This python script imitates the Grafana query that calls your analysis module and will return a timeseries DataFrame of the ``Active`` and ``Inactive`` meminfo metrics.
+
+.. code-block :: python
+
+   #!/usr/bin/python3
+
+   import time,sys
+   from sosdb import Sos
+   from grafanaFormatter import DataFormatter
+   from table_formatter import table_formatter
+   from time_series_formatter import time_series_formatter
+   from dsosTemplate import dsosTemplate
+
+   sess = Sos.Session("//config/dsos.conf")
+   cont = ''
+   cont = sess.open(cont)
+
+   model = dsosTemplate(cont, time.time()-300, time.time(), schema='meminfo', maxDataPoints=4096)
+
+   x = model.get_data(['Active','Inactive'], filters=['job_id>0'], params='')
+
+   #fmt = table_formatter(x)
+   fmt = time_series_formatter(x)
+   x = fmt.ret_json()
+   print(x)
+
+.. note::
+
+   You will need to provide the path to the DSOS container and the ``Sos.Session()`` configuration file in order to run this python script. Please see `Python Analysis Creation `_ for more details.
+
+* Next, run the test script with the python version installed; in this case it would be ``python3``:
+
+.. code-block :: bash
+
+   python3 testDSOSanalysis.py
+
+.. note::
+
+   All imports are python scripts that need to reside in the same directory as the test analysis module in order for it to run successfully.
+
+Expected Results & Output
++++++++++++++++++++++++++
+The following is an example test of an analysis module that queries the ``meminfo`` schema and returns a timeseries DataFrame of the ``Active`` and ``Inactive`` metrics:
+
+.. image:: ../images/grafana/grafana_output.png
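+
+For reference, ``time_series_formatter.ret_json()`` returns one entry per metric, where each datapoint is a ``[value, timestamp]`` pair with the timestamp in milliseconds. A minimal sketch of that shape (the metric values and timestamps below are illustrative only, not real output):
+
+.. code-block :: python
+
+   [
+       {"target": "Active",   "datapoints": [[1047752.0, 1700000000000], [1047800.0, 1700000001000]]},
+       {"target": "Inactive", "datapoints": [[2216040.0, 1700000000000], [2216100.0, 1700000001000]]}
+   ]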
+
+Test Analysis via Grafana Dashboard
+-----------------------------------
+You can optionally test the analysis in a Grafana dashboard. This is not the preferred option because it is more time consuming and, if there is a lot of data to query, there can be some additional wait time as well.
+
+Create A New Dashboard
+++++++++++++++++++++++++++
+To create a new dashboard, click on the + sign on the left side of the home page and hit dashboard. This will create a blank dashboard with an empty panel in it. Hit the add query button on the panel to begin configuring the query to be sent to an analysis module.
+
+.. note::
+
+   For more information on how to navigate around the Grafana dashboard and what the variables and advanced settings do, please see `Grafana Panel `_ and `Grafana Usage `_.
+
+* Next, add your analysis by filling out the required fields shown below:
+
+.. image:: ../images/grafana/grafana_query.png
+
+* These fields are identical to the python script you can generate to test in your terminal window, so please refer to :ref:`Test Analysis via Terminal Window` or `Grafana Panel `_ for more details.
+
+* Now change the analysis to query from the last 5 minutes by selecting the down arrow in the top right of the panel and selecting "Last 5 minutes":
+
+.. image:: ../images/grafana/grafana_time.png
+   :height: 250
+   :width: 50
+
+* Then change the refresh rate to 5 seconds so that Grafana will automatically query the data every 5 seconds:
+
+.. image:: ../images/grafana/grafana_timerange.png
+
+* Now you should be able to see the "Active" and "Inactive" values for each job_id.
diff --git a/rtd/docs/source/asf/asf-tutorial.rst b/rtd/docs/source/asf/asf-tutorial.rst
new file mode 100644
index 000000000..31840b853
--- /dev/null
+++ b/rtd/docs/source/asf/asf-tutorial.rst
@@ -0,0 +1,2 @@
+Additional ASF Tutorial Material
+================================
diff --git a/rtd/docs/source/asf/deployment/index.rst b/rtd/docs/source/asf/deployment/index.rst
new file mode 100644
index 000000000..96d8eebf5
--- /dev/null
+++ b/rtd/docs/source/asf/deployment/index.rst
@@ -0,0 +1,9 @@
+ASF Deployment
+===============
+This section covers how to deploy and test AppSysFusion.
+
+.. toctree::
+   :maxdepth: 2
+
+   test
+
diff --git a/rtd/docs/source/asf/deployment/test.rst b/rtd/docs/source/asf/deployment/test.rst
new file mode 100644
index 000000000..9d51e9b41
--- /dev/null
+++ b/rtd/docs/source/asf/deployment/test.rst
@@ -0,0 +1,4 @@
+Github
+======
+
+Documentation for this is currently under development.
diff --git a/rtd/docs/source/asf/grafanapanel.rst b/rtd/docs/source/asf/grafanapanel.rst
new file mode 100644
index 000000000..fb408debe
--- /dev/null
+++ b/rtd/docs/source/asf/grafanapanel.rst
@@ -0,0 +1,47 @@
+Grafana Panel Creation with DSOS Plugin
+=======================================
+
+To create a new dashboard, click on the + sign on the left side of the Grafana home page and select dashboard.
+This will create a blank dashboard with an empty panel in it. Panels can be thought of as the visualization of a single query. Select the add query button on the panel to begin configuring the query to be sent to an analysis module.
+
+Configuring the Query and Visualization
+---------------------------------------
+.. image:: ../images/grafana/grafanapanel.png
+
+Once you right click on the panel title and select edit, the panel settings will appear. The first tab is for configuring the query. There are 8 fields in the query, defined below; an example configuration follows the list:
+
+* Query Type - the type of query to perform. The most commonly used is "analysis", which calls an analysis module. "metrics" is used to return raw data without any analysis module.
+* Query Format - the type of visualization to be used on the dataset. It is used by the Grafana Formatter to properly JSON-ify the data returned from the analysis module. Can be either time_series, table, or heatmap.
+* Analysis - required if you choose the analysis query type. Specifies the python module to call to transform the data.
+* Container - the name of the container to be used. This can be either the full path to the container, or the Django backend get_container function can be changed to customize for site settings.
+* Schema - the LDMS schema that will be passed into the analysis module.
+* Metric - a metric, or a comma separated list (without spaces) of metrics, to pass into the analysis module.
+* Extra Params - (Optional) an arbitrary string to pass into the analysis module.
+* Filters - (Optional) an SQL-like syntax for filtering your query; can be a comma separated list of filters, e.g. component_id == 5,job_id > 0.
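+
+As a concrete illustration, the ``dsosTemplate`` analysis from the quick start might be configured with field values like the following (all values are hypothetical, site-specific examples; ``$container`` refers to a dashboard variable, described below):
+
+.. code-block:: RST
+
+   Query Type   : analysis
+   Query Format : time_series
+   Analysis     : dsosTemplate
+   Container    : $container
+   Schema       : meminfo
+   Metric       : Active,Inactive
+   Extra Params :
+   Filters      : job_id > 0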
+
+The second tab in the panel settings is for visualization. Graph, Table, and Heatmap are the available visualizations for a query output.
+
+Text, which uses Markdown language, can also be used for dashboard descriptions or details. If you use a graph visualization, the query Format should be time_series. If you use a table visualization, the query Format should be table.
+
+Graphs have multiple draw modes: bars, lines, and points. You can turn any or all of these draw modes on. You can also stack multiple time_series using the stack toggle button.
+
+For more information about how to view the data and configure the panels, please see Grafana's `Panels and Visualization Documentation `_.
+
+Dashboard Variables and Advanced Settings
+-------------------------------------------
+.. image:: ../images/grafana/grafanapanel_variables.png
+
+Often we want users to be able to change inputs into the queries; however, users cannot edit the queries themselves. What they can edit in Grafana are variables, which are listed at the top of the dashboard. These variables can be referenced with a ``$`` in front of the variable name. For example, we can let the user switch the SOS container they are interested in by creating a variable called container and then putting ``$container`` in the container field of the query. To create variables, go to the dashboard settings (gear button at the top right) and go to variables. Here you can create new variables. Common variable types are text boxes, for users to fill in, or queries. We can create a pre-populated list of options for certain fields by querying the container. Below are the queryable metrics and what information to put in the query field:
+
+* Container - select the custom option in the **Type** field and add the name of the container being used to query from in the **custom options** field.
+* Schema - ``query=schema&container=``
+* Index - ``query=index&container=&schema=``
+* Metrics - ``query=metrics&container=&schema=``
+* Component IDs - ``query=components&container=&schema=``
+* Jobs - ``query=jobs&container=&schema=``
+
+You can put variables in queries as well. For example, if you already have a ``$container`` variable, you can set the schema variable query to be ``query=schema&container=$container``. Then the ``$schema`` variable can be used in other queries, as in the sketch below.
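+
+For instance, a hypothetical chain of variables (the names and the custom value are illustrative only) lets a user pick a container, then a schema from that container, then a metric from that schema:
+
+.. code-block:: RST
+
+   container : custom value(s), e.g. ldms_data
+   schema    : query=schema&container=$container
+   metric    : query=metrics&container=$container&schema=$schema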
+
+In the dashboard settings you can also change the dashboard name and folder location, and load previously saved versions.
+
+Other than the container variable, all other variables bulleted above are set to query in the **Type** field.
diff --git a/rtd/docs/source/asf/grafanause.rst b/rtd/docs/source/asf/grafanause.rst
new file mode 100644
index 000000000..8d2eacf36
--- /dev/null
+++ b/rtd/docs/source/asf/grafanause.rst
@@ -0,0 +1,2 @@
+Basic Grafana Usage
+===================
diff --git a/rtd/docs/source/asf/index.rst b/rtd/docs/source/asf/index.rst
new file mode 100644
index 000000000..0b067db4d
--- /dev/null
+++ b/rtd/docs/source/asf/index.rst
@@ -0,0 +1,22 @@
+.. image:: ../images/appsysfusion.png
+   :width: 300
+   :height: 125
+   :align: center
+
+ASF
+====
+AppSysFusion provides analysis and visualization capabilities aimed at serving insights from HPC monitoring data gathered with LDMS, though it could be generalized outside of that scope.
+It combines a Grafana front-end with a Django back-end to perform in-query analyses on raw data and return transformed information back to the end user.
+By performing in-query analyses, only data of interest to the end user is operated on, rather than the entirety of the dataset for all analyses for all time.
+This saves significant computation and storage resources at the penalty of slightly higher query times.
+These analyses are modular python scripts that can be easily added or changed to suit evolving needs.
+The current implementation is aimed at querying DSOS databases containing LDMS data, though efforts are in progress to abstract this functionality out to other databases and datatypes.
+
+.. toctree::
+   :maxdepth: 2
+
+   asf-quickstart
+   asf-tutorial
+   grafanapanel
+   grafanause
+   pyanalysis
diff --git a/rtd/docs/source/asf/pyanalysis.rst b/rtd/docs/source/asf/pyanalysis.rst
new file mode 100644
index 000000000..d14ea5aa8
--- /dev/null
+++ b/rtd/docs/source/asf/pyanalysis.rst
@@ -0,0 +1,299 @@
+Python Analysis Creation
+========================
+
+Analysis I/O
+------------
+An analysis module is a python script with a general template. There is a class, which must have the same name as the python script itself, and two class functions: ``__init__`` and ``get_data``. The module is first initialized and then ``get_data`` is called. This should return a pandas DataFrame or a NumSOS DataSet (preferably the former if you are using python3). Below are the variables passed from the Grafana interface to these class functions.
+
+``__init__``
+   * ``cont`` - a Sos.Container object which contains the path information to the SOS container specified in the Grafana query.
+   * ``start`` - the beginning of the time range of the Grafana query (in epoch time).
+   * ``end`` - the end of the time range of the Grafana query (in epoch time).
+   * ``schema`` - the LDMS schema specified by the Grafana query (e.g. meminfo).
+   * ``maxDataPoints`` - the maximum number of points that Grafana can display on the user's screen.
+
+``get_data``
+   * ``metrics`` - a python list of metrics specified by the Grafana query (e.g. ['Active','MemFree']).
+   * ``job_id`` - a string of the job_id specified by the Grafana query.
+   * ``user_name`` - a string of the user name specified by the Grafana query.
+   * ``params`` - a string of the extra parameters specified by the Grafana query (e.g. 'threshold = 10').
+   * ``filters`` - a python list of filter strings for the DSOS query (e.g. ['job_id == 30','component_id < 604']).
+
+Example Analysis Module
+------------------------------------
+
+Below is a basic analysis that simply queries the database and returns the DataFrame of the metrics passed in, along with the timestamp, component_id, and job_id for each metric.
+
+.. code-block :: python
+
+   import os, sys, traceback
+   import datetime as dt
+   from graf_analysis.grafanaAnalysis import Analysis
+   from sosdb import Sos
+   import pandas as pd
+   import numpy as np
+
+   class dsosTemplate(Analysis):
+       def __init__(self, cont, start, end, schema='job_id', maxDataPoints=4096):
+           super().__init__(cont, start, end, schema, 1000000)
+
+       def get_data(self, metrics, filters=[], params=None):
+           try:
+               sel = f'select {",".join(metrics)} from {self.schema}'
+               where_clause = self.get_where(filters)
+               order = 'time_job_comp'
+               orderby = 'order_by ' + order
+               self.query.select(f'{sel} {where_clause} {orderby}')
+               res = self.get_all_data(self.query)
+               # Fun stuff here!
+               print(res.head())
+               return res
+           except Exception as e:
+               a, b, c = sys.exc_info()
+               print(str(e)+' '+str(c.tb_lineno))
+
+In the ``__init__`` function, most things are set as self variables, to be accessed later in ``get_data``, using the ``super()`` function. The ``super()`` function also sets up a variable called ``self.query``, which is a ``Sos.SqlQuery`` object. The 1000000 in the ``super()`` function sets the block size for this ``self.query`` object. The optimal block size depends on the query; however, 1 million has been sufficiently performant to this point.
+
+In the ``get_data`` function we create a select clause for the DSOS query by joining the metrics and schema variables. The ``self.get_where`` is a graf_analysis class function which takes filter parameters and makes an SQL-like where clause string with ``self.start`` and ``self.end`` as timestamp boundaries. There is also the orderby variable, which we are setting as ``time_job_comp`` here. This refers to the index to use when querying the database. Our SOS databases are set up to use permutations of ``timestamp``, ``job ID``, and ``component ID`` as multi-indices. Depending on your filter, you may want to use a different multi-index.
+
+The ``self.get_all_data`` function takes the Sos.SqlQuery object, ``self.query``, and calls ``self.query.next``. This returns a block-size number of records that match the query from the database defined by the cont variable. If there are more than a block-size number of records, it continues calling ``self.query.next`` and appending the results to a pandas DataFrame until all data is returned.
+
+Additional analysis can be added where the "Fun stuff here!" comment is, as in the sketch below.
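+
+For example, a minimal sketch of such a post-processing step (this is a hypothetical illustration, not part of the template; it assumes ``res`` is the pandas DataFrame returned by ``self.get_all_data`` and includes a ``component_id`` column):
+
+.. code-block :: python
+
+   # Hypothetical replacement for the "Fun stuff here!" comment:
+   # res holds timestamp, component_id, job_id and the requested metrics;
+   # average each requested metric per component over the queried time range.
+   summary = res.groupby('component_id')[metrics].mean().reset_index()
+   # summary no longer has a timestamp column, so it suits the
+   # table query format in Grafana rather than time_series.
+   return summary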
+
+With the example parameters specified in the last section, our select statement here would be ``select Active,MemFree from meminfo where timestamp > start and timestamp < end and job_id == 30 and component_id < 604 order_by time_job_comp``.
+
+.. note::
+
+   ``job_id`` and ``user_name`` must exist in the schema passed in for this command to work.
+
+Testing an Analysis Module
+--------------------------
+This section goes over how to test your python analysis module as a user.
+
+You do not need to query from the Grafana interface to test your module. Below is a simple script which mimics the Grafana pipeline and prints the JSON returned to Grafana.
+
+.. note::
+
+   **If Grafana and SOS are already installed on your system then please skip the `Required Scripts`_ section** and ask your system administrator where these scripts reside on the system, so that you may copy all necessary python scripts and modules to your home directory, edit/modify existing python analysis modules and create new ones.
+
+First, point PYTHONPATH and PATH at your OVIS installation (the paths below are placeholders):
+
+.. code-block :: bash
+
+   export PYTHONPATH=/usr/bin/python://lib/python/site-packages/
+   export PATH=/usr/bin://bin://sbin::$PATH
+
+Then you can imitate the Grafana query to call your analysis module using a python script such as:
+
+.. code-block :: python
+
+   #!/usr/bin/python3
+
+   import time,sys
+   from sosdb import Sos
+   from grafanaFormatter import DataFormatter
+   from table_formatter import table_formatter
+   from time_series_formatter import time_series_formatter
+   from dsosTemplate import dsosTemplate
+
+   sess = Sos.Session("//config/dsos.conf")
+   cont = ''
+   cont = sess.open(cont)
+
+   model = dsosTemplate(cont, time.time()-300, time.time(), schema='meminfo', maxDataPoints=4096)
+
+   x = model.get_data(['Active'])
+
+   #fmt = table_formatter(x)
+   fmt = time_series_formatter(x)
+   x = fmt.ret_json()
+   print(x)
+
+* The ``model.get_data`` call is where you define the metrics to collect (in this case "Active") and what filters and extra parameters you want to add to your query. The syntax is as follows: ``([''], filters=['job_id>0'], params='')``; a worked call is shown after this list.
+* If you would like to query all metrics then replace ``Active`` with ``*``.
+* To query a specific job_id: set ``job_id`` to your job_id with ``==``.
+* To query from a specific time range: update the start time, ``time.time()-300``, and end time, ``time.time()``, to epoch timestamps.
+* To add a string metric, filter or parameter, you must include a double quote, ``"``, before and after the string (i.e. ``filters=['user=="myusername"']``).
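+
+Putting those bullets together, a hypothetical call might look like the following (the metric names, filter values and parameter string are illustrative only, and the filtered fields must exist in the schema):
+
+.. code-block :: python
+
+   x = model.get_data(['Active', 'MemFree'],
+                      filters=['job_id==30', 'user=="myusername"'],
+                      params='analysis=average')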
+
+.. note::
+
+   The ``params`` argument can be any number or string that you want to use in your analysis module to better manage, output or analyze the data. For example, you can program your module to return specific analyses, such as the average, with ``params='analysis=average'`` by parsing the argument, using ``if`` statements to determine what analysis to apply to the data and, to make things cleaner, a function to perform these calculations in.
+
+Required Scripts
+////////////////
+The following scripts are needed to run the python analysis module. If these python scripts or modules **do not exist on your system and you have no way of accessing them** then please continue. Otherwise, you can skip this section.
+
+**If you do not have access to these existing scripts** then please create them in the same directory as your python analysis module.
+
+.. note::
+
+   If Grafana and SOS are installed on your system then please ask your system administrator where these files reside on the system, so that you can copy them to your home directory.
+
+grafanaFormatter:
+
+.. code:: python
+
+   from sosdb import Sos
+   from sosdb.DataSet import DataSet
+   import numpy as np
+   import pandas as pd
+   import copy
+
+   class RowIter(object):
+       def __init__(self, dataSet):
+           self.dset = dataSet
+           self.limit = dataSet.get_series_size()
+           self.row_no = 0
+
+       def __iter__(self):
+           return self
+
+       def cvt(self, value):
+           if type(value) == np.datetime64:
+               return [ value.astype(np.int64) / 1000 ]
+           return value
+
+       def __next__(self):
+           if self.row_no >= self.limit:
+               raise StopIteration
+           res = [ self.cvt(self.dset[[col, self.row_no]]) for col in range(0, self.dset.series_count) ]
+           self.row_no += 1
+           return res
+
+   class DataFormatter(object):
+       def __init__(self, data):
+           self.result = []
+           self.data = data
+           self.fmt = type(self.data).__module__
+           self.fmt_data = {
+               'sosdb.DataSet' : self.fmt_dataset,
+               'pandas.core.frame' : self.fmt_dataframe,
+               'builtins' : self.fmt_builtins
+           }
+
+       def ret_json(self):
+           return self.fmt_data[self.fmt]()
+
+       def fmt_dataset(self):
+           pass
+
+       def fmt_dataframe(self):
+           pass
+
+       def fmt_builtins(self):
+           pass
+
+table_formatter:
+
+.. code:: python
+
+   from graf_analysis.grafanaFormatter import DataFormatter, RowIter
+   from sosdb.DataSet import DataSet
+   from sosdb import Sos
+   import numpy as np
+   import pandas as pd
+   import copy
+
+   class table_formatter(DataFormatter):
+       def fmt_dataset(self):
+           # Format data from sosdb DataSet object
+           if self.data is None:
+               return {"columns" : [{ "text" : "No papi jobs in time range" }] }
+
+           self.result = { "type" : "table" }
+           self.result["columns"] = [ { "text" : colName } for colName in self.data.series ]
+           rows = []
+           for row in RowIter(self.data):
+               rows.append(row)
+           self.result["rows"] = rows
+           return self.result
+
+       def fmt_dataframe(self):
+           if self.data is None:
+               return {"columns" : [{ "text" : "No papi jobs in time range" }] }
+
+           self.result = { "type" : "table" }
+           self.result["columns"] = [ { "text" : colName } for colName in self.data.columns ]
+           self.result["rows"] = self.data.to_numpy()
+           return self.result
+
+       def fmt_builtins(self):
+           if self.data is None:
+               return { "columns" : [], "rows" : [], "type" : "table" }
+           else:
+               return self.data
+
+time_series_formatter:
+
+.. code:: python
+
+   from graf_analysis.grafanaFormatter import DataFormatter
+   from sosdb.DataSet import DataSet
+   from sosdb import Sos
+   import numpy as np
+   import pandas as pd
+   import copy
+
+   class time_series_formatter(DataFormatter):
+       def fmt_dataset(self):
+           # timestamp is always last series
+           if self.data is None:
+               return [ { "target" : "", "datapoints" : [] } ]
+
+           for series in self.data.series:
+               if series == 'timestamp':
+                   continue
+               ds = DataSet()
+               ds.append_series(self.data, series_list=[series, 'timestamp'])
+               plt_dict = { "target" : series }
+               plt_dict['datapoints'] = ds.tolist()
+               self.result.append(plt_dict)
+               del ds
+           return self.result
+
+       def fmt_dataframe(self):
+           if self.data is None:
+               return [ { "target" : "", "datapoints" : [] } ]
+
+           for series in self.data.columns:
+               if series == 'timestamp':
+                   continue
+               plt_dict = { "target" : series }
+               plt_dict['datapoints'] = self.fmt_datapoints([series, 'timestamp'])
+               self.result.append(plt_dict)
+           return self.result
+
+       def fmt_datapoints(self, series):
+           ''' Format dataframe to output expected by grafana '''
+           aSet = []
+           for row_no in range(0, len(self.data)):
+               aRow = []
+               for col in series:
+                   v = self.data[col].values[row_no]
+                   typ = type(v)
+                   if typ.__module__ == 'builtins':
+                       pass
+                   elif typ == np.ndarray or typ == np.string_ or typ == np.str_:
+                       v = str(v)
+                   elif typ == np.float32 or typ == np.float64:
+                       v = float(v)
+                   elif typ == np.int64 or typ == np.uint64:
+                       v = int(v)
+                   elif typ == np.int32 or typ == np.uint32:
+                       v = int(v)
+                   elif typ == np.int16 or typ == np.uint16:
+                       v = int(v)
+                   elif typ == np.datetime64:
+                       # convert to milliseconds from microseconds
+                       v = v.astype(np.int64) / int(1e6)
+                   else:
+                       raise ValueError("Unrecognized numpy type {0}".format(typ))
+                   aRow.append(v)
+               aSet.append(aRow)
+           return aSet
+
+       def fmt_builtins(self):
+           if self.data is None:
+               return [ { "target" : "", "datapoints" : [] } ]
+           else:
+               return self.data
+
diff --git a/rtd/docs/source/conf.py b/rtd/docs/source/conf.py
new file mode 100644
index 000000000..7de1dbac6
--- /dev/null
+++ b/rtd/docs/source/conf.py
@@ -0,0 +1,50 @@
+# Configuration file for the Sphinx documentation builder.
+
+# -- Project information
+
+project = 'OVIS-HPC'
+copyright = '2024, Sandia National Laboratories and Open Grid Computing, Inc.'
+author = 'SNL/OGC' + +release = '0.1' +version = '0.1.0' + +# -- General configuration + +extensions = [ + 'sphinx.ext.duration', + 'sphinx.ext.doctest', + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.intersphinx', +] + +intersphinx_mapping = { + 'python': ('https://docs.python.org/3/', None), + 'sphinx': ('https://www.sphinx-doc.org/en/master/', None), + + # Link to the "apis" of the "hpc-ovis" project and subprojects + "ovis-hpc": ("https://ovis-hpc.readthedocs.io/en/latest/", None), + "sos": ("https://ovis-hpc.readthedocs.io/projects/sos/en/latest/", None), + "maestro": ("https://ovis-hpc.readthedocs.io/projects/maestro/en/latest/", None), + "baler": ("https://ovis-hpc.readthedocs.io/projects/baler/en/latest/", None), + "ldms": ("https://ovis-hpc.readthedocs.io/projects/ldms/en/latest/", None), + +} +intersphinx_disabled_domains = ['std'] +intersphinx_disabled_reftypes = ["*"] + +templates_path = ['_templates'] + +# -- Options for HTML output + +html_theme = 'sphinx_rtd_theme' +html_static_path = ['static'] +html_logo = "https://github.com/ovis-hpc/readthedocs/blob/main/docs/source/images/ovis-logo.png?raw=true" +html_theme_options = { + 'logo_only': True, + 'display_version': False, +} + +# -- Options for EPUB output +epub_show_urls = 'footnote' diff --git a/rtd/docs/source/container-quickstart.rst b/rtd/docs/source/container-quickstart.rst new file mode 100644 index 000000000..c03b5ba4f --- /dev/null +++ b/rtd/docs/source/container-quickstart.rst @@ -0,0 +1,931 @@ +LDMS Containers +=============== + +``ovis-hpc/ldms-containers`` git repository contains recipes and scripts +for building Docker Images of various components in LDMS, namely: + +- ``ovishpc/ldms-dev``: an image containing dependencies for building + OVIS binaries and developing LDMS plugins. +- ``ovishpc/ldms-samp``: an image containing ``ldmsd`` binary and + sampler plugins. +- ``ovishpc/ldms-agg``: an image containing ``ldmsd`` binary, sampler + plugins, and storage plugins (including SOS). +- ``ovishpc/ldms-maestro``: an image containing ``maestro`` and + ``etcd``. +- ``ovishpc/ldms-ui``: an image containing UI back-end elements, + providing LDMS data access over HTTP (``uwsgi`` + ``django`` + + `ovis-hpc/numsos `__ + + `ovis-hpc/sosdb-ui `__ + + `ovis-hpc/sosdb-grafana `__) +- ``ovishpc/ldms-grafana``: an image containing ``grafana`` and the SOS + data source plugin for grafana + (`sosds `__) + +Table of Contents: + +- `Brief Overview About Docker + Containers <#brief-overview-about-docker-containers>`__ +- `Sites WITHOUT internet access <#sites-without-internet-access>`__ +- `SYNOPSIS <#SYNOPSIS>`__ +- `EXAMPLES <#EXAMPLES>`__ +- `LDMS Sampler Container <#ldms-sampler-container>`__ +- `LDMS Aggregator Container <#ldms-aggregator-container>`__ +- `Maestro Container <#maestro-container>`__ +- `LDMS UI Back-End Container <#ldms-ui-back-end-container>`__ +- `LDMS-Grafana Container <#ldms-grafana-container>`__ +- `SSH port forwarding to grafana <#ssh-port-forwarding-to-grafana>`__ +- `Building Containers <#building-containers>`__ + +Brief Overview About Docker Containers +-------------------------------------- + +A docker container is a runnable instance of an image. In Linux, it is +implemented using namespaces +(`namespaces(7) `__). +``docker create`` command creates a container that can later be started +with ``docker start``, while ``docker run`` creates and starts the +container in one go. 
When a container starts, the first process being run, or the root process, is the program specified by the ``--entrypoint`` CLI option or the ``ENTRYPOINT`` Dockerfile directive. When the root process exits or is killed, the container status becomes ``exited``. The ``docker stop`` command sends ``SIGTERM`` to the root process, and the ``docker kill`` command sends ``SIGKILL`` to the root process. The other processes in the container are also terminated or killed when the root process is terminated or killed. ``docker ps`` shows "running" containers, while ``docker ps -a`` shows ALL containers (including the exited ones).
+
+When a container is created (before it is started), its mount namespace (`mount_namespaces(7) `__) is prepared by the Docker engine. This isolates the container's filesystems from the host. The Docker Image is the basis of the filesystem mounted in the container. The image itself is read-only, and modifications to the files/directories inside the container at runtime are done on the writable layer on top of the image. They are "unified" and presented to the container as a single filesystem by OverlayFS (most preferred by Docker, but other drivers like ``btrfs`` could also be used). A Docker Image is actually a collection of "layers" of root directories (``/``). When a container is ``stopped`` (the root process exited/killed), the writable top layer still persists until the ``docker rm`` command removes the container.
+
+The network namespace (`network_namespace `__) and the process namespace (`process namespace `__) of a container are normally isolated, but they could also use the host's namespaces. The LDMS sampler containers (``ovishpc/ldms-samp``) require the host process namespace (``--pid=host`` option) so that ``ldmsd`` reads the host's ``/proc`` data. Otherwise, we would be collecting the container's metric data. Other LDMS containers do not need the host process namespace. For the network namespace, it is advisable to use the host's network namespace (``--network=host``) to fully utilize RDMA hardware on the host with minimal effort in network configuration.
+
+Sites WITHOUT internet access
+-----------------------------
+
+#. On your laptop (or a machine that HAS Internet access)
+
+.. code:: sh
+
+   $ docker pull ovishpc/ldms-dev
+   $ docker pull ovishpc/ldms-samp
+   $ docker pull ovishpc/ldms-agg
+   $ docker pull ovishpc/ldms-maestro
+   $ docker pull ovishpc/ldms-ui
+   $ docker pull ovishpc/ldms-grafana
+
+   $ docker save ovishpc/ldms-dev > ovishpc-ldms-dev.tar
+   $ docker save ovishpc/ldms-samp > ovishpc-ldms-samp.tar
+   $ docker save ovishpc/ldms-agg > ovishpc-ldms-agg.tar
+   $ docker save ovishpc/ldms-maestro > ovishpc-ldms-maestro.tar
+   $ docker save ovishpc/ldms-ui > ovishpc-ldms-ui.tar
+   $ docker save ovishpc/ldms-grafana > ovishpc-ldms-grafana.tar
+
+   # Then, copy these tar files to the site
+
+#. On the site that has NO Internet access
+
+.. code:: sh
+
+   $ docker load < ovishpc-ldms-dev.tar
+   $ docker load < ovishpc-ldms-samp.tar
+   $ docker load < ovishpc-ldms-agg.tar
+   $ docker load < ovishpc-ldms-maestro.tar
+   $ docker load < ovishpc-ldms-ui.tar
+   $ docker load < ovishpc-ldms-grafana.tar
+
+Then, the images are available locally (no need to ``docker pull``).
+
+SYNOPSIS
+--------
+
+In this section, the options in ``[ ]`` are optional. Please see the ``#`` comments right after the options for descriptions. Please also note that the options BEFORE the Docker Image name are for ``docker run``, and the options AFTER the image name are for the entrypoint script. The following is the information regarding entrypoint
The following is the information regarding entrypoint +options for each image: + +- ``ovishpc/ldms-dev`` entrypoint options are pass-through to + ``/bin/bash``. +- ``ovishpc/ldms-samp`` entrypoint options are pass-through to ldmsd. +- ``ovishpc/ldms-agg`` entrypoint options are pass-through to ldmsd. +- ``ovishpc/ldms-maestro`` entrypoint options are ignored. +- ``ovishpc/ldms-ui`` entrypoint options are pass-through to uwsgi. +- ``ovishpc/ldms-grafana`` entrypoint options are pass-through to + grafana-server program. + +.. code:: sh + + # Pulling images + $ docker pull ovishpc/ldms-dev + $ docker pull ovishpc/ldms-samp + $ docker pull ovishpc/ldms-agg + $ docker pull ovishpc/ldms-maestro + $ docker pull ovishpc/ldms-ui + $ docker pull ovishpc/ldms-grafana + + # munge remark: munge.key file must be owned by 101:101 (which is munge:munge in + # the container) and has 0600 mode. + + # ovishpc/ldms-maestro + $ docker run -d --name= --network=host --privileged + [ -v /run/munge:/run/munge:ro ] # expose host's munge to the container + [ -v /on-host/munge.key:/etc/munge/munge.key:ro ] # use container's munged with custom key + -v /on-host/ldms_cfg.yaml:/etc/ldms_cfg.yaml:ro # bind ldms_cfg.yaml, used by maestro_ctrl + ovishpc/ldms-maestro # the image name + + + # ovishpc/ldms-samp + $ docker run -d --name= --network=host --pid=host --privileged + -e COMPID= # set COMPID environment variable + [ -v /run/munge:/run/munge:ro ] # expose host's munge to the container + [ -v /on-host/munge.key:/etc/munge/munge.key:ro ] # use container's munged with custom key + ovishpc/ldms-samp # the image name + -x : # transport, listening port + [ -a munge ] # use munge authentication + [ OTHER LDMSD OPTIONS ] + + + # ovishpc/ldms-agg + $ docker run -d --name= --network=host --pid=host --privileged + -e COMPID= # set COMPID environment variable + [ -v /on-host/storage:/storage:rw ] # bind 'storage/'. 
Could be any path, depending on ldmsd configuration + [ -v /on-host/dsosd.json:/etc/dsosd.json:ro ] # bind dsosd.json configuration, if using dsosd to export SOS data + [ -v /run/munge:/run/munge:ro ] # expose host's munge to the container + [ -v /on-host/munge.key:/etc/munge/munge.key:ro ] # use container's munged with custom key + ovishpc/ldms-agg # the image name + -x : # transport, listening port + [ -a munge ] # use munge authentication + [ OTHER LDMSD OPTIONS ] + # Run dsosd to export SOS data + $ docker exec -it /bin/bash + () $ rpcbind + () $ export DSOSD_DIRECTORY=/etc/dsosd.json + () $ dsosd >/var/log/dsosd.log 2>&1 & + () $ exit + + + # ovishpc/ldms-ui + $ docker run -d --name= --network=host --privileged + -v /on-host/dsosd.conf:/opt/ovis/etc/dsosd.conf # dsosd.conf file, required to connect to dsosd + -v /on-host/settings.py:/opt/ovis/ui/sosgui/settings.py # sosdb-ui Django setting file + ovishpc/ldms-ui # the image name + [ --http-socket=: ] # addr:port to serve, ":80" by default + [ OTHER uWSGI OPTIONS ] + + + # ovishpc/ldms-grafana + $ docker run -d --name= --network=host --privileged + [ -v /on-host/grafana.ini:/etc/grafana/grafana.ini:ro ] # custom grafana config + [ -e GF_SERVER_HTTP_ADDR= ] # env var to override Grafana IP address binding (default: all addresses) + [ -e GF_SERVER_HTTP_PORT= ] # env var to override Grafana port binding (default: 3000) + ovishpc/ldms-grafana # the image name + [ OTHER GRAFANA-SERVER OPTIONS ] # other options to grafana-server + + + # ------------------------------------- + # configuration files summary + # ------------------------------------- + # - /on-host/dsosd.json: contains dictionary mapping hostname - container + # location in the host, e.g. + # { + # "host1": { + # "dsos_cont":"/storage/cont_host1" + # }, + # "host2": { + # "dsos_cont":"/storage/cont_host2" + # } + # } + # + # - /on-host/dsosd.conf: contains host names (one per line) of the dsosd, e.g. + # host1 + # host2 + # + # - /on-host/settings.py: Django settings. Pay attention to DSOS_ROOT and + # DSOS_CONF variables. + +EXAMPLES +-------- + +In this example, we have 8-nodes cluster with host names cygnus-01 to +cygnus-08. ``cygnus-0[1-4]`` are used as compute nodes (deploying +``ovishpc/ldms-samp`` containers). ``cygnus-0[5-6]`` are used as L1 +aggregator (``ovishpc/ldms-agg`` containers without storage). +``cygnus-07`` is used as L2 aggregator with a DSOS storage +(``ovishpc/ldms-agg`` with dsosd). ``cygnus-07`` will also host +``ovishpc/maestro``, ``ovishpc/ldms-ui`` and ``ovishpc/ldms-grafana`` +containers. We will be running commands from ``cygnus-07``. The cluster +has ``munged`` pre-configured and running on all nodes with the same +key. + +Configuration files used in this example are listed at the end of the +section. The following is a list of commands that deploys various +containers on the cygnus cluster: + +.. code:: sh + + # Start sampler containers on cygnus-01,02,03,04 + root@cygnus-07 $ pdsh -w cygnus-0[1-4] 'docker run -d --name=samp --network=host --pid=host --privileged -v /run/munge:/run/munge:ro -e COMPONENT_ID=${HOSTNAME#cygnus-0} ovishpc/ldms-samp -x rdma:411 -a munge' + # Notice the COMPONENT_ID environment variable setup using Bash substitution. + # The COMPONENT_ID environment variable is later used in LDMSD sampler plugin + # configuration `component_id: ${COMPONENT_ID}` in the `ldms_cfg.yaml` file. 
+ + # Start L1 aggregator containers on cygnus-05,06 + root@cygnus-07 $ pdsh -w cygnus-0[5-6] docker run -d --name=agg1 --network=host --pid=host --privileged -v /run/munge:/run/munge:ro ovishpc/ldms-agg -x rdma:411 -a munge + + # Start L2 aggregator container on cygnus-07 + root@cygnus-07 $ docker run -d --name=agg2 --network=host --pid=host --privileged -v /run/munge:/run/munge:ro -v /store:/store:rw ovishpc/ldms-agg -x rdma:411 -a munge + + # Start dsosd in the `agg2`, our L2 aggregator container + root@cygnus-07 $ echo 'rpcbind ; dsosd > /var/log/dsosd.log 2>&1 &' | docker exec -i agg2 /bin/bash + + # Start maestro container on cygnus-07 + root@cygnus-07 $ docker run -d --name=maestro --network=host --privileged -v /run/munge:/run/munge:ro -v ${PWD}/ldms_cfg.yaml:/etc/ldms_cfg.yaml:ro ovishpc/ldms-maestro + + # Start Django UI container + root@cygnus-07 $ docker run -d --name=ui --network=host --privileged -v ${PWD}/dsosd.conf:/opt/ovis/etc/dsosd.conf -v ${PWD}/settings.py:/opt/ovis/ui/sosgui/settings.py ovishpc/ldms-ui + + # Start Grafana container + root@cygnus-07 $ docker run -d --name=grafana --privileged --network=host ovishpc/ldms-grafana + +Related configuration files + +.. code:: sh + + # dsosd.conf + cygnus-07 + +.. code:: yaml + + # ldms_cfg.yaml + xprt: &xprt "rdma" + daemons: + - names : &samp-names "samp-[1-4]" + hosts : &samp-hosts "cygnus-0[1-4]-iw" + endpoints : + - names : &samp-eps "cygnus-0[1-4]-iw-ep" + ports : 411 + xprt : *xprt + maestro_comm : True + auth : + name : munge + plugin : munge + - names : &L1-names "agg-[11-12]" + hosts : &L1-hosts "cygnus-0[5-6]-iw" + endpoints : + - names : &L1-eps "agg-[11-12]-ep" + ports : 411 + xprt : *xprt + maestro_comm : True + auth : + name : munge + plugin : munge + - names : &L2-name "agg-2" + hosts : &L2-host "cygnus-07-iw" + endpoints : + - names : &L2-ep "agg-2-ep" + ports : 411 + xprt : *xprt + maestro_comm : True + auth : + name : munge + plugin : munge + + aggregators: + - daemons : *L1-names + peers : + - daemons : *samp-names + endpoints : *samp-eps + reconnect : 1s + type : active + updaters : + - mode : pull + interval : "1.0s" + offset : "200ms" + sets : + - regex : .* + field : inst + - daemons : *L2-name + peers: + - daemons : *L1-names + endpoints : *L1-eps + reconnect : 1s + type : active + updaters : + - mode : pull + interval : "1.0s" + offset : "400ms" + sets : + - regex : .* + field : inst + + samplers: + - daemons : *samp-names + plugins : + - name : meminfo # Variables can be specific to plugin + interval : "1s" # Used when starting the sampler plugin + offset : "0s" + config : &simple_samp_config + component_id : "${COMPONENT_ID}" + perm : "0777" + + stores: + - name : sos-meminfo + daemons : *L2-name + container : meminfo + schema : meminfo + flush : 10s + plugin : + name : store_sos + config : + path : /store + +.. code:: py + + # settings.py + """ + Django settings for sosgui project. + + Generated by 'django-admin startproject' using Django 1.8.2. + + For more information on this file, see + https://docs.djangoproject.com/en/1.8/topics/settings/ + + For the full list of settings and their values, see + https://docs.djangoproject.com/en/1.8/ref/settings/ + """ + + # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 
+ import os + import json + + log = open('/var/log/sosgui/settings.log', 'a') + BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + + # Quick-start development settings - unsuitable for production + # See https://docs.djangoproject.com/en/1.8/howto/deployment/checklist/ + + # SECURITY WARNING: keep the secret key used in production secret! + SECRET_KEY = 'blablablablablablablablablablablablablablablablablabla' + + # SECURITY WARNING: don't run with debug turned on in production! + DEBUG = True + + ALLOWED_HOSTS = [ + '*', + ] + + APPEND_SLASH = False + + STATIC_ROOT = os.path.join(BASE_DIR, "assets") + + AUTH_USER_MODEL = 'sosdb_auth.SosdbUser' + + # Application definition + + INSTALLED_APPS = ( + 'corsheaders', + 'django.contrib.admin', + 'django.contrib.auth', + 'django.contrib.contenttypes', + 'django.contrib.sessions', + 'django.contrib.messages', + 'django.contrib.staticfiles', + 'container', + 'jobs', + 'objbrowser', + 'sos_db', + 'sosdb_auth', + ) + + try: + from . import ldms_settings + INSTALLED_APPS = INSTALLED_APPS + ldms_settings.INSTALLED_APPS + except: + pass + + try: + from . import grafana_settings + INSTALLED_APPS = INSTALLED_APPS + grafana_settings.INSTALLED_APPS + except: + pass + + try: + from . import baler_settings + INSTALLED_APPS = INSTALLED_APPS + baler_settings.INSTALLED_APPS + except: + pass + + MIDDLEWARE = ( + 'corsheaders.middleware.CorsMiddleware', + 'django.contrib.sessions.middleware.SessionMiddleware', + 'django.middleware.common.CommonMiddleware', + 'django.contrib.auth.middleware.AuthenticationMiddleware', + 'django.contrib.messages.middleware.MessageMiddleware', + 'django.middleware.clickjacking.XFrameOptionsMiddleware', + 'django.middleware.security.SecurityMiddleware', + ) + + ROOT_URLCONF = 'sosgui.urls' + + TEMPLATES = [ + { + 'BACKEND': 'django.template.backends.django.DjangoTemplates', + 'DIRS': [ + '/opt/ovis/ui/templates', + ], + 'APP_DIRS': True, + 'OPTIONS': { + 'context_processors': [ + 'django.contrib.auth.context_processors.auth', + 'django.template.context_processors.debug', + 'django.template.context_processors.request', + 'django.contrib.messages.context_processors.messages', + ], + }, + }, + ] + + WSGI_APPLICATION = 'sosgui.wsgi.application' + + + # Database + # https://docs.djangoproject.com/en/1.8/ref/settings/#databases + + DATABASES = { + 'default': { + 'ENGINE': 'django.db.backends.sqlite3', + 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'), + } + } + + LANGUAGE_CODE = 'en-us' + + TIME_ZONE = 'UTC' + + USE_I18N = True + + USE_L10N = True + + USE_TZ = True + + + # Static files (CSS, JavaScript, Images) + # https://docs.djangoproject.com/en/1.8/howto/static-files/ + + STATIC_URL = '/static/' + + STATICFILES_DIRS = [ + '/opt/ovis/ui/static/', + ] + + SESSION_EXPIRE_AT_BROWSER_CLOSE = True + SOS_ROOT = "/store/" + DSOS_ROOT = "/store/" + DSOS_CONF = "/opt/ovis/etc/dsosd.conf" + LOG_FILE = "/var/log/sosgui/sosgui.log" + LOG_DATE_FMT = "%F %T" + ODS_LOG_FILE = "/var/log/sosgui/ods.log" + ODS_LOG_MASK = "255" + ODS_GC_TIMEOUT = 10 + BSTORE_PLUGIN="bstore_sos" + os.environ.setdefault("BSTORE_PLUGIN_PATH", "/opt/ovis/lib64") + os.environ.setdefault("SET_POS_KEEP_TIME", "3600") + + + try: + import ldms_cfg + LDMS_CFG = ldms_cfg.aggregators + except Exception as e: + log.write(repr(e)+'\n') + LDMS_CFG = { "aggregators" : [] } + + try: + import syslog + SYSLOG_CFG = syslog.syslog + except Exception as e: + log.write('SYSLOG_SETTINGS ERR '+repr(e)+'\n') + SYSLOG_CFG = { "stores" : [] } + +LDMS Sampler Container 
+----------------------
+
+.. code:: sh
+
+   # SYNOPSIS
+   $ docker run -d --name= --network=host --pid=host --privileged
+        -e COMPID= # set COMPID environment variable
+        [ -v /run/munge:/run/munge:ro ] # expose host's munge to the container
+        [ -v /on-host/munge.key:/etc/munge/munge.key:ro ] # use container's munged with custom key
+        ovishpc/ldms-samp # the image name
+        -x : # transport, listening port
+        [ -a munge ] # use munge authentication
+        [ OTHER LDMSD OPTIONS ] # e.g. -v INFO
+
+``ovishpc/ldms-samp`` entrypoint executes ``ldmsd -F``, making it the leader process of the container. Users can append ``[OPTIONS]``, and they will be passed to the ``ldmsd -F`` CLI. If ``-a munge`` is given, the entrypoint script will check whether ``/run/munge`` is a bind-mount from the host. If so, munge encoding/decoding is done through ``munged`` on the host via the bind-mounted ``/run/munge`` -- there is no need to run ``munged`` inside the container. Otherwise, in the case that ``-a munge`` is given and ``/run/munge`` is not host-bind-mounted, the entrypoint script runs ``munged`` and tests it BEFORE starting ``ldmsd``.
+
+Usage examples:
+
+.. code:: sh
+
+   ## On a compute node
+
+   # Pull the container image
+   $ docker pull ovishpc/ldms-samp
+
+   # Start ldmsd container, with host network namespace and host PID namespace;
+   # - COMPID env var is HOSTNAME without the non-numeric prefixes and the leading
+   #   zeroes (e.g. nid00100 => 100, nid10000 => 10000). Note that this uses
+   #   bash(1) Parameter Expansion and Pattern Matching features.
+   #
+   # - serving on socket transport port 411 with munge authentication
+   #
+   # - using host munge
+   $ docker run -d --name=samp --network=host --pid=host --privileged \
+          -e COMPID=${HOSTNAME##*([^1-9])} \
+          -v /run/munge:/run/munge:ro \
+          ovishpc/ldms-samp -x sock:411 -a munge
+
+We encourage using ``maestro`` to configure a cluster of ``ldmsd``. However, if there is a need to configure ``ldmsd`` manually, one can do so from within the container. In this case:
+
+.. code:: sh
+
+   $ docker exec -it samp /bin/bash
+   (samp) $ ldmsd_controller --xprt sock --port 411 --host localhost --auth munge
+   LDMSD_CONTROLLER_PROMPT>
+
+LDMS Aggregator Container
+-------------------------
+
+.. code:: sh
+
+   # SYNOPSIS
+   $ docker run -d --name= --network=host --pid=host --privileged
+        -e COMPID= # set COMPID environment variable
+        [ -v /on-host/storage:/storage:rw ] # bind 'storage/'. Could be any path, depending on ldmsd configuration
+        [ -v /on-host/dsosd.json:/etc/dsosd.json:ro ] # bind dsosd.json configuration, if using dsosd to export SOS data
+        [ -v /run/munge:/run/munge:ro ] # expose host's munge to the container
+        [ -v /on-host/munge.key:/etc/munge/munge.key:ro ] # use container's munged with custom key
+        ovishpc/ldms-agg # the image name
+        -x : # transport, listening port
+        [ -a munge ] # use munge authentication
+        [ OTHER LDMSD OPTIONS ]
+   # dsosd to export SOS data
+   $ docker exec -it  /bin/bash
+   () $ rpcbind
+   () $ export DSOSD_DIRECTORY=/etc/dsosd.json
+   () $ dsosd >/var/log/dsosd.log 2>&1 &
+   () $ exit
+
+``ovishpc/ldms-agg`` entrypoint executes ``ldmsd -F``, making it the leader process of the container. It also handles ``-a munge`` the same way that ``ovishpc/ldms-samp`` does. In the case of exporting SOS data through ``dsosd``, the daemon must be started after the container is up.
+
+Example usage:
+
+.. code:: sh
+
+   ## On a service node
+
+   # Pull the container image
+   $ docker pull ovishpc/ldms-agg
+
+   # Start ldmsd container, using host network namespace and host PID namespace;
+   # - with host munge
+   # - serving port 411
+   # - The `-v /on-host/storage:/storage:rw` option is to map the on-host storage
+   #   location `/on-host/storage` to the `/storage` location in the container. The
+   #   data written to `/storage/` in the container will persist in
+   #   `/on-host/storage/` on the host.
+   $ docker run -d --name=agg --network=host --privileged \
+          -v /run/munge:/run/munge:ro \
+          -v /on-host/storage:/storage:rw \
+          ovishpc/ldms-agg -x sock:411 -a munge
+
+   # Start dsosd service for remote SOS container access (e.g. by UI), by first
+   # bringing up a shell inside the container, then starting rpcbind and dsosd.
+   $ docker exec -it agg /bin/bash
+   (agg) $ rpcbind
+   (agg) $ export DSOSD_DIRECTORY=/etc/dsosd.json
+   (agg) $ dsosd >/var/log/dsosd.log 2>&1 &
+   (agg) $ exit
+
+``dsosd.json`` contains a collection of ``container_name`` - ``path`` mappings for each host. For example:
+
+.. code:: json
+
+   {
+     "host1": {
+       "dsos_cont":"/storage/cont_host1",
+       "tmp_cont":"/tmp/ram_cont"
+     },
+     "host2": {
+       "dsos_cont":"/storage/cont_host2",
+       "tmp_cont":"/tmp/ram_cont"
+     }
+   }
+
+Maestro Container
+-----------------
+
+.. code:: sh
+
+   # SYNOPSIS
+   $ docker run -d --name= --network=host --privileged
+        [ -v /run/munge:/run/munge:ro ] # expose host's munge to the container
+        [ -v /on-host/munge.key:/etc/munge/munge.key:ro ] # use container's munged with custom key
+        -v /on-host/ldms_cfg.yaml:/etc/ldms_cfg.yaml:ro # bind ldms_cfg.yaml, used by maestro_ctrl
+        ovishpc/ldms-maestro # the image name
+
+``ovishpc/ldms-maestro`` containers run at least two daemons: ``etcd`` and ``maestro``. They may also run ``munged`` if the host's munge is not used (i.e. ``-v /run/munge:/run/munge:ro`` is not given to ``docker run``). The entrypoint script does the following:
+
+#. starts ``etcd``.
+#. starts ``munged`` if the host's munge is not used.
+#. executes ``maestro_ctrl`` with ``--ldms_config /etc/ldms_cfg.yaml``. Notice that the ``ldms_cfg.yaml`` file is supplied by the user via the ``-v`` option.
+#. executes the ``maestro`` process. ``maestro`` will periodically connect to all ``ldmsd`` specified by ``ldms_cfg.yaml`` and send them the corresponding configuration.
+
+REMARK: For now, the ``etcd`` and ``maestro`` processes in the ``ovishpc/ldms-maestro`` container run as stand-alone processes. We will support a cluster of ``ovishpc/ldms-maestro`` containers in the future.
+
+Example usage:
+
+.. code:: sh
+
+   ## On a service node
+
+   # Pull the container image
+   $ docker pull ovishpc/ldms-maestro
+
+   # Start maestro container, using host network namespace, and using host's munge
+   $ docker run -d --network=host --privileged \
+          -v /run/munge:/run/munge:ro \
+          -v /my/ldms_cfg.yaml:/etc/ldms_cfg.yaml:rw \
+          ovishpc/ldms-maestro
+
+Please see `ldms_cfg.yaml `__ for an example.
+
+LDMS UI Back-End Container
+--------------------------
+
+.. code:: sh
+
+   # SYNOPSIS
+   $ docker run -d --name= --network=host --privileged
+        -v /on-host/dsosd.conf:/opt/ovis/etc/dsosd.conf # dsosd.conf file, required to connect to dsosd
+        -v /on-host/settings.py:/opt/ovis/ui/sosgui/settings.py # sosdb-ui Django setting file
+        ovishpc/ldms-ui # the image name
+        [ --http-socket=: ] # addr:port to serve, ":80" by default
+        [ OTHER uWSGI OPTIONS ]
+
+``ovishpc/ldms-ui`` executes the ``uwsgi`` process with the ``sosgui`` (the back-end GUI WSGI module) application module.
It is the only process in +the container. The ``uwsgi`` in this container by default will listen to +port 80. The ``--http-socket=ADDR:PORT`` will override this behavior. +Other options given to ``docker run`` will also be passed to the +``uwsgi`` command as well. + +The ``sosgui`` WSGI application requires two configuration files: + +#. ``dsosd.conf``: containing a list of hostnames of dsosd, one per + line. See `here `__ for an + example. +#. ``settings.py``: containing a WSGI application settings. Please pay + attention to DSOS_ROOT and DSOS_CONF. See + `here `__ for an example. + +Usage example: + +.. code:: sh + + ## On a service node + + # Pull the container image + $ docker pull ovishpc/ldms-ui + + # Start ldms-ui container, using host network namespace + $ docker run -d --name=ui --network=host --privileged \ + -v /HOST/dsosd.conf:/opt/ovis/etc/dsosd.conf \ + -v /HOST/settings.py:/opt/ovis/ui/sosgui/settings.py \ + ovishpc/ldms-ui + +LDMS-Grafana Container +---------------------- + +.. code:: sh + + # SYNOPSIS + $ docker run -d --name= --network=host --privileged + [ -v /on-host/grafana.ini:/etc/grafana/grafana.ini:ro ] # custom grafana config + [ -e GF_SERVER_HTTP_ADDR= ] # env var to override Grafana IP address binding (default: all addresses) + [ -e GF_SERVER_HTTP_PORT= ] # env var to override Grafana port binding (default: 3000) + ovishpc/ldms-grafana # the image name + [ OTHER GRAFANA-SERVER OPTIONS ] # other options to grafana-server + +``ovishpc/ldms-grafana`` is based on +`grafana/grafana-oss:9.1.0-ubuntu `__ +with Sos data source plugin to access distributed-SOS data. The grafana +server listens to port 3000 by default. The options specified at the +``docker run`` CLI will be passed to the ``grafana-server`` command. + +.. code:: sh + + ## On a service node + + # Pull the container image + $ docker pull ovishpc/ldms-grafana + + # Start ldms-grafana container, this will use port 3000 + $ docker run -d --name=grafana --privileged --network=host ovishpc/ldms-grafana + + # Use a web browser to navigate to http://HOSTNAME:3000 to access grafana + +SSH port forwarding to grafana +------------------------------ + +In the case that the grafana server cannot be accessed directly, use SSH +port forwarding as follows: + +.. code:: sh + + (laptop) $ ssh -L 127.0.0.1:3000:127.0.0.1:3000 LOGIN_NODE + (LOGIN_HODE) $ ssh -L 127.0.0.1:3000:127.0.0.1:3000 G_HOST + # Assuming that the ldms-grafana container is running on G_HOST. + +Then, you should be able to access the grafana web server via +``http://127.0.0.1:3000/`` on your laptop. + +Building Containers +------------------- + +TL;DR: edit `config.sh `__, customize the ``*_REPO``, +``*_BRANCH`` and ``*_OPTIONS``, then run ``./scripts/build-all.sh``. + +The following steps describe the building process executed by the +`scripts/build-all.sh `__ script: + +#. Build ``ovishpc/ldms-dev`` docker image. This "development" image + contains development programs and libraries for building + ``/opt/ovis`` binaries and ``dsosds``. + + - See + `recipes/ldms-dev/docker-build.sh `__ + and `recipes/ldms-dev/Dockerfile `__. + +#. Build ``/opt/ovis`` binaries with + `scripts/build-ovis-binaries.sh `__ + script. The environment variables specified in + `config.sh `__ file inform the build script which + reposositories or branches to check out and build. 
+   The variables, categorized by component, are as follows:
+
+   - ovis: the main component of the OVIS project (``ldmsd`` and LDMS
+     python)
+
+     - ``OVIS_REPO``
+     - ``OVIS_BRANCH``
+
+   - sos: the Scalable Object Storage technology
+
+     - ``SOS_REPO``
+     - ``SOS_BRANCH``
+
+   - maestro: the ``ldmsd`` cluster configurator
+
+     - ``MAESTRO_REPO``
+     - ``MAESTRO_BRANCH``
+
+   - numsos:
+
+     - ``NUMSOS_REPO``
+     - ``NUMSOS_BRANCH``
+
+   - sosdb-ui:
+
+     - ``SOSDBUI_REPO``
+     - ``SOSDBUI_BRANCH``
+
+   - sosdb-grafana:
+
+     - ``SOSDBGRAFANA_REPO``
+     - ``SOSDBGRAFANA_BRANCH``
+
+   The binaries output directory (absolute, or relative to the top source
+   directory) is specified by the ``OVIS`` variable in
+   `config.sh `__.
+
+#. Build the ``dsosds`` grafana data source plugin for SOS data access with
+   `scripts/build-dsosds.sh `__. The following
+   environment variables in `config.sh `__ determine which
+   repository and branch the code is checked out from for building ``dsosds``:
+
+   - ``DSOSDS_REPO``
+   - ``DSOSDS_BRANCH``
+
+   The ``dsosds`` output directory (absolute, or relative to the top source
+   directory) is specified by the ``DSOSDS`` variable in
+   `config.sh `__.
+
+#. Build the ``ovishpc/ldms-samp`` image using the ``ovis`` binaries built
+   in step 2. The LDMS Sampler Image contains only ``ldmsd``, the
+   sampler plugins and their dependencies. The storage plugins are not
+   included.
+
+   - See
+     `recipes/ldms-samp/docker-build.sh `__
+     and
+     `recipes/ldms-samp/Dockerfile `__.
+   - Also see ``OVIS_OPTIONS`` in `config.sh `__ for the
+     build options that enable/disable plugins.
+
+#. Build the ``ovishpc/ldms-agg`` image using the ``ovis`` binaries built in
+   step 2. The LDMS Aggregator Image contains SOS, ``ldmsd`` and all
+   plugins (both samplers and stores).
+
+   - See
+     `recipes/ldms-agg/docker-build.sh `__
+     and `recipes/ldms-agg/Dockerfile `__.
+   - Also see ``OVIS_OPTIONS`` in `config.sh `__ for the
+     build options that enable/disable plugins.
+
+#. Build the ``ovishpc/ldms-maestro`` image using the maestro binaries from
+   the ``ovis`` binaries built in step 2. This image also includes ``etcd``,
+   a dependency of ``maestro``.
+
+   - See
+     `recipes/ldms-maestro/docker-build.sh `__
+     and
+     `recipes/ldms-maestro/Dockerfile `__.
+
+#. Build the ``ovishpc/ldms-ui`` image using the UI components from the
+   ``ovis`` binaries built in step 2 (``ovis/ui/``). The image includes the
+   ``uwsgi`` web server, which is used to serve the ``sosdb-ui`` Django
+   application, providing SOS data access over HTTP.
+
+   - See
+     `recipes/ldms-ui/docker-build.sh `__
+     and `recipes/ldms-ui/Dockerfile `__.
+
+#. Build the ``ovishpc/ldms-grafana`` image based on the ``grafana`` image,
+   including the ``dsosds`` grafana data source plugin built in step 3. A
+   container instantiated from this image is basically a Grafana
+   server with the ``dsosds`` data source plugin pre-installed.
+
+   - See
+     `recipes/ldms-grafana/docker-build.sh `__
+     and
+     `recipes/ldms-grafana/Dockerfile `__.
+
+Note that many of the ``docker-build.sh`` scripts use ``tar`` to create the
+docker build context (a set of files / directories for the Docker build
+process to ADD) instead of using the working directory that contains the
+``Dockerfile``. This is so that we don't have to copy the selected files
+from ``ovis`` into each of the ``Dockerfile`` directories.
+
+It is also possible to manually run an ``ovishpc/ldms-dev`` container,
+build your version of ``ovis`` (e.g. creating a new plugin), and package a
+custom ``ovishpc/ldms-samp`` image with
+``recipes/ldms-samp/docker-build.sh``, because the ``docker-build.sh``
+script uses whatever binaries are available in the ``ovis`` directory.
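+A minimal sketch of that manual flow, under the assumption that the
+repository is the current working directory (the mount point and shell
+steps are illustrative, not a prescribed procedure):
+
+.. code:: sh
+
+   # start a dev container with the checked-out source tree mounted
+   $ docker run -it --rm -v $PWD:/work ovishpc/ldms-dev /bin/bash
+   (dev) $ cd /work
+   (dev) $ ./scripts/build-ovis-binaries.sh    # rebuild the ovis binaries
+   (dev) $ exit
+
+   # repackage the sampler image from the freshly built binaries
+   $ cd recipes/ldms-samp
+   $ ./docker-build.sh
+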
diff --git a/rtd/docs/source/contributing/docreqs.rst b/rtd/docs/source/contributing/docreqs.rst
new file mode 100644
index 000000000..444c3a2f0
--- /dev/null
+++ b/rtd/docs/source/contributing/docreqs.rst
@@ -0,0 +1,2 @@
+Documentation Requirements for Contributions
+============================================
diff --git a/rtd/docs/source/contributing/index.rst b/rtd/docs/source/contributing/index.rst
new file mode 100644
index 000000000..77499458d
--- /dev/null
+++ b/rtd/docs/source/contributing/index.rst
@@ -0,0 +1,10 @@
+Contributing to LDMS
+====================
+
+.. toctree::
+   :maxdepth: 2
+
+   samplerwrite
+   storewrite
+   docreqs
+
diff --git a/rtd/docs/source/contributing/samplerwrite.rst b/rtd/docs/source/contributing/samplerwrite.rst
new file mode 100644
index 000000000..397a34d97
--- /dev/null
+++ b/rtd/docs/source/contributing/samplerwrite.rst
@@ -0,0 +1,2 @@
+How to write an LDMS Sampler Plugin
+====================================
diff --git a/rtd/docs/source/contributing/storewrite.rst b/rtd/docs/source/contributing/storewrite.rst
new file mode 100644
index 000000000..b95a2fe46
--- /dev/null
+++ b/rtd/docs/source/contributing/storewrite.rst
@@ -0,0 +1,2 @@
+How to write an LDMS Store Plugin
+====================================
diff --git a/rtd/docs/source/deployment/index.rst b/rtd/docs/source/deployment/index.rst
new file mode 100644
index 000000000..54dee431b
--- /dev/null
+++ b/rtd/docs/source/deployment/index.rst
@@ -0,0 +1,9 @@
+LDMS Deployment
+===============
+This section covers how to deploy and test LDMS.
+
+.. toctree::
+   :maxdepth: 2
+
+   ldms-test
+   ldms-jenkins
diff --git a/rtd/docs/source/deployment/ldms-jenkins.rst b/rtd/docs/source/deployment/ldms-jenkins.rst
new file mode 100644
index 000000000..2c3f77fce
--- /dev/null
+++ b/rtd/docs/source/deployment/ldms-jenkins.rst
@@ -0,0 +1,2 @@
+LDMS Build, Install and RPM Testing on Jenkins
+-----------------------------------------------
diff --git a/rtd/docs/source/deployment/ldms-test.rst b/rtd/docs/source/deployment/ldms-test.rst
new file mode 100644
index 000000000..88966742f
--- /dev/null
+++ b/rtd/docs/source/deployment/ldms-test.rst
@@ -0,0 +1,3 @@
+GitHub Repository for LDMS Functional Testing
+----------------------------------------------
+
diff --git a/rtd/docs/source/images/appsysfusion.png b/rtd/docs/source/images/appsysfusion.png
new file mode 100644
index 000000000..28769ee14
Binary files /dev/null and b/rtd/docs/source/images/appsysfusion.png differ
diff --git a/rtd/docs/source/images/darshanConnector.png b/rtd/docs/source/images/darshanConnector.png
new file mode 100644
index 000000000..173411a3b
Binary files /dev/null and b/rtd/docs/source/images/darshanConnector.png differ
diff --git a/rtd/docs/source/images/grafana/grafana_output.png b/rtd/docs/source/images/grafana/grafana_output.png
new file mode 100644
index 000000000..293eec6e7
Binary files /dev/null and b/rtd/docs/source/images/grafana/grafana_output.png differ
diff --git a/rtd/docs/source/images/grafana/grafana_query.png b/rtd/docs/source/images/grafana/grafana_query.png
new file mode 100644
index 000000000..96acc8693
Binary files /dev/null and b/rtd/docs/source/images/grafana/grafana_query.png differ
diff --git a/rtd/docs/source/images/grafana/grafana_time.png b/rtd/docs/source/images/grafana/grafana_time.png
new file mode 100644
index
000000000..3e15059f0 Binary files /dev/null and b/rtd/docs/source/images/grafana/grafana_time.png differ diff --git a/rtd/docs/source/images/grafana/grafana_timerange.png b/rtd/docs/source/images/grafana/grafana_timerange.png new file mode 100644 index 000000000..73af4aff7 Binary files /dev/null and b/rtd/docs/source/images/grafana/grafana_timerange.png differ diff --git a/rtd/docs/source/images/grafana/grafanapanel.png b/rtd/docs/source/images/grafana/grafanapanel.png new file mode 100644 index 000000000..6e2133537 Binary files /dev/null and b/rtd/docs/source/images/grafana/grafanapanel.png differ diff --git a/rtd/docs/source/images/grafana/grafanapanel_variables.png b/rtd/docs/source/images/grafana/grafanapanel_variables.png new file mode 100644 index 000000000..cc978d890 Binary files /dev/null and b/rtd/docs/source/images/grafana/grafanapanel_variables.png differ diff --git a/rtd/docs/source/images/ldmscon/ldmscon2020tutorial.png b/rtd/docs/source/images/ldmscon/ldmscon2020tutorial.png new file mode 100644 index 000000000..29807db12 Binary files /dev/null and b/rtd/docs/source/images/ldmscon/ldmscon2020tutorial.png differ diff --git a/rtd/docs/source/images/ldmscon/ldmscon2021pres.PNG b/rtd/docs/source/images/ldmscon/ldmscon2021pres.PNG new file mode 100644 index 000000000..6643f2a7d Binary files /dev/null and b/rtd/docs/source/images/ldmscon/ldmscon2021pres.PNG differ diff --git a/rtd/docs/source/images/ldmscon/ldmscon2021tutorial.PNG b/rtd/docs/source/images/ldmscon/ldmscon2021tutorial.PNG new file mode 100644 index 000000000..3e8631434 Binary files /dev/null and b/rtd/docs/source/images/ldmscon/ldmscon2021tutorial.PNG differ diff --git a/rtd/docs/source/images/ldmscon/ldmscon2022pres.PNG b/rtd/docs/source/images/ldmscon/ldmscon2022pres.PNG new file mode 100644 index 000000000..ace4b1891 Binary files /dev/null and b/rtd/docs/source/images/ldmscon/ldmscon2022pres.PNG differ diff --git a/rtd/docs/source/images/ldmscon/ldmscon2022tutorial.PNG b/rtd/docs/source/images/ldmscon/ldmscon2022tutorial.PNG new file mode 100644 index 000000000..695707372 Binary files /dev/null and b/rtd/docs/source/images/ldmscon/ldmscon2022tutorial.PNG differ diff --git a/rtd/docs/source/images/ldmscon/ldmscon2023pres.PNG b/rtd/docs/source/images/ldmscon/ldmscon2023pres.PNG new file mode 100644 index 000000000..12bd57e8e Binary files /dev/null and b/rtd/docs/source/images/ldmscon/ldmscon2023pres.PNG differ diff --git a/rtd/docs/source/images/ldmscon/ldmscon2023tutorial.png b/rtd/docs/source/images/ldmscon/ldmscon2023tutorial.png new file mode 100644 index 000000000..a19458edf Binary files /dev/null and b/rtd/docs/source/images/ldmscon/ldmscon2023tutorial.png differ diff --git a/rtd/docs/source/images/ovis-hpc_homepage.png b/rtd/docs/source/images/ovis-hpc_homepage.png new file mode 100644 index 000000000..a50e74bc6 Binary files /dev/null and b/rtd/docs/source/images/ovis-hpc_homepage.png differ diff --git a/rtd/docs/source/index.rst b/rtd/docs/source/index.rst new file mode 100644 index 000000000..1c5b2b950 --- /dev/null +++ b/rtd/docs/source/index.rst @@ -0,0 +1,67 @@ +.. Copyright 2023 Sandia National Laboratories, LLC + (c.f. AUTHORS, NOTICE.LLNS, COPYING) + + SPDX-License-Identifier: (LGPL-3.0) + +.. Flux documentation master file, created by + sphinx-quickstart on Fri Jan 10 15:11:07 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome To OVIS-HPC Documentation! +==================================== +.. 
+.. image:: https://github.com/ovis-hpc/readthedocs/blob/main/docs/source/images/ovis-logo.png?raw=true
+   :width: 225
+   :height: 250
+   :align: center
+
+**OVIS** is a modular system for HPC data collection, transport, storage,
+analysis, visualization, and log message exploration. The Lightweight
+Distributed Metric Service (**LDMS**) is a scalable, low-overhead,
+low-latency framework for collection, movement, and storage of metric/event
+data on distributed computer systems.
+
+.. toctree::
+   :maxdepth: 2
+   :caption: OVIS and Group Activity
+
+   About Ovis
+   LDMS Users Group Conference (LDMSCON)
+   LDMS Users Group
+   OVIS Publications
+
+.. toctree::
+   :maxdepth: 4
+   :caption: OVIS Components
+
+   ldms-index
+   SOS
+   Maestro
+   Baler
+   ASF
+
+.. toctree::
+   :maxdepth: 6
+   :caption: Deployment
+
+   LDMS
+   SOS
+   Maestro
+   Baler
+   ASF
+
+
+Other Projects
+====================================
+
+`ldms `_
+`ovis-publications `_
+`maestro `_
+`sos `_
+`baler `_
+
+
diff --git a/rtd/docs/source/ldms-index.rst b/rtd/docs/source/ldms-index.rst
new file mode 100644
index 000000000..0c5f28033
--- /dev/null
+++ b/rtd/docs/source/ldms-index.rst
@@ -0,0 +1,46 @@
+LDMS
+======
+
+.. image:: images/ovis-hpc_homepage.png
+   :width: 1000
+   :height: 150
+
+LDMS GitHub: https://github.com/ovis-hpc/ovis
+
+To join the LDMS Users Group Mailing List: https://github.com/ovis-hpc/ovis-wiki/wiki/Mailing-Lists
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Introduction To LDMS
+
+   ldms-quickstart
+   ldms-tutorial
+   ldms-streams
+   container-quickstart
+
+.. toctree::
+   :maxdepth: 2
+   :caption: LDMS Man Pages
+
+   ldms_man/index
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Sampler Plugin Man Pages
+
+   sampler_man/index
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Store Plugin Man Pages
+
+   store_man/index
+
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contributing to LDMS
+
+   contributing/index
+
+
diff --git a/rtd/docs/source/ldms-quickstart.rst b/rtd/docs/source/ldms-quickstart.rst
new file mode 100644
index 000000000..1670aa793
--- /dev/null
+++ b/rtd/docs/source/ldms-quickstart.rst
@@ -0,0 +1,623 @@
+LDMS Quick Start
+###########################
+
+Installation
+*****************
+
+AlmaLinux8
+------------
+
+Prerequisites
+=============
+* AlmaLinux8 (AlmaLinux is binary compatible with RHEL®)
+* openssl-devel
+* gnu compiler
+* swig
+* autoconf
+* libtool
+* readline
+* readline-devel
+* libevent
+* libevent-devel
+* autogen-libopts
+* gettext
+* python3.8
+* python38-Cython
+* python38-libs
+* glib2-devel
+* git
+* bison
+* make
+* byacc
+* flex
+
+Prerequisite Installation
+---------------------------
+The following steps were run on AlmaLinux8 (arm64v8):
+
+.. code-block:: RST
+
+   sudo dnf update -y
+   sudo dnf install -y openssl
+   sudo dnf install -y openssl-devel
+   sudo dnf install -y swig
+   sudo dnf install -y libtool
+   sudo dnf install -y readline
+   sudo dnf install -y readline-devel
+   sudo dnf install -y libevent
+   sudo dnf install -y libevent-devel
+   sudo dnf install -y autogen-libopts
+   sudo dnf install -y gettext
+   sudo dnf install -y glib2
+   sudo dnf install -y glib2-devel
+   sudo dnf install -y git
+   sudo dnf install -y bison
+   sudo dnf install -y make
+   sudo dnf install -y byacc
+   sudo dnf install -y flex
+   sudo dnf install -y python38
+   sudo dnf install -y python38-devel
+   sudo dnf install -y python38-Cython
+   sudo dnf install -y python38-libs
+
+
+RHEL 9
+------------
+
+Prerequisites
+=============
+* RHEL 9
+* openssl-devel
+* pkg-config
+* automake
+* libtool
+* python3 (or higher)
+* python3-devel (or higher)
+* cython
+* bison
+* flex
+
+Prerequisite Installation
+---------------------------
+The following steps were run on a basic RHEL 9 instance via AWS.
+
+.. code-block:: RST
+
+   sudo yum update -y
+   sudo yum install automake -y
+   sudo yum install openssl-devel -y
+   sudo yum install pkg-config -y
+   sudo yum install libtool -y
+   sudo yum install python3 -y
+   sudo yum install python3-devel.x86_64 -y
+   sudo yum install python3-Cython -y
+   sudo yum install make -y
+   sudo yum install bison -y
+   sudo yum install flex -y
+
+
+LDMS Source Installation Instructions
+--------------------------------------
+
+Getting the Source
+==================
+* This example shows cloning into $HOME/Source/ovis-4 and installing into $HOME/ovis/4.4.2
+
+.. code-block:: RST
+
+   mkdir $HOME/Source
+   mkdir $HOME/ovis
+   cd $HOME/Source
+   git clone -b OVIS-4.4.2 https://github.com/ovis-hpc/ovis.git ovis-4
+
+Building the Source
+-----------------------
+
+* Run autogen.sh
+
+.. code-block:: RST
+
+   cd $HOME/Source/ovis-4
+   ./autogen.sh
+
+* Configure and build (builds the default Linux samplers; the installation directory is given by ``--prefix``):
+
+.. code-block:: RST
+
+   mkdir build
+   cd build
+   ../configure --prefix=$HOME/ovis/4.4.2
+   make
+   make install
+
+Basic Configuration and Running
+*******************************
+* Set up the environment:
+
+.. code-block:: RST
+
+   OVIS=$HOME/ovis/4.4.2
+   export LD_LIBRARY_PATH=$OVIS/lib:$LD_LIBRARY_PATH
+   export LDMSD_PLUGIN_LIBPATH=$OVIS/lib/ovis-ldms
+   export ZAP_LIBPATH=$OVIS/lib/ovis-ldms
+   export PATH=$OVIS/sbin:$OVIS/bin:$PATH
+   export PYTHONPATH=$OVIS/lib/python3.8/site-packages
+
+Sampler
+***********************
+* Edit a new configuration file, named ``sampler.conf``, to load the ``meminfo`` and ``vmstat`` samplers. For this example it can be saved anywhere, but it will be used later to start the LDMS daemon (``ldmsd``).
+
+The following configuration uses generic values for the hostname, uid, gid, component id, and permissions octal.
+
+Sampling intervals are specified in microseconds (i.e., 1 sec = 1e+6 µs) and can be adjusted as needed.
+Some suggestions include:
+
+.. list-table:: LDMS Sampler Plugin Interval Settings
+   :widths: 25 25 25
+   :header-rows: 1
+
+   * - Sampler
+     - Seconds (sec)
+     - Microseconds (µs)
+   * - Power
+     - 0.1 sec
+     - 100000 µs
+   * - Meminfo
+     - 1.0 sec
+     - 1000000 µs
+   * - VMstat
+     - 10 sec
+     - 10000000 µs
+   * - Link Status
+     - 60 sec
+     - 60000000 µs
+
+
+.. note::
+   Sampling offset is typically set to 0 for sampler plugins.
+
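+For example, a sampler started with the hypothetical values
+interval=1000000 and offset=50000 (the values used in the
+environment-variable example below) wakes 50 ms past each whole second;
+an aggregator can then use a slightly larger offset to pull after each
+sample completes. The timestamps here are illustrative:
+
+.. code-block:: RST
+
+   # interval=1000000 offset=50000  -> samples at
+   #   12:00:00.050000, 12:00:01.050000, 12:00:02.050000, ...
+   # an updater with interval=1000000 offset=100000 pulls at
+   #   12:00:00.100000, 12:00:01.100000, ...
+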
+.. code-block:: RST
+   :linenos:
+
+   # Meminfo Sampler Plugin using 1 second sampling interval
+   load name=meminfo
+   config name=meminfo producer=host1 instance=host1/meminfo component_id=1 schema=meminfo job_set=host1/jobinfo uid=12345 gid=12345 perm=0755
+   start name=meminfo interval=1000000 offset=0
+   # VMStat Sampler Plugin using 10 second sampling interval
+   load name=vmstat
+   config name=vmstat producer=host1 instance=host1/vmstat component_id=1 schema=vmstat job_set=host1/jobinfo uid=0 gid=0 perm=0755
+   start name=vmstat interval=10000000 offset=0
+
+As an alternative to the configuration above, one may instead export environment variables and reference them in the sampler configuration file.
+
+The following setup will set the samplers to collect at 1-second (i.e., 1000000 µs) intervals:
+
+.. code-block:: RST
+
+   export HOSTNAME=${HOSTNAME:=$(hostname -s)} # Typically already set; set it if not
+   export COMPONENT_ID=1
+   export SAMPLE_INTERVAL=1000000
+   export SAMPLE_OFFSET=50000
+
+.. code-block:: RST
+   :linenos:
+
+   # Meminfo Sampler Plugin using environment variables HOSTNAME, COMPONENT_ID, SAMPLE_INTERVAL, and SAMPLE_OFFSET
+   load name=meminfo
+   config name=meminfo producer=${HOSTNAME} instance=${HOSTNAME}/meminfo component_id=${COMPONENT_ID} schema=meminfo job_set=${HOSTNAME}/jobinfo uid=12345 gid=12345 perm=0755
+   start name=meminfo interval=${SAMPLE_INTERVAL} offset=${SAMPLE_OFFSET}
+   # VMStat Sampler Plugin using environment variables HOSTNAME, COMPONENT_ID, SAMPLE_INTERVAL, and SAMPLE_OFFSET
+   load name=vmstat
+   config name=vmstat producer=${HOSTNAME} instance=${HOSTNAME}/vmstat component_id=${COMPONENT_ID} schema=vmstat job_set=${HOSTNAME}/jobinfo uid=0 gid=0 perm=0755
+   start name=vmstat interval=${SAMPLE_INTERVAL} offset=${SAMPLE_OFFSET}
+
+* Run a daemon using munge authentication:
+
+.. code-block:: RST
+
+   ldmsd -x sock:10444 -c sampler.conf -l /tmp/demo_ldmsd.log -v DEBUG -a munge -r $(pwd)/ldmsd.pid
+
+Or, in non-cluster environments where munge is unavailable:
+
+.. code-block:: RST
+
+   ldmsd -x sock:10444 -c sampler.conf -l /tmp/demo_ldmsd.log -v DEBUG -r $(pwd)/ldmsd.pid
+
+.. note::
+   For the rest of these instructions, omit the "-a munge" option if you do not have munge running. This will also write DEBUG-level information to the specified (-l) log.
+
+* Run ldms_ls on that node to see the sets, metadata, and contents:
+
+.. code-block:: RST
+
+   ldms_ls -h localhost -x sock -p 10444 -a munge
+   ldms_ls -h localhost -x sock -p 10444 -v -a munge
+   ldms_ls -h localhost -x sock -p 10444 -l -a munge
+
+.. note::
+   Note the use of munge. Users will not be able to query a daemon launched with munge unless they also query with munge, and they will only see the sets allowed by the permissions in response to ``ldms_ls``.
+
+Example (note the permissions and update hint):
+
+.. code-block:: RST
+
+   ldms_ls -h localhost -x sock -p 10444 -l -v -a munge
+
+Output:
+
+.. code-block:: RST
+
+   host1/vmstat: consistent, last update: Mon Oct 22 16:58:15 2018 -0600 [1385us]
+   APPLICATION SET INFORMATION ------
+        updt_hint_us : 5000000:0
+   METADATA --------
+       Producer Name : host1
+       Instance Name : host1/vmstat
+         Schema Name : vmstat
+                Size : 5008
+        Metric Count : 110
+                  GN : 2
+                User : root(0)
+               Group : root(0)
+         Permissions : -rwxr-xr-x
+   DATA ------------
+           Timestamp : Mon Oct 22 16:58:15 2018 -0600 [1385us]
+            Duration : [0.000106s]
+          Consistent : TRUE
+                Size : 928
+                  GN : 110
+   -----------------
+   M u64        component_id                               1
+   D u64        job_id                                     0
+   D u64        app_id                                     0
+   D u64        nr_free_pages                              32522123
+   ...
+   D u64        pglazyfree                                 1082699829
+   host1/meminfo: consistent, last update: Mon Oct 22 16:58:15 2018 -0600 [1278us]
+   APPLICATION SET INFORMATION ------
+        updt_hint_us : 5000000:0
+   METADATA --------
+       Producer Name : host1
+       Instance Name : host1/meminfo
+         Schema Name : meminfo
+                Size : 1952
+        Metric Count : 46
+                  GN : 2
+                User : myuser(12345)
+               Group : myuser(12345)
+         Permissions : -rwx------
+   DATA ------------
+           Timestamp : Mon Oct 22 16:58:15 2018 -0600 [1278us]
+            Duration : [0.000032s]
+          Consistent : TRUE
+                Size : 416
+                  GN : 46
+   -----------------
+   M u64        component_id                               1
+   D u64        job_id                                     0
+   D u64        app_id                                     0
+   D u64        MemTotal                                   131899616
+   D u64        MemFree                                    130088492
+   D u64        MemAvailable                               129556912
+   ...
+   D u64        DirectMap1G                                134217728
+
+
+Aggregator Using Data Pull
+**************************
+* Start another sampler daemon with a similar configuration on host2, using component_id=2, as above.
+* Make a configuration file (called agg11.conf) to aggregate from the two samplers at different intervals, with the following contents:
+
+.. code-block:: RST
+   :linenos:
+
+   prdcr_add name=host1 host=host1 type=active xprt=sock port=10444 interval=20000000
+   prdcr_start name=host1
+   updtr_add name=policy_h1 interval=1000000 offset=100000
+   updtr_prdcr_add name=policy_h1 regex=host1
+   updtr_start name=policy_h1
+   prdcr_add name=host2 host=host2 type=active xprt=sock port=10444 interval=20000000
+   prdcr_start name=host2
+   updtr_add name=policy_h2 interval=2000000 offset=100000
+   updtr_prdcr_add name=policy_h2 regex=host2
+   updtr_start name=policy_h2
+
+* On host3, set up the environment as above and run a daemon:
+
+.. code-block:: RST
+
+   ldmsd -x sock:10445 -c agg11.conf -l /tmp/demo_ldmsd.log -v ERROR -a munge
+
+
+* Run ``ldms_ls`` on the aggregator node to see the set listing:
+
+.. code-block:: RST
+
+   ldms_ls -h localhost -x sock -p 10445 -a munge
+
+Output:
+
+.. code-block:: RST
+
+   host1/meminfo
+   host1/vmstat
+   host2/meminfo
+   host2/vmstat
+
+You can also run ``ldms_ls`` to query the ldms daemon on the remote node:
+
+.. code-block:: RST
+
+   ldms_ls -h host1 -x sock -p 10444 -a munge
+
+Output:
+
+.. code-block:: RST
+
+   host1/meminfo
+   host1/vmstat
+
+
+.. note::
+   ``ldms_ls -l`` shows the detailed output, including timestamps. This can be used to verify that the aggregator is aggregating the two hosts' sets at different intervals.
+
+Aggregator Using Data Push
+**************************
+* Use the same sampler configurations as above.
+* Make a configuration file (called agg11_push.conf) to cause the two samplers to push their data to the aggregator as they update.
+
+  * Note that the prdcr configs remain the same as above, but the updtr_add includes the additional options: push=onchange auto_interval=false.
+
+  * Note that the updtr_add interval has no effect in this case, but is currently required due to syntax checking.
+
+.. code-block:: RST
+
+   prdcr_add name=host1 host=host1 type=active xprt=sock port=10444 interval=20000000
+   prdcr_start name=host1
+   prdcr_add name=host2 host=host2 type=active xprt=sock port=10444 interval=20000000
+   prdcr_start name=host2
+   updtr_add name=policy_all interval=5000000 push=onchange auto_interval=false
+   updtr_prdcr_add name=policy_all regex=.*
+   updtr_start name=policy_all
+
+
+* On host3, set up the environment as above and run a daemon:
+
+.. code-block:: RST
+
+   ldmsd -x sock:10445 -c agg11_push.conf -l /tmp/demo_ldmsd_log -v DEBUG -a munge
+
+* Run ldms_ls on the aggregator node to see the set listing:
+
+.. code-block:: RST
+
+   ldms_ls -h localhost -x sock -p 10445 -a munge
+
+Output:
+
+.. code-block:: RST
+
+   host1/meminfo
+   host1/vmstat
+   host2/meminfo
+   host2/vmstat
+
+
+Two Aggregators Configured as Failover Pairs
+********************************************
+* Use the same sampler configurations as above.
+* Make a configuration file (called agg11.conf) to aggregate from one sampler, with the following contents:
+
+.. code-block:: RST
+
+   prdcr_add name=host1 host=host1 type=active xprt=sock port=10444 interval=20000000
+   prdcr_start name=host1
+   updtr_add name=policy_all interval=1000000 offset=100000
+   updtr_prdcr_add name=policy_all regex=.*
+   updtr_start name=policy_all
+   failover_config host=host3 port=10446 xprt=sock type=active interval=1000000 peer_name=agg12 timeout_factor=2
+   failover_start
+
+* On host3, set up the environment as above and run two daemons as follows:
+
+.. code-block:: RST
+
+   ldmsd -x sock:10445 -c agg11.conf -l /tmp/demo_ldmsd_log -v ERROR -n agg11 -a munge
+   ldmsd -x sock:10446 -c agg12.conf -l /tmp/demo_ldmsd_log -v ERROR -n agg12 -a munge
+
+* Run ldms_ls on each aggregator node to see the set listing:
+
+.. code-block:: RST
+
+   ldms_ls -h localhost -x sock -p 10445 -a munge
+   host1/meminfo
+   host1/vmstat
+   ldms_ls -h localhost -x sock -p 10446 -a munge
+   host2/meminfo
+   host2/vmstat
+
+* Kill one daemon:
+
+.. code-block:: RST
+
+   kill -SIGTERM <ldmsd PID>
+
+* Make sure it has exited.
+* Run ldms_ls on the remaining aggregator to see the set listing:
+
+.. code-block:: RST
+
+   ldms_ls -h localhost -x sock -p 10446 -a munge
+
+Output:
+
+.. code-block:: RST
+
+   host1/meminfo
+   host1/vmstat
+   host2/meminfo
+   host2/vmstat
+
+Set Groups
+***********************
+A set group is an LDMS set with special information that represents a group of sets inside ldmsd. A set group appears as a regular LDMS set to other LDMS applications, but ldmsd and ``ldms_ls`` treat it as a collection of LDMS sets. If an ldmsd updater (updtr) updates a set group, it subsequently updates all of the member sets. Performing ``ldms_ls -l`` on a set group likewise performs a long query of all the sets in the group.
+
+To illustrate how a set group works, the following subsections configure 2 sampler daemons with set groups and 1 aggregator daemon that updates and stores the groups.
+
+Creating a set group and inserting sets into it
+***********************************************
+The following is a configuration file for our s0 LDMS daemon (sampler #0) that collects sda disk stats in the s0/sda set and lo network usage in the s0/lo set. The s0/grp set group is created to contain both s0/sda and s0/lo.
+
+.. code-block:: RST
+
+   ### s0.conf
+   load name=procdiskstats
+   config name=procdiskstats device=sda producer=s0 instance=s0/sda
+   start name=procdiskstats interval=1000000 offset=0
+
+   load name=procnetdev
+   config name=procnetdev ifaces=lo producer=s0 instance=s0/lo
+   start name=procnetdev interval=1000000 offset=0
+
+   setgroup_add name=s0/grp producer=s0 interval=1000000 offset=0
+   setgroup_ins name=s0/grp instance=s0/sda,s0/lo
+
+The following is the same for the s1 sampler daemon, but with different devices (sdb and eno1).
+
+.. code-block:: RST
+
+   ### s1.conf
+   load name=procdiskstats
+   config name=procdiskstats device=sdb producer=s1 instance=s1/sdb
+   start name=procdiskstats interval=1000000 offset=0
+
+   load name=procnetdev
+   config name=procnetdev ifaces=eno1 producer=s1 instance=s1/eno1
+   start name=procnetdev interval=1000000 offset=0
+
+   setgroup_add name=s1/grp producer=s1 interval=1000000 offset=0
+   setgroup_ins name=s1/grp instance=s1/sdb,s1/eno1
+
+The s0 LDMS daemon is listening on port 10000 and the s1 LDMS daemon is listening on port 10001.
+
+Perform ``ldms_ls`` on a group
+******************************
+Performing ``ldms_ls -v`` or ``ldms_ls -l`` on an LDMS daemon hosting a group performs the query on the set representing the group itself, as well as iteratively querying the group's members.
+
+Example:
+
+.. code-block:: RST
+
+   ldms_ls -h localhost -x sock -p 10000
+
+To query only the group and its members, and check their consistency:
+
+.. code-block:: RST
+
+   ldms_ls -h localhost -x sock -p 10000 -v s0/grp | grep consistent
+
+Output:
+
+.. code-block:: RST
+
+   s0/grp: consistent, last update: Mon May 20 15:44:30 2019 -0500 [511879us]
+   s0/lo: consistent, last update: Mon May 20 16:13:16 2019 -0500 [1126us]
+   s0/sda: consistent, last update: Mon May 20 16:13:17 2019 -0500 [1176us]
+
+.. code-block:: RST
+
+   ldms_ls -h localhost -x sock -p 10000 -v s0/lo | grep consistent # only query the lo set from set group s0
+
+.. note::
+   The update time of the group set is the time that the last set was inserted into the group.
+
+Update / store with set group
+*****************************
+The following is an example of an aggregator configuration that match-updates only the set groups, and their members, with storage policies:
+
+.. code-block:: RST
+
+   # Stores
+   load name=store_csv
+   config name=store_csv path=csv
+   # strgp for netdev, csv file: "./csv/net/procnetdev"
+   strgp_add name=store_net plugin=store_csv container=net schema=procnetdev
+   strgp_prdcr_add name=store_net regex=.*
+   strgp_start name=store_net
+   # strgp for diskstats, csv file: "./csv/disk/procdiskstats"
+   strgp_add name=store_disk plugin=store_csv container=disk schema=procdiskstats
+   strgp_prdcr_add name=store_disk regex=.*
+   strgp_start name=store_disk
+
+   # Updater that updates only groups
+   updtr_add name=u interval=1000000 offset=500000
+   updtr_match_add name=u regex=ldmsd_grp_schema match=schema
+   updtr_prdcr_add name=u regex=.*
+   updtr_start name=u
+
+Performing ``ldms_ls`` on the LDMS aggregator daemon exposes all the sets (including groups):
+
+.. code-block:: RST
+
+   ldms_ls -h localhost -x sock -p 9000
+
+Output:
+
+.. code-block:: RST
+
+   s1/sdb
+   s1/grp
+   s1/eno1
+   s0/sda
+   s0/lo
+   s0/grp
+
+Performing ``ldms_ls -v`` on an LDMS daemon hosting a group again, but only querying the group and its members:
+
+.. code-block:: RST
+
+   ldms_ls -h localhost -x sock -p 9000 -v s1/grp | grep consistent
+
+Output:
+
+..
code-block:: RST + + s1/grp: consistent, last update: Mon May 20 15:42:34 2019 -0500 [891643us] + s1/sdb: consistent, last update: Mon May 20 16:38:38 2019 -0500 [1805us] + s1/eno1: consistent, last update: Mon May 20 16:38:38 2019 -0500 [1791us] + + +The following is an example of the CSV output: + +.. code-block:: RST + + > head csv/*/* + +.. code-block:: RST + + #Time,Time_usec,ProducerName,component_id,job_id,app_id,reads_comp#sda,reads_comp.rate#sda,reads_merg#sda,reads_merg.rate#sda,sect_read#sda,sect_read.rate#sda,time_read#sda,time_read.rate#sda,writes_comp#sda,writes_comp.rate#sda,writes_merg#sda,writes_merg.rate#sda,sect_written#sda,sect_written.rate#sda,time_write#sda,time_write.rate#sda,ios_in_progress#sda,ios_in_progress.rate#sda,time_ios#sda,time_ios.rate#sda,weighted_time#sda,weighted_time.rate#sda,disk.byte_read#sda,disk.byte_read.rate#sda,disk.byte_written#sda,disk.byte_written.rate#sda + 1558387831.001731,1731,s0,0,0,0,197797,0,9132,0,5382606,0,69312,0,522561,0,446083,0,418086168,0,966856,0,0,0,213096,0,1036080,0,1327776668,0,1380408297,0 + 1558387832.001943,1943,s1,0,0,0,108887,0,32214,0,1143802,0,439216,0,1,0,0,0,8,0,44,0,0,0,54012,0,439240,0,1309384656,0,1166016512,0 + 1558387832.001923,1923,s0,0,0,0,197797,0,9132,0,5382606,0,69312,0,522561,0,446083,0,418086168,0,966856,0,0,0,213096,0,1036080,0,1327776668,0,1380408297,0 + 1558387833.001968,1968,s1,0,0,0,108887,0,32214,0,1143802,0,439216,0,1,0,0,0,8,0,44,0,0,0,54012,0,439240,0,1309384656,0,1166016512,0 + 1558387833.001955,1955,s0,0,0,0,197797,0,9132,0,5382606,0,69312,0,522561,0,446083,0,418086168,0,966856,0,0,0,213096,0,1036080,0,1327776668,0,1380408297,0 + 1558387834.001144,1144,s1,0,0,0,108887,0,32214,0,1143802,0,439216,0,1,0,0,0,8,0,44,0,0,0,54012,0,439240,0,1309384656,0,1166016512,0 + 1558387834.001121,1121,s0,0,0,0,197797,0,9132,0,5382606,0,69312,0,522561,0,446083,0,418086168,0,966856,0,0,0,213096,0,1036080,0,1327776668,0,1380408297,0 + 1558387835.001179,1179,s0,0,0,0,197797,0,9132,0,5382606,0,69312,0,522561,0,446083,0,418086168,0,966856,0,0,0,213096,0,1036080,0,1327776668,0,1380408297,0 + 1558387835.001193,1193,s1,0,0,0,108887,0,32214,0,1143802,0,439216,0,1,0,0,0,8,0,44,0,0,0,54012,0,439240,0,1309384656,0,1166016512,0 + + ==> csv/net/procnetdev <== + #Time,Time_usec,ProducerName,component_id,job_id,app_id,rx_bytes#lo,rx_packets#lo,rx_errs#lo,rx_drop#lo,rx_fifo#lo,rx_frame#lo,rx_compressed#lo,rx_multicast#lo,tx_bytes#lo,tx_packets#lo,tx_errs#lo,tx_drop#lo,tx_fifo#lo,tx_colls#lo,tx_carrier#lo,tx_compressed#lo + 1558387831.001798,1798,s0,0,0,0,12328527,100865,0,0,0,0,0,0,12328527,100865,0,0,0,0,0,0 + 1558387832.001906,1906,s0,0,0,0,12342153,100925,0,0,0,0,0,0,12342153,100925,0,0,0,0,0,0 + 1558387832.001929,1929,s1,0,0,0,3323644475,2865919,0,0,0,0,0,12898,342874081,1336419,0,0,0,0,0,0 + 1558387833.002001,2001,s0,0,0,0,12346841,100939,0,0,0,0,0,0,12346841,100939,0,0,0,0,0,0 + 1558387833.002025,2025,s1,0,0,0,3323644475,2865919,0,0,0,0,0,12898,342874081,1336419,0,0,0,0,0,0 + 1558387834.001106,1106,s0,0,0,0,12349089,100953,0,0,0,0,0,0,12349089,100953,0,0,0,0,0,0 + 1558387834.001130,1130,s1,0,0,0,3323647234,2865923,0,0,0,0,0,12898,342875727,1336423,0,0,0,0,0,0 + 1558387835.001247,1247,s0,0,0,0,12351337,100967,0,0,0,0,0,0,12351337,100967,0,0,0,0,0,0 + 1558387835.001274,1274,s1,0,0,0,3323647298,2865924,0,0,0,0,0,12898,342875727,1336423,0,0,0,0,0,0 + + diff --git a/rtd/docs/source/ldms-streams.rst b/rtd/docs/source/ldms-streams.rst new file mode 100644 index 000000000..3f4659a05 --- /dev/null +++ 
b/rtd/docs/source/ldms-streams.rst
@@ -0,0 +1,836 @@
+Streams-enabled Application Data Collectors
+###########################################
+
+Caliper
+***********************
+
+This section covers the basic steps of how to compile, build, and use the caliperConnector.
+
+**What Is Caliper?**
+
+A program instrumentation and performance measurement framework that allows users to implement analysis capabilities (e.g. performance profiling, tracing, monitoring, and auto-tuning) into their applications using Caliper's annotation API.
+
+**What Is the caliperConnector?**
+
+A Caliper-LDMS functionality that utilizes LDMS Streams to collect Caliper-related data and absolute timestamps during runtime. It formats the data to a JSON message and *publishes* it to an LDMS streams interface.
+
+Setup & Configuration
+----------------------
+Build the Caliper program with the application you wish to analyze. No modifications to Caliper's instrumentation are required to integrate LDMS, so you just need to follow the build and install instructions from `Caliper's Build and Install Webpage `_
+
+Once built, you will need to point $LD_LIBRARY_PATH to Caliper's library:
+
+.. code-block:: RST
+
+   LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path to caliper installation>/lib64
+
+Now, to enable LDMS data collection, set (or export) the following list of Caliper variables, with ``ldms`` included in the enabled services, when executing a program. An example is shown below:
+
+.. code-block:: RST
+
+   export CALI_SERVICES_ENABLE=loop_monitor,mpi,ldms
+   CALI_LOOP_MONITOR_ITERATION_INTERVAL=10 ./caliper_example.o 400
+
+The ``CALI_LOOP_MONITOR_ITERATION_INTERVAL`` collects measurements every n loop iterations of the application, and ``CALI_SERVICES_ENABLE`` defines which services will be combined to collect the data.
+
+Once done, you just need to execute your program, and application data will be collected by Caliper and LDMS.
+
+.. note::
+
+   The MPI service (i.e., mpi) is required when enabling LDMS because it is used for associating the MPI rank data collected by LDMS.
+
+LDMS Expected Output
+--------------------
+LDMS collects a set of runtime timeseries data of the application in parallel with Caliper. Below is an example output of the data collected, formatted into a JSON string:
+
+.. code-block::
+
+   {"job_id":11878171,"ProducerName":"n1","rank":0,"timestamp":1670373198.056455,"region":"init","time":33.172237 }
+   {"job_id":11878171,"ProducerName":"n1","rank":0,"timestamp":1670373198.056455,"region":"initialization","time":33.211929 }
+   {"job_id":11878171,"ProducerName":"n1","rank":0,"timestamp":1670373198.056455,"region":"main","time":44.147736 }
+   {"job_id":11878171,"ProducerName":"n1","rank":0,"timestamp":1670373203.556555,"region":"main","time":0.049086 }
+   {"job_id":11878171,"ProducerName":"n1","rank":0,"timestamp":1670373203.556555,"region":"run","time":0.049086 }
+
+Any data collected by LDMS should have the same fields as the one shown above and can be viewed in a CSV file **if** the LDMS csv_store plugin is configured in the LDMSD aggregator.
+
+.. note::
+   More information about starting and configuring an LDMS daemon to store to CSV can be found in `Run An LDMS Streams Daemon`_ or `LDMS Quickstart `_.
+
+
+
+Darshan
+***********************
+This section covers the basic steps of how to compile, build, and use the Darshan-LDMS Integration code (i.e. darshanConnector). The following application tests are part of the Darshan program and can be found under ``/darshan/darshan-test/regression/test-cases/src/``.
+
+**What Is Darshan?**
+
+A lightweight I/O characterization tool that transparently captures application I/O behavior from HPC applications with minimal overhead.
+
+**What Is The darshanConnector?**
+
+A Darshan-LDMS functionality that utilizes LDMS Streams to collect Darshan's original I/O tracing, Darshan's eXtended tracing (DXT), and absolute timestamps during runtime. It formats the data to a JSON message and *publishes* it to an LDMS streams interface. This data is a timeseries (i.e. an absolute timestamp is collected) that contains information about each individual I/O event.
+
+.. figure:: ../images/darshanConnector.png
+
+   The diagram above provides a high-level visualization of the darshanConnector. During the Darshan initialization, the connector (on the left-hand side) checks whether Darshan has been built against the LDMS library; if it has, it initializes a connection to the LDMS streams daemon when DARSHAN_LDMS_ENABLE is set. Once initialized, the connector knows which module data to collect by checking which environment variables are set. For example, if DARSHAN_LDMS_ENABLE_MPIIO is set, that specific I/O event data will be collected. The runtime data collection and JSON message formatting are then performed in the darshan-ldms connector send function. This function is triggered whenever an I/O event occurs. The data is then published to the LDMS streams interface and sent through the LDMS transport to be stored in a database; the JSON formatted message is shown at the very bottom left. Meanwhile, on the right, Darshan runs as usual: initializing its modules, collecting the I/O event data for these modules, aggregating and calculating the data, and then outputting the information into a Darshan log file. As you can see, the LDMS Streams implementation does not interfere with Darshan.
+
+.. note::
+
+   LDMS must already be installed on the system or locally. If it is not, then please follow ``Getting The Source`` and ``Building The Source`` in the `LDMS Quickstart Guide `_. If the Darshan-LDMS code is already deployed on your system, please skip to `Run An LDMS Streams Daemon`_
+
+**Metric Definitions**
+
+Below is the list of Darshan metrics that are currently collected by the darshanConnector:
+
+* ``schema:`` Schema name of the data collected by the darshanConnector. This is an LDMS related metric and is only used for storing the data to the correct location in DSOS.
+
+* ``module:`` Name of the Darshan module data being collected.
+
+* ``uid:`` User ID of the job run.
+
+* ``exe:`` Full path to the application executable. Only set to the full path when the "type" metric is set to "MET". Otherwise it is set to N/A.
+
+* ``ProducerName:`` Name of the compute node the application is running on.
+
+* ``switches:`` Number of times access alternated between read and write.
+
+* ``file:`` Path to the filename of the I/O operations. Only set to the full path when the "type" metric is set to "MET". Otherwise it is set to N/A.
+
+* ``rank:`` Rank of the process at I/O.
+
+* ``flushes:`` Number of times the "flush" operation was performed. For H5F and H5D it is the HDF5 file flush and dataset flush operation counts, respectively.
+
+* ``record_id:`` Darshan file record ID of the file the dataset belongs to.
+
+* ``max_byte:`` Highest offset byte read and written (i.e. Darshan's "\_MAX_BYTE_*" parameter).
+
+* ``type:`` The type of JSON data being published.
+  It is either set to ``MOD`` for gathering "module" data or ``MET`` for gathering static "meta" data (i.e. record id, rank, etc.).
+
+* ``job_id:`` The Job ID of the application run.
+
+* ``op:`` Type of operation being performed (i.e. read, open, close, write).
+
+* ``cnt:`` The count of the operations ("op" field) performed per module per rank. Resets to 0 after each "close" operation.
+
+* ``seg:`` Contains the following array metrics from the operation ("op" field):
+
+  ``pt_sel: HDF5 number of different access selections.
+  reg_hslab: HDF5 number of regular hyperslabs.
+  irreg_hslab: HDF5 number of irregular hyperslabs.
+  ndims: HDF5 number of dimensions in dataset's dataspace.
+  npoints: HDF5 number of points in dataset's dataspace.
+  off: Cumulative total bytes read and cumulative total bytes written, respectively, for each module per rank. (i.e. Darshan's "offset" DXT parameter)
+  len: Number of bytes read/written for the given operation per rank.
+  start: Start time (seconds) of each I/O operation performed for the given rank.
+  dur: Duration of each operation performed for the given rank. (i.e. a rank takes "X" time to perform a r/w/o/c operation.)
+  total: Cumulative time since the application run after the I/O operation (i.e. start of application + dur)
+  timestamp: End time of the given operation (i.e. "op" field) for the given rank (i.e. "rank" field). In epoch time.``
+
+For all metric fields that don't apply to a module, a value of ``-1`` is given.
+
+All data fields that do not change throughout the entire application run (i.e. they are constant unless the darshanConnector is reconnected/restarted) are listed below:
+
+* ``ProducerName``
+* ``job_id``
+* ``schema``
+* ``exe``
+* ``uid``
+
+
+Compile and Build with LDMS
+---------------------------
+1. Run the following to build Darshan and link against an existing LDMS library on the system.
+
+.. code-block:: RST
+
+   git clone https://github.com/darshan-hpc/darshan.git
+   cd darshan && mkdir build/
+   ./prepare.sh && cd build/
+   ../configure CC=<mpi_compiler_wrapper> \
+   --with-log-path-by-env=LOGFILE_PATH_DARSHAN \
+   --prefix=<install path>/darshan \
+   --with-jobid-env=<job ID env var> \
+   --enable-ldms-mod \
+   --with-ldms=<LDMS install path>
+   make && make install
+
+.. note::
+
+   * This configuration is specific to the system. ``CC=<mpi_compiler_wrapper>`` should be replaced by the compiler wrapper for your MPI library (e.g., ``mpicc`` for Open MPI, or ``cc`` for Cray Development Environment MPI wrappers).
+   * If running an MPI program, make sure an MPI library is installed/loaded on the system.
+     For more information on how to install and build the code across various platforms, please visit `Darshan's Runtime Installation Page `_
+   * ``--with-jobid-env=`` expects a string that is the environment variable that the hosted job scheduler utilizes on the HPC system. (e.g., Slurm would use ``--with-jobid-env=SLURM_JOB_ID``)
+
+2. **OPTIONAL** To build the HDF5 module for Darshan, you must first load the HDF5 modulefile with ``module load hdf5-parallel``, then run configure as follows:
+
+.. code-block:: RST
+
+   ../configure CC=<mpi_compiler_wrapper> \
+   --with-log-path-by-env=LOGFILE_PATH_DARSHAN \
+   --prefix=<install path>/darshan \
+   --with-jobid-env=<job ID env var> \
+   --enable-ldms-mod \
+   --with-ldms=<LDMS install path> \
+   --enable-hdf5-mod \
+   --with-hdf5=<HDF5 install path>
+   make && make install
+
+2a. **OPTIONAL** If you do not have HDF5 installed on your system, you may install Python's ``h5py`` package with:
+
+.. code-block:: RST
+
+   sudo apt-get install -y hdf5-tools libhdf5-openmpi-dev openmpi-bin
+   # we need to build h5py with the system HDF5 lib backend
+   export HDF5_MPI="ON"
+   CC=cc python -m pip install --no-binary=h5py h5py
+
+.. note::
+
+   If the HDF5 library is installed this way, you do not need to include the ``--with-hdf5`` flag during configuration. For more information on other methods and HDF5 versions to install, please visit `Darshan's Runtime Installation Page `_.
+
+
+Run an LDMS Streams Daemon
+---------------------------
+This section goes over how to start and configure a simple LDMS Streams daemon to collect the Darshan data and store it to a CSV file.
+If an LDMS Streams daemon is already running on the system then please skip to `Test the Darshan-LDMS Integrated Code (Multi Node)`_.
+
+1. First, initialize an ldms streams daemon on a compute node as follows:
+
+.. code-block:: RST
+
+   salloc -N 1 --time=2:00:00 -p <partition>
+   *ssh to node*
+
+2. Once on the compute node (interactive session), set up the environment for starting an LDMS daemon:
+
+.. code-block:: RST
+
+   LDMS_INSTALL=<path to LDMS installation>
+   export LD_LIBRARY_PATH="$LDMS_INSTALL/lib/:$LDMS_INSTALL/lib:$LD_LIBRARY_PATH"
+   export LDMSD_PLUGIN_LIBPATH="$LDMS_INSTALL/lib/ovis-ldms/"
+   export ZAP_LIBPATH="$LDMS_INSTALL/lib/ovis-ldms"
+   export PATH="$LDMS_INSTALL/sbin:$LDMS_INSTALL/bin:$PATH"
+   export PYTHONPATH=<path to LDMS python site-packages>
+   export COMPONENT_ID="1"
+   export SAMPLE_INTERVAL="1000000"
+   export SAMPLE_OFFSET="0"
+   export HOSTNAME="localhost"
+
+.. note::
+
+   LDMS must already be installed on the system or locally. If it is not, then please follow ``Getting The Source`` and ``Building The Source`` in the `LDMS Quickstart Guide `_.
+
+3. Next, create a file called **"darshan_stream_store.conf"** and add the following content to it:
+
+.. code-block:: RST
+
+   load name=hello_sampler
+   config name=hello_sampler producer=${HOSTNAME} instance=${HOSTNAME}/hello_sampler stream=darshanConnector component_id=${COMPONENT_ID}
+   start name=hello_sampler interval=${SAMPLE_INTERVAL} offset=${SAMPLE_OFFSET}
+
+   load name=stream_csv_store
+   config name=stream_csv_store path=./streams/store container=csv stream=darshanConnector rolltype=3 rollover=500000
+
+4. Next, run the LDMS Streams daemon with the following command:
+
+.. code-block:: RST
+
+   ldmsd -x sock:10444 -c darshan_stream_store.conf -l /tmp/darshan_stream_store.log -v DEBUG -r ldmsd.pid
+
+.. note::
+
+   To check that the ldmsd daemon is running, run ``ps auwx | grep ldmsd | grep -v grep``, ``ldms_ls -h <hostname> -x sock -p <port> -a none -v`` or ``cat /tmp/darshan_stream_store.log``, where <hostname> is the node where the LDMS daemon is running and <port> is the port number it is listening on.
+
+Test the Darshan-LDMS Integrated Code (Multi Node)
+--------------------------------------------------
+This section gives step-by-step instructions on how to test the Darshan-LDMS Integrated code (i.e. darshanConnector) by executing a simple test application provided by Darshan.
+
+Set The Environment
+////////////////////
+1. Once the LDMS streams daemon is initialized, **open another terminal window (login node)** and set the following environment variables before running an application test with Darshan:
+
+.. code-block:: RST
+
+   export DARSHAN_INSTALL_PATH=<path to darshan install>
+   export LD_PRELOAD=$DARSHAN_INSTALL_PATH/lib/libdarshan.so
+   export LD_LIBRARY_PATH=$DARSHAN_INSTALL_PATH/lib:$LD_LIBRARY_PATH
+   # optional. Please visit Darshan's webpage for more information.
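+   # The DXT_* modules below enable Darshan eXtended Tracing, which produces
+   # the per-operation records that the darshanConnector publishes.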
+   export DARSHAN_MOD_ENABLE="DXT_POSIX,DXT_MPIIO"
+
+   # uncomment if hdf5 is enabled
+   #export C_INCLUDE_PATH=$C_INCLUDE_PATH:/usr/include/hdf5/openmpi
+   #export HDF5_LIB=<path to HDF5 install>/lib/libhdf5.so
+
+   # set env variables for ldms streams daemon testing
+   export DARSHAN_LDMS_STREAM=darshanConnector
+   export DARSHAN_LDMS_XPRT=sock
+   export DARSHAN_LDMS_HOST=<hostname where the LDMS daemon is running>
+   export DARSHAN_LDMS_PORT=10444
+   export DARSHAN_LDMS_AUTH=none
+
+   # enable LDMS data collection. No runtime data collection will occur if this is not exported.
+   export DARSHAN_LDMS_ENABLE=
+
+   # determine which modules we want to publish to ldmsd
+   #export DARSHAN_LDMS_ENABLE_MPIIO=
+   #export DARSHAN_LDMS_ENABLE_POSIX=
+   #export DARSHAN_LDMS_ENABLE_STDIO=
+   #export DARSHAN_LDMS_ENABLE_HDF5=
+   #export DARSHAN_LDMS_ENABLE_ALL=
+   #export DARSHAN_LDMS_VERBOSE=
+
+.. note::
+
+   ``DARSHAN_LDMS_HOST`` is set to the name of the node the LDMS Streams daemon is running on (e.g. the node we previously ssh'd into). Make sure ``LD_PRELOAD`` and at least one of the ``DARSHAN_LDMS_ENABLE_*`` variables are set. If not, no data will be collected by LDMS.
+
+.. note::
+
+   ``DARSHAN_LDMS_VERBOSE`` outputs the JSON formatted messages sent to the LDMS streams daemon. The output will be sent to STDERR.
+
+Execute Test Application
+/////////////////////////
+Now we will test the darshanConnector with Darshan's example ``mpi-io-test.c`` code by setting the following environment variables:
+
+.. code-block:: RST
+
+   export PROG=mpi-io-test
+   export DARSHAN_TMP=/tmp/darshan-ldms-test
+   export DARSHAN_TESTDIR=<path to>/darshan/darshan-test/regression
+   export DARSHAN_LOGFILE_PATH=$DARSHAN_TMP
+
+Now ``cd`` to the test source, then build and run the application with the darshanConnector enabled:
+
+.. code-block:: RST
+
+   cd darshan/darshan-test/regression/test-cases/src
+   <mpi_compiler_wrapper> $DARSHAN_TESTDIR/test-cases/src/${PROG}.c -o $DARSHAN_TMP/${PROG}
+   cd $DARSHAN_TMP
+   srun ./${PROG} -f $DARSHAN_TMP/${PROG}.tmp.dat
+
+Once the application is complete, to view the data please skip to `Check Results`_.
+
+Test the Darshan-LDMS Integrated Code (Single Node)
+----------------------------------------------------
+This section goes over step-by-step instructions on how to compile and execute the ``mpi-io-test.c`` program under ``darshan/darshan-test/regression/test-cases/src/``, collect the data with the LDMS streams daemon, and store it to a CSV file on a single login node. This section is for those who will not be running their applications on a cluster (i.e. no compute nodes).
+
+1. Set environment variables for Darshan, LDMS, and the Darshan-LDMS Integrated code (i.e. darshanConnector).
+
+.. code-block:: RST
+
+   # Darshan
+   export DARSHAN_INSTALL_PATH=<path to darshan install>
+   export LD_PRELOAD=$DARSHAN_INSTALL_PATH/lib/libdarshan.so
+   export LD_LIBRARY_PATH=$DARSHAN_INSTALL_PATH/lib:$LD_LIBRARY_PATH
+   # Optional. Please visit Darshan's runtime webpage for more information.
+   #export DARSHAN_MOD_ENABLE="DXT_POSIX,DXT_MPIIO"
+
+   # uncomment if hdf5 is enabled
+   #export C_INCLUDE_PATH=$C_INCLUDE_PATH:/usr/include/hdf5/openmpi
+   #export HDF5_LIB=<path to HDF5 install>/lib/libhdf5.so
+
+   # LDMS
+   LDMS_INSTALL=<path to LDMS installation>
+   export LD_LIBRARY_PATH="$LDMS_INSTALL/lib/:$LDMS_INSTALL/lib:$LD_LIBRARY_PATH"
+   export LDMSD_PLUGIN_LIBPATH="$LDMS_INSTALL/lib/ovis-ldms/"
+   export ZAP_LIBPATH="$LDMS_INSTALL/lib/ovis-ldms"
+   export PATH="$LDMS_INSTALL/sbin:$LDMS_INSTALL/bin:$PATH"
+   export PYTHONPATH=<path to LDMS python site-packages>
+   export COMPONENT_ID="1"
+   export SAMPLE_INTERVAL="1000000"
+   export SAMPLE_OFFSET="0"
+   export HOSTNAME="localhost"
+
+   # darshanConnector
+   export DARSHAN_LDMS_STREAM=darshanConnector
+   export DARSHAN_LDMS_XPRT=sock
+   export DARSHAN_LDMS_HOST=localhost
+   export DARSHAN_LDMS_PORT=10444
+   export DARSHAN_LDMS_AUTH=none
+
+   # enable LDMS data collection. No runtime data collection will occur if this is not exported.
+   export DARSHAN_LDMS_ENABLE=
+
+   # determine which modules we want to publish to ldmsd
+   #export DARSHAN_LDMS_ENABLE_MPIIO=
+   #export DARSHAN_LDMS_ENABLE_POSIX=
+   #export DARSHAN_LDMS_ENABLE_STDIO=
+   #export DARSHAN_LDMS_ENABLE_HDF5=
+   #export DARSHAN_LDMS_ENABLE_ALL=
+   #export DARSHAN_LDMS_VERBOSE=
+
+.. note::
+
+   ``DARSHAN_LDMS_VERBOSE`` outputs the JSON formatted messages sent to the LDMS streams daemon. The output will be sent to STDERR.
+
+2. Generate the LDMSD configuration file and start the daemon:
+
+.. code-block:: RST
+
+   cat > darshan_stream_store.conf << EOF
+   load name=hello_sampler
+   config name=hello_sampler producer=${HOSTNAME} instance=${HOSTNAME}/hello_sampler stream=darshanConnector component_id=${COMPONENT_ID}
+   start name=hello_sampler interval=${SAMPLE_INTERVAL} offset=${SAMPLE_OFFSET}
+
+   load name=stream_csv_store
+   config name=stream_csv_store path=./streams/store container=csv stream=darshanConnector rolltype=3 rollover=500000
+   EOF
+
+   ldmsd -x sock:10444 -c darshan_stream_store.conf -l /tmp/darshan_stream_store.log -v DEBUG
+   # check daemon is running
+   ldms_ls -p 10444 -h localhost -v
+
+3. Set up the test case variables:
+
+.. code-block:: RST
+
+   export PROG=mpi-io-test
+   export DARSHAN_TMP=/tmp/darshan-ldms-test
+   export DARSHAN_TESTDIR=<path to>/darshan/darshan-test/regression
+   export DARSHAN_LOGFILE_PATH=$DARSHAN_TMP
+
+4. Run Darshan's mpi-io-test.c program:
+
+.. code-block:: RST
+
+   cd darshan/darshan-test/regression/test-cases/src
+   <mpi_compiler_wrapper> $DARSHAN_TESTDIR/test-cases/src/${PROG}.c -o $DARSHAN_TMP/${PROG}
+   cd $DARSHAN_TMP
+   ./${PROG} -f $DARSHAN_TMP/${PROG}.tmp.dat
+
+Once the application is complete, to view the data please skip to `Check Results`_.
+
+Pre-Installed Darshan-LDMS
+---------------------------
+If both the Darshan-LDMS integrated code (i.e., darshanConnector) and LDMS are already installed, and a system LDMS streams daemon is running, then there are two ways to enable the LDMS functionality:
+
+1. Set the environment by sourcing the ``darshan_ldms.env`` script
+
+2. Load the Darshan-LDMS module via ``module load darshan_ldms``
+
+.. note::
+
+   Only when executing an application or submitting a job does the user need to load the ``darshan_ldms`` modulefile or source the ``darshan_ldms.env`` script. Compiling, building, or installing the application does not affect the darshanConnector, and vice versa.
+
+1. Set Environment
+///////////////////
+
+In order to enable the darshanConnector code on the system, just source the following env script:
+
+.. code-block:: RST
+
+   module use /projects/ovis/modules/<system>
+   source /projects/ovis/modules/<system>/darshan_ldms.env
+
+**OPTIONAL**: Add a "-v" when sourcing this file to enable verbose output:
+
+.. code-block:: RST
+
+   $ source /projects/ovis/modules/<system>/darshan_ldms.env -v
+
+This will output the JSON messages collected by LDMS to the terminal window.
+
+.. note::
+
+   The STDIO data will NOT be collected by LDMS. This is to prevent any recursive LDMS function calls.
+
+2. Load Module
+///////////////
+
+If you do not wish to set the environment using the env script from above, you can always load the ``darshan_ldms`` modulefile, as follows:
+
+.. code-block:: RST
+
+   module use /projects/ovis/modules/<system>
+   module load darshan_ldms
+
+**OPTIONAL**: If you decide to load the module, you will need to turn on verbose output by setting the following environment variable in your run script:
+
+.. code-block:: RST
+
+   export DARSHAN_LDMS_VERBOSE="true"
+
+Script Information
+///////////////////
+
+The darshan_ldms module and .env file set the following env variables to define where the Darshan install is located, the LDMS daemon connection, and what kind of file-level access data will be published and stored to DSOS (via LDMS streams).
+
+If you only want to collect a specific type of data, such as "MPIIO", then set only the ``DARSHAN_LDMS_ENABLE_MPIIO`` variable:
+
+.. code-block:: RST
+
+   export DARSHAN_LDMS_ENABLE_MPIIO=""
+
+If you want to collect all types of data, then set all ``DARSHAN_LDMS_ENABLE_*`` variables:
+
+.. code-block:: RST
+
+   export DARSHAN_LDMS_ENABLE_MPIIO=""
+   export DARSHAN_LDMS_ENABLE_POSIX=""
+   export DARSHAN_LDMS_ENABLE_HDF5=""
+
+.. note::
+
+   All Darshan binary log-files (i.e. .darshan) will be saved to ``$LOGFILE_PATH_DARSHAN``, as specified at build time and exported in the user environment.
+
+.. code-block:: RST
+
+   # Set variables for darshan install
+   export LD_PRELOAD=$LD_PRELOAD:$DARSHAN_INSTALL_PATH/lib/libdarshan.so
+   export PATH=$PATH:$DARSHAN_INSTALL_PATH/bin
+   export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$DARSHAN_INSTALL_PATH/lib
+   export LIBRARY_PATH=$LIBRARY_PATH:$DARSHAN_INSTALL_PATH/lib
+
+   export DARSHAN_RUNTIME_DIR=$DARSHAN_INSTALL_PATH
+   export DARSHAN_RUNTIME_BIN=$DARSHAN_INSTALL_PATH/bin
+   export DARSHAN_RUNTIME_LIB=$DARSHAN_INSTALL_PATH/lib
+   export HDF5_USE_FILE_LOCKING=1
+
+   # Set logfile path
+   export DARSHAN_TMP=/projects/ovis/darshanConnector/<user>/darshan/build/logs/
+   export LOGFILE_PATH_DARSHAN=$DARSHAN_TMP
+
+   # Connect to ldms daemon
+   export DARSHAN_LDMS_STREAM=darshanConnector
+   export DARSHAN_LDMS_PORT=412
+   export DARSHAN_LDMS_HOST=localhost
+   export DARSHAN_LDMS_XPRT=sock
+   export DARSHAN_LDMS_AUTH=munge
+
+   # Specify type of data to collect
+   export DARSHAN_LDMS_ENABLE=
+   export DARSHAN_LDMS_ENABLE_MPIIO=
+   export DARSHAN_LDMS_ENABLE_POSIX=
+   export DARSHAN_LDMS_ENABLE_STDIO=
+   export DARSHAN_LDMS_ENABLE_HDF5=
+   #export DARSHAN_LDMS_ENABLE_ALL=
+   #export DARSHAN_LDMS_VERBOSE=
+
+   # check if verbose is requested
+   if [ "$1" == "-v" ]; then
+       export DARSHAN_LDMS_VERBOSE=
+       echo "Verbose is set."
+   else
+       unset DARSHAN_LDMS_VERBOSE
+   fi
+
+
+Run application
+///////////////
+Once the module is loaded and the environment is set, you just need to run your application. All Darshan-related logs will automatically be saved in the directory specified in ``$LOGFILE_PATH_DARSHAN``.
+
+.. note::
+
+   If runtime errors or issues occur, then this is most likely due to incompatibility issues with the application build, or the Darshan-LDMS build that is using ``LD_PRELOAD``.
+   You may debug the issue as follows:
+
+   1. Unset the ``LD_PRELOAD`` environment variable (e.g., ``unset LD_PRELOAD``), then run the application with: ``mpiexec -env LD_PRELOAD $DARSHAN_INSTALL_PATH/lib/libdarshan.so`` or ``srun --export=LD_PRELOAD=$DARSHAN_INSTALL_PATH/lib/libdarshan.so``.
+      For more information please see section 5.2 in `Darshan's Runtime Installation Page `_.
+
+   2. If you are still running into runtime issues, please send an email to ldms@sandia.gov and provide:
+      a) the mpi-io, hdf5, pnetcdf, and compiler versions (if applicable) used to build your application
+      b) the contents of your environment variables: $PATH, $LIBRARY_PATH, $LD_LIBRARY_PATH and $LD_PRELOAD.
+
+
+Check Results
+-------------
+LDMS Output
+////////////
+This section provides the expected output of an application run with the data published to the LDMS streams daemon with a CSV storage plugin (see section `Run An LDMS Streams Daemon`_).
+
+* If you are publishing to a local streams daemon (compute or login nodes) to collect the Darshan data, then compare the generated ``csv`` file to the one shown below in this section.
+
+* If you are publishing to a system daemon that aggregates the data and stores it to a Scalable Object Store (SOS), please skip this section and go to the :doc:`SOS Quickstart Guide ` for more information about viewing and accessing data from this database.
+
+LDMS Log File
+/////////////
+* Once the application has completed, run ``cat /tmp/darshan_stream_store.log`` in the terminal window where the ldmsd is running (compute node). You should see output similar to the one below.
+
+.. code-block:: RST
+
+   cat /tmp/darshan_stream_store.log
+   Fri Feb 18 11:35:23 2022: INFO : stream_type: JSON, msg: "{ "job_id":53023,"rank":3,"ProducerName":"nid00052","file":"darshan-output/mpi-io-test.tmp.dat","record_id":1601543006480890062,"module":"POSIX","type":"MET","max_byte":-1,"switches":-1,"flushes":-1,"cnt":1,"op":"opens_segment","seg":[{"data_set":"N/A","pt_sel":-1,"irreg_hslab":-1,"reg_hslab":-1,"ndims":-1,"npoints":-1,"off":-1,"len":-1,"dur":0.00,"timestamp":1645209323.082951}]}", msg_len: 401, entity: 0x155544084aa0
+   Fri Feb 18 11:35:23 2022: INFO : stream_type: JSON, msg: "{ "job_id":53023,"rank":3,"ProducerName":"nid00052","file":"N/A","record_id":1601543006480890062,"module":"POSIX","type":"MOD","max_byte":-1,"switches":-1,"flushes":-1,"cnt":1,"op":"closes_segment","seg":[{"data_set":"N/A","pt_sel":-1,"irreg_hslab":-1,"reg_hslab":-1,"ndims":-1,"npoints":-1,"off":-1,"len":-1,"dur":0.00,"timestamp":1645209323.083581}]}", msg_len: 353, entity: 0x155544083f60
+   ...
+
+CSV File
+////////
+* To view the data stored in the generated CSV file from the streams store plugin, first kill the ldmsd daemon by running: ``killall ldmsd``
+* Then ``cat`` the generated CSV file. Below is the stored DXT module data from LDMS's stream_csv_store plugin for the ``mpi-io-test-dxt.sh`` test case.
+
code-block:: RST
+
+   #module,uid,ProducerName,switches,file,rank,flushes,record_id,exe,max_byte,type,job_id,op,cnt,seg:off,seg:pt_sel,seg:dur,seg:len,seg:ndims,seg:reg_hslab,seg:irreg_hslab,seg:data_set,seg:npoints,seg:timestamp,seg:total,seg:start
+   POSIX,99066,n9,-1,/lustre//darshan-ldms-output/mpi-io-test_lC.tmp.out,278,-1,9.22337E+18,/lustre//darshan-ldms-output/mpi-io-test,-1,MET,10697754,open,1,-1,-1,0.007415,-1,-1,-1,-1,N/A,-1,1662576527,0.007415,0.298313
+   MPIIO,99066,n9,-1,/lustre//darshan-ldms-output/mpi-io-test_lC.tmp.out,278,-1,9.22337E+18,/lustre//darshan-ldms-output/mpi-io-test,-1,MET,10697754,open,1,-1,-1,0.100397,-1,-1,-1,-1,N/A,-1,1662576527,0.100397,0.209427
+   POSIX,99066,n11,-1,/lustre//darshan-ldms-output/mpi-io-test_lC.tmp.out,339,-1,9.22337E+18,/lustre//darshan-ldms-output/mpi-io-test,-1,MET,10697754,open,1,-1,-1,0.00742,-1,-1,-1,-1,N/A,-1,1662576527,0.00742,0.297529
+   POSIX,99066,n6,-1,/lustre//darshan-ldms-output/mpi-io-test_lC.tmp.out,184,-1,9.22337E+18,/lustre//darshan-ldms-output/mpi-io-test,-1,MET,10697754,open,1,-1,-1,0.007375,-1,-1,-1,-1,N/A,-1,1662576527,0.007375,0.295111
+   POSIX,99066,n14,-1,/lustre//darshan-ldms-output/mpi-io-test_lC.tmp.out,437,-1,9.22337E+18,/lustre//darshan-ldms-output/mpi-io-test,-1,MET,10697754,open,1,-1,-1,0.007418,-1,-1,-1,-1,N/A,-1,1662576527,0.007418,0.296812
+   POSIX,99066,n7,-1,/lustre//darshan-ldms-output/mpi-io-test_lC.tmp.out,192,-1,9.22337E+18,/lustre//darshan-ldms-output/mpi-io-test,-1,MET,10697754,open,1,-1,-1,0.007435,-1,-1,-1,-1,N/A,-1,1662576527,0.007435,0.294776
+   MPIIO,99066,n7,-1,/lustre//darshan-ldms-output/mpi-io-test_lC.tmp.out,192,-1,9.22337E+18,/lustre//darshan-ldms-output/mpi-io-test,-1,MET,10697754,open,1,-1,-1,0.033042,-1,-1,-1,-1,N/A,-1,1662576527,0.033042,0.273251
+   ...
+
+Compare With Darshan Log File(s)
+////////////////////////////////
+Parse the Darshan binary file using Darshan's standard parser and, only if the ``DXT Module`` is enabled, the DXT parser.
+
+.. code-block:: RST
+
+   $DARSHAN_INSTALL_PATH/bin/darshan-parser --all $LOGFILE_PATH_DARSHAN/.darshan > $DARSHAN_TMP/${PROG}.darshan.txt
+   $DARSHAN_INSTALL_PATH/bin/darshan-dxt-parser --show-incomplete $LOGFILE_PATH_DARSHAN/.darshan > $DARSHAN_TMP/${PROG}-dxt.darshan.txt
+
+Now you can view the log(s) with ``cat $DARSHAN_TMP/${PROG}.darshan.txt`` or ``cat $DARSHAN_TMP/${PROG}-dxt.darshan.txt`` and compare them to the data collected by LDMS.
+
+The ``ProducerName``, file path and ``record_id`` of each job should match; if ``dxt`` was enabled, the individual I/O statistics of each rank (e.g., start time and number of I/O operations) should match as well.
+
+
+Kokkos
+***********************
+* Appropriate Kokkos function calls must be included in the application code. Add the environment variables shown below to your run script to push Kokkos data from the application to a stream for collection.
+
+**What Is Kokkos?**
+
+Kokkos is a C++ parallel programming ecosystem for performance portability across multi-core, many-core, and GPU node architectures. It provides abstractions for the parallel execution of code and for data management.
+
+Setup and Configuration
+------------------------
+**The KokkosConnector**
+
+The kokkosConnector is a Kokkos-LDMS component that utilizes LDMS Streams to collect Kokkos-related data during runtime. The Kokkos sampler, provided by the Kokkos-tools library, controls the sampling rate and provides the option to sample data using a count-based push. It then formats the data into a JSON message and *publishes* it to an LDMS streams interface.
+
+..
warning::
+   To use the kokkosConnector, all users will need to install Kokkos-Tools. You can find their repository and instructions on installing it here: https://github.com/kokkos/kokkos-tools
+
+
+The following environment variables are needed in an application's runscript to run the kokkos-sampler and LDMS's kokkosConnector:
+
+.. code-block:: RST
+
+   export KOKKOS_LDMS_HOST="localhost"
+   export KOKKOS_LDMS_PORT="412"
+   export KOKKOS_PROFILE_LIBRARY="/kokkos-tools/common/kokkos_sampler/kp_sampler.so;/ovis/kokkosConnector/kp_kernel_ldms.so"
+   export KOKKOS_SAMPLER_RATE=101
+   export KOKKOS_LDMS_VERBOSE=0
+   export KOKKOS_LDMS_AUTH="munge"
+   export KOKKOS_LDMS_XPRT="sock"
+
+* The KOKKOS_SAMPLER_RATE variable determines the rate at which messages are pushed to streams and collected. Please note that it is best practice to set this to a prime number to avoid repeatedly sampling the same kernels.
+* The KOKKOS_LDMS_VERBOSE variable can be set to 1 for debugging, which prints all collected kernel data to the console.
+
+How To Make A Data Connector
+*****************************
+In order to create a data connector with LDMS to collect runtime timeseries application data, you will need to utilize LDMS's Streams Functionality. This section provides the necessary functions and Streams API required to make a data connector.
+
+The example code below is pulled from the Darshan-LDMS Integration code.
+
+.. note::
+
+   The LDMS Streams functionality uses a push-based method to reduce the memory consumed and the data lost on the node.
+
+Include the following LDMS files
+---------------------------------------
+* First, the following libraries will need to be included in the program, as they contain all the functions that the data connector will be using/calling:
+
+.. code-block:: RST
+
+   #include
+   #include
+   #include
+
+Initialize All Necessary Variables
+-----------------------------------
+
+* Next, the following variables will need to be initialized globally or be accessible by the Streams API functions described in the next section:
+
+.. code-block:: RST
+
+   #define SLURM_NOTIFY_TIMEOUT 5
+   ldms_t ldms_g;
+   pthread_mutex_t ln_lock;
+   int conn_status, to;
+   ldms_t ldms_darsh;
+   sem_t conn_sem;
+   sem_t recv_sem;
+
+
+Copy "Hello Sampler" Streams API Functions
+------------------------------------------
+Next, copy the ``ldms_t setup_connection`` and ``static void event_cb`` functions listed below. These functions originated from the `ldmsd_stream_subscribe.c `_ code.
+
+``setup_connection`` contains the LDMS API calls that connect to the LDMS daemon, and ``static void event_cb`` is a callback function that checks the connection status of the LDMS daemon.
+
+..
code-block:: RST
+
+   static void event_cb(ldms_t x, ldms_xprt_event_t e, void *cb_arg)
+   {
+       switch (e->type) {
+       case LDMS_XPRT_EVENT_CONNECTED:
+           sem_post(&conn_sem);
+           conn_status = 0;
+           break;
+       case LDMS_XPRT_EVENT_REJECTED:
+           ldms_xprt_put(x);
+           conn_status = ECONNREFUSED;
+           break;
+       case LDMS_XPRT_EVENT_DISCONNECTED:
+           ldms_xprt_put(x);
+           conn_status = ENOTCONN;
+           break;
+       case LDMS_XPRT_EVENT_ERROR:
+           conn_status = ECONNREFUSED;
+           break;
+       case LDMS_XPRT_EVENT_RECV:
+           sem_post(&recv_sem);
+           break;
+       case LDMS_XPRT_EVENT_SEND_COMPLETE:
+           break;
+       default:
+           printf("Received invalid event type %d\n", e->type);
+       }
+   }
+
+   ldms_t setup_connection(const char *xprt, const char *host,
+                           const char *port, const char *auth)
+   {
+       char hostname[PATH_MAX];
+       const char *timeout = "5";
+       int rc;
+       struct timespec ts;
+
+       if (!host) {
+           if (0 == gethostname(hostname, sizeof(hostname)))
+               host = hostname;
+       }
+       if (!timeout) {
+           ts.tv_sec = time(NULL) + 5;
+           ts.tv_nsec = 0;
+       } else {
+           int to = atoi(timeout);
+           if (to <= 0)
+               to = 5;
+           ts.tv_sec = time(NULL) + to;
+           ts.tv_nsec = 0;
+       }
+
+       ldms_g = ldms_xprt_new_with_auth(xprt, auth, NULL);
+       if (!ldms_g) {
+           printf("Error %d creating the '%s' transport\n",
+                  errno, xprt);
+           return NULL;
+       }
+
+       sem_init(&recv_sem, 1, 0);
+       sem_init(&conn_sem, 1, 0);
+
+       rc = ldms_xprt_connect_by_name(ldms_g, host, port, event_cb, NULL);
+       if (rc) {
+           printf("Error %d connecting to %s:%s\n",
+                  rc, host, port);
+           return NULL;
+       }
+       sem_timedwait(&conn_sem, &ts);
+       if (conn_status)
+           return NULL;
+       return ldms_g;
+   }
+
+Initialize and Connect to LDMSD
+------------------------------------------
+Once the above functions have been copied, ``setup_connection`` will need to be called in order to establish a connection to an LDMS Streams Daemon.
+
+.. note::
+
+   The LDMS Daemon is configured with the `Streams Plugin `_ and should already be running on the node. The host is set to the node the daemon is running on, and the port is set to the port the daemon is listening on. Below you will find an example from the Darshan Connector for reference.
+
+.. code-block:: RST
+
+   void darshan_ldms_connector_initialize()
+   {
+       const char* env_ldms_stream = getenv("DARSHAN_LDMS_STREAM");
+       const char* env_ldms_xprt = getenv("DARSHAN_LDMS_XPRT");
+       const char* env_ldms_host = getenv("DARSHAN_LDMS_HOST");
+       const char* env_ldms_port = getenv("DARSHAN_LDMS_PORT");
+       const char* env_ldms_auth = getenv("DARSHAN_LDMS_AUTH");
+
+       /* Check that the stream name, transport, host, port and
+        * authentication are all given. */
+       if (!env_ldms_xprt || !env_ldms_host || !env_ldms_port || !env_ldms_auth || !env_ldms_stream){
+           printf("Either the stream, transport, host, port or authentication is not given\n");
+           return;
+       }
+
+       pthread_mutex_lock(&ln_lock);
+       ldms_darsh = setup_connection(env_ldms_xprt, env_ldms_host, env_ldms_port, env_ldms_auth);
+       if (conn_status != 0) {
+           printf("Error setting up connection to LDMS streams daemon: %i -- exiting\n", conn_status);
+           pthread_mutex_unlock(&ln_lock);
+           return;
+       }
+       else if (ldms_darsh->disconnected){
+           printf("Disconnected from LDMS streams daemon -- exiting\n");
+           pthread_mutex_unlock(&ln_lock);
+           return;
+       }
+       pthread_mutex_unlock(&ln_lock);
+       return;
+   }
+
+The environment variables ``DARSHAN_LDMS_*`` are used to define the stream name (configured in the daemon), transport type (sock, ugni, etc.), host, port and authentication of the LDMSD.
In this specific example, the stream name is set to "darshanConnector", so the environment variable ``DARSHAN_LDMS_STREAM`` is exported as follows: ``export DARSHAN_LDMS_STREAM=darshanConnector``
+
+.. note::
+   The environment variables are not required. The stream, transport, host, port and authentication can be initialized and set within the code.
+
+.. note::
+   If you run into the following error: ``error:unknown type name 'sem_t'`` then you will need to add the following libraries to your code:
+
+   * ``#include``
+   * ``#include``
+
+Publish Event Data to LDMSD
+-------------------------------------
+Now we will create a function that collects all relevant application events and publishes them to the LDMS Streams Daemon. In the Darshan-LDMS Integration, Darshan's I/O traces are collected for each I/O event (i.e. open, close, read, write), along with the absolute timestamp of each event (for timeseries data):
+
+.. code-block:: RST
+
+   void darshan_ldms_connector_send(int64_t record_count, char *rwo, int64_t offset, int64_t length, int64_t max_byte, int64_t rw_switch, int64_t flushes, double start_time, double end_time, struct timespec tspec_start, struct timespec tspec_end, double total_time, char *mod_name, char *data_type)
+   {
+       char jb11[1024];
+       int rc, ret, i, size, exists;
+       env_ldms_stream = getenv("DARSHAN_LDMS_STREAM");
+
+       pthread_mutex_lock(&ln_lock);
+       if (ldms_darsh != NULL)
+           exists = 1;
+       else
+           exists = 0;
+       pthread_mutex_unlock(&ln_lock);
+
+       if (!exists){
+           return;
+       }
+
+       sprintf(jb11,"{ \"uid\":%ld, \"exe\":\"%s\",\"job_id\":%ld,\"rank\":%ld,\"ProducerName\":\"%s\",\"file\":\"%s\",\"record_id\":%"PRIu64",\"module\":\"%s\",\"type\":\"%s\",\"max_byte\":%ld,\"switches\":%ld,\"flushes\":%ld,\"cnt\":%ld,\"op\":\"%s\",\"seg\":[{\"data_set\":\"%s\",\"pt_sel\":%ld,\"irreg_hslab\":%ld,\"reg_hslab\":%ld,\"ndims\":%ld,\"npoints\":%ld,\"off\":%ld,\"len\":%ld,\"start\":%0.6f,\"dur\":%0.6f,\"total\":%.6f,\"timestamp\":%lu.%.6lu}]}", dC.uid, dC.exename, dC.jobid, dC.rank, dC.hname, dC.filename, dC.record_id, mod_name, data_type, max_byte, rw_switch, flushes, record_count, rwo, dC.data_set, dC.hdf5_data[0], dC.hdf5_data[1], dC.hdf5_data[2], dC.hdf5_data[3], dC.hdf5_data[4], offset, length, start_time, end_time-start_time, total_time, tspec_end.tv_sec, micro_s);
+
+       rc = ldmsd_stream_publish(ldms_darsh, env_ldms_stream, LDMSD_STREAM_JSON, jb11, strlen(jb11) + 1);
+       if (rc)
+           printf("Error %d publishing data.\n", rc);
+
+       out_1:
+           return;
+   }
+
+.. note::
+
+   For more information about the various Darshan I/O traces and metrics collected, please visit the `Darshan's Runtime Installation Page `_ and `Darshan LDMS Metrics Collected `_ pages.
+
+Once this function is called, it initializes a connection to the LDMS Streams Daemon (attempting reconnection if the connection is not established), formats the given arguments/variables into a JSON message and finally publishes it to the LDMS Streams Daemon.
+
+There are various formats that can be used to publish the data (i.e. JSON, string, etc.), so please review the `Defining A Format`_ section for more information.
+
+Collect Event Data
+/////////////////////////
+
+To collect the application data in real time (using the example given in this section), ``void darshan_ldms_connector_send(arg1, arg2, arg3, ...)`` is placed in all sections of the code where we want to publish a message. From the Darshan-LDMS Integration code we would have:
+
+..
code-block:: RST
+
+   darshan_ldms_connector_send(rec_ref->file_rec->counters[MPIIO_COLL_OPENS] + rec_ref->file_rec->counters[MPIIO_INDEP_OPENS], "open", -1, -1, -1, -1, -1, __tm1, __tm2, __ts1, __ts2, rec_ref->file_rec->fcounters[MPIIO_F_META_TIME], "MPIIO", "MET");
+
+This line of code is placed within multiple macros (`MPIIO_RECORD_OPEN/READ/WRITE `_) in Darshan's MPIIO module.
+
+* Doing this will call the function every time Darshan detects an I/O event from the application (i.e. read, write, open, close). Once called, the arguments will be passed to the function, added to the JSON-formatted message and pushed to the LDMS daemon.
+
+.. note::
+
+   For more information about how to store the published data from an LDMS Streams Daemon, please see the Stream CSV Store plugin man pages on a system where the LDMS docs are installed: ``man Plugin_stream_csv_store``
diff --git a/rtd/docs/source/ldms-tutorial.rst b/rtd/docs/source/ldms-tutorial.rst
new file mode 100644
index 000000000..ce4b26df7
--- /dev/null
+++ b/rtd/docs/source/ldms-tutorial.rst
@@ -0,0 +1,4 @@
+Additional LDMS Tutorial Material
+=================================
+* `Tutorial Videos `_
+* `Tutorial Slides `_
diff --git a/rtd/docs/source/ldms_man/Plugin_cray_dvs_sampler.rst b/rtd/docs/source/ldms_man/Plugin_cray_dvs_sampler.rst
new file mode 100644
index 000000000..7788bea20
--- /dev/null
+++ b/rtd/docs/source/ldms_man/Plugin_cray_dvs_sampler.rst
@@ -0,0 +1,108 @@
+=======================
+Plugin_cray_dvs_sampler
+=======================
+
+:Date: 05 Feb 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+========================
+
+Plugin_cray_dvs_sampler - man page for the LDMS cray_dvs_sampler plugin
+
+SYNOPSIS
+============================
+
+| Within ldmsd_controller or a configuration file:
+| config name=cray_dvs_sampler [ <attr>=<value> ]
+
+DESCRIPTION
+===============================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file.
+
+The cray_dvs_sampler plugin provides data from
+/proc/fs/dvs/mount/[mount-id]/stats. A separate metric set is produced
+for each mount point. Metric set names are of the form \`XXX'.
+
+See section \`DATA AND THE CONFIGURATION FILE' for information on the
+variables and configuration file.
+
+This sampler is for Cray systems only.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+==================================================
+
+The cray_dvs_sampler plugin uses the sampler_base base class. This man
+page covers only the configuration attributes, or those with default
+values, specific to this plugin; see ldms_sampler_base.man for the
+attributes of the base class.
+
+**config**
+   | name=<plugin_name> [schema=<sname> conffile=<cpath>]
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be cray_dvs_sampler
+
+   schema=<sname>
+      |
+      | Optional schema name. It is intended that the same sampler on
+        different nodes with different metrics have a different schema.
+        If not specified, will default to \`cray_dvs_sampler\`.
+
+   conffile=<cpath>
+      |
+      | Optional path to the configuration file
+
+DATA AND THE CONFIGURATION FILE
+===================================================
+
+| The data source is /proc/fs/dvs/mount/[mount-id]/stats. This file
+  consists of a number of lines of the format
+| variablename: v1 v2 ... vN
+
+The number of values varies between 1 and 6. Each line will then produce
+between 1 and 6 metrics, named by appending to the variablename an
+additional string associated with the interpretation of that value
+(e.g., min, err).
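+
+As a purely hypothetical illustration (actual variable names and value
+counts depend on the DVS version), a stats line with two values such as
+
+::
+
+   read_ops: 1024 3
+
+would produce two metrics whose names are the variablename plus the
+appended interpretation strings, e.g. read_ops_cnt and read_ops_err.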
+
+By default, this sampler will collect all the variables for all mount
+points. The number of metrics can be downselected by using a
+configuration file (see the conffile argument). The format of this file
+is one variablename per line; comments start with '#' and blank lines
+are skipped. Note that the variablename from the data line is what is
+specified in the configuration file, not the metric names associated
+with that variablename in the data source file. As a result, all
+metrics associated with a given line in the dvs stats source are
+included or excluded together.
+
+NOTES
+=========================
+
+- In the config, the sampler is called cray_dvs_sampler. Also the
+  library is called libcray_dvs_sampler. However, the source file is
+  dvs_sampler.c
+
+- This sampler is for Cray systems only.
+
+BUGS
+========================
+
+None known.
+
+EXAMPLES
+============================
+
+TBD
+
+SEE ALSO
+============================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)
diff --git a/rtd/docs/source/ldms_man/Plugin_jobid.rst b/rtd/docs/source/ldms_man/Plugin_jobid.rst
new file mode 100644
index 000000000..73cb07581
--- /dev/null
+++ b/rtd/docs/source/ldms_man/Plugin_jobid.rst
@@ -0,0 +1,125 @@
+============
+Plugin_jobid
+============
+
+:Date: 03 Dec 2016
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+=============
+
+Plugin_jobid - man page for the LDMS jobid plugin
+
+SYNOPSIS
+=================
+
+| Within ldmsd_controller or in a configuration file
+| config name=jobid [ <attr>=<value> ]
+
+DESCRIPTION
+====================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The jobid plugin provides jobid info from
+/var/run/ldms.jobinfo or similar files replaced periodically by resource
+managers. When files are missing, the value 0 or equivalent is reported.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+=======================================
+
+**config**
+   | name=<plugin_name> producer=<pname> instance=<iname>
+     [component_id=<compid> schema=<sname>] [with_jobid=<bool>]
+     file=<path>
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be jobid.
+
+   producer=<pname>
+      |
+      | The producer name value.
+
+   instance=<iname>
+      |
+      | The name of the metric set.
+
+   schema=<sname>
+      |
+      | Optional schema name. It is intended that the same sampler on
+        different nodes with different metrics have a different schema.
+        If not specified, will default to \`jobid\`.
+
+   component_id=<compid>
+      |
+      | Optional component identifier. Defaults to zero.
+
+   with_jobid=<bool>
+      |
+      | Option to look up the job_id for each sample (nonzero) or not
+        (0). The job_id column will always appear, but may be populated
+        with zero.
+
+BUGS
+=============
+
+No known implementation bugs. Design features you may not like: the
+plugin relies on site-specific resource manager configuration to produce
+the file it reads; it does not query local or remote resource manager
+daemons; and it may be slow to sample and may generate undesirable
+filesystem events if the file path is on a networked filesystem instead
+of a node-local RAM partition, as is usual in clusters.
+
+NOTES
+==============
+
+The colname option from the LDMS v2 slurmjobid plugin is no longer
+supported. The sampler offset for the jobid plugin should be slightly
+less than that of all other plugins to ensure consistency in the job
+information reported for a given time interval across all other plugins.
+The time interval for the jobid plugin need only be approximately the
+clock granularity of the resource manager.
+
+Other samplers use the jobid plugin as the jobid data source.
If the
+jobid sampler is not loaded, these samplers will report 0 jobid values.
+
+EXAMPLES
+=================
+
+::
+
+   Within ldmsd_controller or in a configuration file
+   load name=jobid
+   config name=jobid component_id=1 producer=vm1_1 instance=vm1_1/jobid
+   start name=jobid interval=1000000 offset=-100000
+
+
+   Within ldmsd_controller or in a configuration file
+   load name=jobid
+   config name=jobid component_id=1 producer=vm1_1 instance=vm1_1/jobid file=/var/run/rman/node/jobinfo
+   start name=jobid interval=1000000 offset=-100000
+
+Slurm 2.x installations can populate /var/run/ldms.jobinfo by adding the
+following lines to slurm.epilog and slurm.prolog, respectively.
+
+::
+
+
+   echo "JOBID=0" > /var/run/ldms.jobinfo
+
+   and
+
+   echo JOBID=$SLURM_JOBID > /var/run/ldms.jobinfo
+   echo UID=$SLURM_UID >> /var/run/ldms.jobinfo
+   echo USER=$SLURM_JOB_USER >> /var/run/ldms.jobinfo
+
+These slurm files might be found in /etc/nodestate/bin/.
+
+SEE ALSO
+=================
+
+ldms(7), ldmsd(8), ldmsd_controller(8)
diff --git a/rtd/docs/source/ldms_man/Plugin_lustre2_client.rst b/rtd/docs/source/ldms_man/Plugin_lustre2_client.rst
new file mode 100644
index 000000000..36707f4d0
--- /dev/null
+++ b/rtd/docs/source/ldms_man/Plugin_lustre2_client.rst
@@ -0,0 +1,100 @@
+=====================
+Plugin_lustre2_client
+=====================
+
+:Date: 26 Oct 2017
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+======================
+
+Plugin_lustre2_client - man page for the LDMS lustre2_client plugin
+
+SYNOPSIS
+==========================
+
+| Within ldmsd_controller or a configuration file:
+| ldmsctl> config name=lustre2_client [ <attr>=<value> ]
+
+DESCRIPTION
+=============================
+
+The lustre2_client plugin provides Lustre metric information.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+================================================
+
+This plugin uses the sampler_base base class. This man page covers only
+the configuration attributes, or those with default values, specific to
+this plugin; see **ldms_sampler_base**\ (7) for the attributes of the
+base class.
+
+**config** **name**\ =\ *lustre2_client* **osc**\ =\ *CSV* **mdc**\ =\ *CSV* **llite**\ =\ *CSV* **osc_path**\ =\ *PATH* **mdc_path**\ =\ *PATH* **llite_path**\ =\ *PATH*
+
+Descriptions:
+
+   **name**\ =\ *lustre2_client*
+      This MUST be lustre2_client.
+
+   **<sampler_base options>**
+      Please see **ldms_sampler_base**\ (7) for sampler_base options.
+
+   **osc**\ =\ *CSV*
+      CSV list of OSC's.
+
+   **mdc**\ =\ *CSV*
+      CSV list of MDC's.
+
+   **llite**\ =\ *CSV*
+      CSV list of LLITE's.
+
+   **osc_path**\ =\ *PATH*
+      A user custom path to osc.
+
+   **mdc_path**\ =\ *PATH*
+      A user custom path to mdc.
+
+   **llite_path**\ =\ *PATH*
+      A user custom path to llite.
+
+NOTES
+=======================
+
+For oscs, mdcs and llites: if not specified, NONE of the oscs/mdcs/llites
+will be added. If {oscs,mdcs,llites} is set to \*, all of the available
+{oscs,mdcs,llites} at the time will be added.
+
+The names that make up the list of oscs, mdcs and llites do not have to
+include the uid part. For example, 'lustre-ffff8803245d4000' is the
+actual file in /proc/fs/lustre/llite/, but you can just say
+llites=lustre to include this component into the set.
+
+osc_path, mdc_path and llite_path are optional full path names of stats
+files if not in the default location. The default locations are
+/sys/kernel/debug/lustre/{osc, mdc, llite} and /proc/fs/lustre/{osc,
+mdc, llite}, depending on the Lustre version. Be aware that
+/sys/kernel/debug is only readable by privileged users.
+
+BUGS
+======================
+
+None known.
+ +EXAMPLES +========================== + +:: + + load name=lustre2_client + config name=lustre2_client producer=compute1 component_id=1 instance=compute1/lustre2_client llites=* + ldmsctl> start name=lustre2_client interval=1000000 + ldmsctl> quit + +SEE ALSO +========================== + +**ldms_sampler_base**\ (7), **ldmsd**\ (8), **ldms_quickstart**\ (7), +**ldmsd_controller**\ (8) diff --git a/rtd/docs/source/ldms_man/Plugin_papi.rst b/rtd/docs/source/ldms_man/Plugin_papi.rst new file mode 100644 index 000000000..d4e5cfa54 --- /dev/null +++ b/rtd/docs/source/ldms_man/Plugin_papi.rst @@ -0,0 +1,112 @@ +=========== +Plugin_papi +=========== + +:Date: 09 May 2016 + +.. contents:: + :depth: 3 +.. + +NAME +============ + +Plugin_papi - man page for the LDMS papi sampler plugin. + +SYNOPSIS +================ + +| Within ldmsctl +| ldmsctl> config name=spapi [ = ] + +DESCRIPTION +=================== + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsctl. The papi sampler plugin +runs on the nodes and provides data about the the occurrence of +micro-architectural events using papi library by accessing hardware +performance counters. + +ENVIRONMENT +=================== + +You will need to build LDMS with --enable-papi. Papi library should be +available through plugin library path. + +LDMSCTL CONFIGURATION ATTRIBUTE SYNTAX +============================================== + +**config** + name= events= + pid= producer= instance= + [schema=] [component_id= with_jobid=] ldmsctl + configuration line + +name= + | + | This MUST be spapi. + +producer= + | + | The producer string value. + +instance= + | + | The name of the metric set + +schema= + | + | Optional schema name. It is intended that the same sampler on + different nodes with different metrics have a different schema. + +component_id= + | + | Optional component identifier. Defaults to zero. + +with_jobid= + | + | Option to collect job id with set or 0 if not. + +events= + | + | Comma separated list of events. Available events can be determined + using papi_avail command if papi is installed on system. + +pid - The PID for the process being monitored + | + +NOTES +============= + +In order to check if an event is available on the system you can run +papi_avail. + +BUGS +============ + +No known bugs. + +EXAMPLES +================ + +The following is a short example that measures 4 events. + | + | Total CPU cycles + | Total CPU instructions + | Total branch instructions + | Mispredicted branch instructions + +$ldmsctl -S $LDMSD_SOCKPATH + +| ldmsctl> load name=spapi +| ldmsctl> config name=spapi producer=$PRODUCER_NAME + instance=$INSTANCE_NAME pid=$PID + events=PAPI_TOT_INS,PAPI_TOT_CYC,PAPI_BR_INS,PAPI_BR_MSP +| ldmsctl> start name=spapi interval=$INTERVAL_VALUE +| ldmsctl> quit + +SEE ALSO +================ + +papi_avail(1) , ldmsd(7), ldms_quickstart(7) diff --git a/rtd/docs/source/ldms_man/Plugin_rapl.rst b/rtd/docs/source/ldms_man/Plugin_rapl.rst new file mode 100644 index 000000000..8705cc98d --- /dev/null +++ b/rtd/docs/source/ldms_man/Plugin_rapl.rst @@ -0,0 +1,80 @@ +=========== +Plugin_rapl +=========== + +:Date: 18 Feb 2018 + +.. contents:: + :depth: 3 +.. 
+
+NAME
+============
+
+Plugin_rapl - man page for the LDMS rapl plugin
+
+SYNOPSIS
+================
+
+| Within ldmsd_controller or a configuration file:
+| config name=rapl [ <attr>=<value> ]
+
+DESCRIPTION
+===================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The rapl plugin provides energy sampling using RAPL
+via the PAPI interface for Sandy Bridge.
+
+WARNING: This sampler is unsupported.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+======================================
+
+The rapl plugin uses the sampler_base base class. This man page covers
+only the configuration attributes, or those with default values,
+specific to this plugin; see ldms_sampler_base.man for the
+attributes of the base class.
+
+**config**
+   | name=<plugin_name> [schema=<sname>]
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be rapl.
+
+   schema=<sname>
+      |
+      | Optional schema name. It is intended that the same sampler on
+        different nodes with different metrics have a different schema.
+        If not specified, will default to \`rapl\`.
+
+BUGS
+============
+
+No known bugs.
+
+NOTES
+=============
+
+- WARNING: This is for Sandy Bridge only.
+
+- This sampler is unsupported.
+
+EXAMPLES
+================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=rapl
+   config name=rapl producer=vm1_1 instance=vm1_1/rapl component_id=1
+   start name=rapl interval=1000000
+
+SEE ALSO
+================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)
diff --git a/rtd/docs/source/ldms_man/Plugin_shm_sampler.rst b/rtd/docs/source/ldms_man/Plugin_shm_sampler.rst
new file mode 100644
index 000000000..df34722ea
--- /dev/null
+++ b/rtd/docs/source/ldms_man/Plugin_shm_sampler.rst
@@ -0,0 +1,71 @@
+==================
+Plugin_shm_sampler
+==================
+
+:Date: 5 March 2018
+
+.. contents::
+   :depth: 3
+..
+
+This is a sampler plug-in module within the LDMS that can read from a
+dynamic number of shm files.
+
+| Within ldmsd_controller or a configuration file:
+| load name=shm_sampler
+| config name=shm_sampler [ <attr>=<value> ]
+
+shm_sampler is a sampler plug-in module within the LDMS. This sampler
+can read from a dynamic number of shm files. These files are tracked by
+a central index file in shared memory. The main usage of this sampler is
+to stream application performance data.
+
+| Configuration options:
+| producer=<name> instance=<name>
+  [shm_index=<name>][shm_boxmax=<int>][shm_array_max=<int>][shm_metric_max=<int>]
+  [shm_set_timeout=<sec>][component_id=<int>] [schema=<name>]
+  [job_set=<name> job_id=<name> app_id=<name> job_start=<name>
+  job_end=<name>]
+
+producer=<name>
+   A unique name for the host providing the data
+
+instance=<name>
+   A unique name for the metric set
+
+shm_index=<name>
+   A unique name for the shared memory index file
+
+shm_boxmax=<int>
+   Maximum number of entries in the shared memory index file
+
+shm_array_max=<int>
+   Maximum number of elements in array metrics
+
+shm_metric_max=<int>
+   Maximum number of metrics
+
+shm_set_timeout=<sec>
+   No read/write timeout in seconds
+
+component_id=<int>
+   A unique number for the component being monitored; defaults to zero.
+
+schema=<name>
+   The name of the metric set schema; defaults to the sampler name
+
+job_set=<name>
+   The instance name of the set containing the job data; default is
+   'job_info'
+
+job_id=<name>
+   The name of the metric containing the Job Id; default is 'job_id'
+
+app_id=<name>
+   The name of the metric containing the Application Id; default is
+   'app_id'
+
+job_start=<name>
+   The name of the metric containing the Job start time; default is
+   'job_start'
+
+job_end=<name>
+   The name of the metric containing the Job end time; default is
+   'job_end'
+
+None known.
+ +Within ldmsd_controller or a configuration file: + +:: + + load name=shm_sampler + config name=shm_sampler producer=samplerd instance=samplerd/shm_sampler shm_index=/ldms_shm_mpi_index shm_boxmax=4 component_id=23 + start name=shm_sampler interval=1000000 offset=0 + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8) diff --git a/rtd/docs/source/ldms_man/Plugin_store_app.rst b/rtd/docs/source/ldms_man/Plugin_store_app.rst new file mode 100644 index 000000000..237233afa --- /dev/null +++ b/rtd/docs/source/ldms_man/Plugin_store_app.rst @@ -0,0 +1,119 @@ +================ +Plugin_store_app +================ + +:Date: 30 Sep 2019 + +.. contents:: + :depth: 3 +.. + +NAME +================== + +ldmsd_store_app - LDMSD store_app storage plugin + +SYNOPSIS +====================== + +**load** **name**\ =\ **store_app** + +**config** **name**\ =\ **store_app** **path**\ =\ *STORE_ROOT_PATH* [ +**perm\ =\ OCTAL_PERM** ] + +**strgp_add** **name**\ =\ *STRGP_NAME* **plugin**\ =\ **store_app** +**container**\ =\ *CONTAINER_NAME* **schema**\ =\ *LDMS_SCHEMA* + +**strgp_prdcr_add** **name**\ =\ *STRGP_NAME* +**regex**\ =\ *PRDCR_REGEX* + +DESCRIPTION +========================= + +**``store_app``** is an LDMSD storage plugin for storing data from the +sets from **``app_sampler``** LDMSD sampler plugin. **``store_app``** +uses **``SOS``** as its database back-end. The **``path``** option +points to the directory containing **``SOS``** containers for this +plugin (one container per **``strgp``**). If the container does not +exist, it will be created with permission given by **``perm``** option +(default: 0660). The container contains multiple schemas, each of which +assoicates with a metric from the sets from **``app_sampler``** (e.g. +**``stat_utime``**). Schemas in the container have the following +attributes: + +- **``timestamp``** : the data sampling timestamp. + +- **``component_id``**: the component ID producing the data. + +- **``job_id``**: the Slurm job ID. + +- **``app_id``**: the application ID. + +- **``rank``**: the Slurm task rank. + +- **METRIC_NAME**: the metric value (the name of this attribute is the + metric name of the metric). + +- **``comp_time``**: (indexed) the join of **``component_id``** and + **``timestamp``**. + +- **``time_job``**: (indexed) the join of **``timestamp``** and + **``job_id``**. + +- **``job_rank_time``**: (indexed) the join of **``job_id``**, + **``rank``**, and **``timestamp``**. + +- **``job_time_rank``**: (indexed) the join of **``job_id``**, + **``timestamp``**, and **``rank``**. + +CONFIG OPTIONS +============================ + +name + The name of the plugin instance to configure. + +path + The path to the directory that contains SOS containers (one container + per strgp). + +perm + The octal mode (e.g. 0777) that is used in SOS container creation. + The default is **0660**. + +EXAMPLES +====================== + + :: + + # in ldmsd config file + load name=store_app + config name=store_app path=/sos perm=0600 + strgp_add name=app_strgp plugin=mstore_app container=app schema=app_sampler + # NOTE: the schema in strgp is LDMS set schema, not to confuse with the one + # schema per metric in our SOS container. 
+ strgp_prdcr_add name=app_strgp regex=.* + strgp_start name=app_strgp + +The following is an example on how to retrieve the data using Python: + + :: + + from sosdb import Sos + cont = Sos.Container() + cont.open('/sos/app') + sch = cont.schema_by_name('status_vmsize') + attr = sch.attr_by_name('time_job') # attr to iterate over must be indexed + itr = attr.attr_iter() + b = itr.begin() + while b == True: + obj = itr.item() + print(obj['status_vmsize']) # object attribute access by name + print(obj[5]) # equivalent to above + print(obj[:]) # get everything at once + b = itr.next() + +SEE ALSO +===================== + +**Plugin_app_sampler**\ (7), **ldmsd**\ (8), **ldms_quickstart**\ (7), +**ldmsd_controller**\ (8), diff --git a/rtd/docs/source/ldms_man/Plugin_store_csv.rst b/rtd/docs/source/ldms_man/Plugin_store_csv.rst new file mode 100644 index 000000000..a0aad9e78 --- /dev/null +++ b/rtd/docs/source/ldms_man/Plugin_store_csv.rst @@ -0,0 +1,487 @@ +================ +Plugin_store_csv +================ + +:Date: 26 Nov 2018 + +.. contents:: + :depth: 3 +.. + +NAME +================= + +Plugin_store_csv - man page for the LDMS store_csv plugin + +SYNOPSIS +===================== + +| Within ldmsd_controller script or a configuration file: +| load name=store_csv +| config name=store_csv [ = ] +| config name=store_csv [opt_file=filename] [ = ] +| config name=store_csv [container=c schema=s] [ = ] +| strgp_add name= plugin=store_csv container= schema= + [decomposition=] + +DESCRIPTION +======================== + +With LDMS (Lightweight Distributed Metric Service), store plugins for +the ldmsd (ldms daemon) are configured via the ldmsd_controller or a +configuration file. The store_csv plugin is a CSV store. + +STORE_CSV CONFIGURATION SOURCES +============================================ + +Default configuration options can be defined on the config line or in +the store_csv line of the options file. Options for the specific +instance matching 'container=c schema=s" can be given in the file +indicated by opt_file=filename when configuring the defaults (see +section OPTIONS FILE below) or can be scripted. + +The configuration parameters rolltype, rollover, and rollagain are +applied to all metric sets alike from the values given on the command +line or in the "store_csv" line of the options file. All other options +can be specified per instance. + +The config defaults (a config line without container or schema defined) +can be specified once in scripting or the opt_file. They are used for +any container/schema pair not explicitly configured. + +The config values for a specific container/schema pair can be specified +once in scripting or in the opt_file. Any attribute not specifically +defined will take on the value configured in the default config line or +opt_file. + +STORE_CSV CONFIGURATION ATTRIBUTE SYNTAX +===================================================== + +**config** + | name= path= [ altheader=<0/!0> + typeheader= time_format=<0/1> ietfcsv=<0/1> + buffer=<0/1/N> buffertype=<3/4> rolltype= + rollover= rollempty=<0/1> userdata=<0/!0>] + [rename_template= [rename_uid= + [rename_gid=]] + [create_uid=] [create_gid=] [opt_file=filename] [ietfcsv=<0/1>] + [typeheader=<0/1/2>] [array_expand=] + [array_sep=] [array_lquote=] + [array_rquote=] + | ldmsd_controller configuration line + + name= + | + | This MUST be store_csv. + + opt_file= + | + | The options for the plugin and specific instances will be read + from the named file. See OPTIONS FILE. 
+ + path= + | + | This option is required; the config line or the options file + must supply a default value. The output files will be put into a + directory whose root is specified by the path argument. This + directory must exist; the subdirectories and files will be + created. The full path to the output files will be + //. Container and schema are set when + the strgp is added. If you choose a rollover option, then the + filename will also be postpended by "." followed by the epoch + time e.g., XXX/meminfo_ctr/meminfo-123456789. + + altheader=<0/!0> + | + | Distinguishes whether or not to write the header to a separate + file than the data file. 0 = same file. Any non-zero is a + separate file. Default is the same file. If a separate file is + used then, if the data file is called "meminfo" the additional + header file will be called "meminfo.HEADER"). If you choose a + rollover option, the header file name will be postpended with + the epochtime, similar to the data file, and a new one will be + written at each rollover. Default is altheader=0. + + typeheader= + | + | Controls the presence and format of a .KIND file. The kind CSV + file gives type information on each metric (or metric array). + For example, if the metric file is named meminfo, the kind file + is named meminfo.KIND and if the metric file is named + meminfo.15111111, the kind file is named meminfo.KIND.15111111. + The typeformat parameter is 0 (no kind file), 1 (ldms kinds with + arrays flattend out into scalars), 2 (LDMS kinds with arrays). + The typeformat supporting arrays uses the notation + [] for extraction of lengths by scripting tools. + The default typeformat is 0. + + time_format=<0/1> + Controls the format of the initial time fields in each line of the + CSV files. + + A value of 0 means the classic LDMS format where the first field + (Time) is ., and the second field + (Time_usec) is repeated. + + A value of 1 chooses an alternate format where the first field + (Time_msec) is , and the second field + (Time_usec) is just the additional since the epoch in + excess of the milliseconds since epoch. In other words, there is no + overlap of the values in the first and seconds fields, which is in + contrast to the repetition employed by format 0. + + ietfcsv=<0/1> + | + | Turns on (1) or off (0) use of IETF 4180 quoting for header + column names. + + userdata=<0/!0> + | + | Distinguishes whether or not to write each metrics' user data + along with each data value. 0 = no write. Any non-zero means to + write the values. Default is to not write. + + buffer=<0/1/N> + | + | Distinguishes whether or not to buffer the data for the + writeout. 0 = does not buffer. 1 enables buffering with the + system determining the flush. N will flush after approximately N + kB of data (> 4) or N lines -- buffertype determines which of + these it is. Default is system controlled buffering (1). + + buffertype=<3/4> + | + | If buffer=N then buffertype determines if the buffer parameter + refers to kB of writeout or number of lines. The values are the + same as in rolltype, so only 3 and 4 are applicable. + + rolltype= + | + | By default, the store does not rollover and the data is written + to a continously open filehandle. Rolltype and rollover are used + in conjunction to enable the store to manage rollover, including + flushing before rollover. The header will be rewritten when a + roll occurs. Valid options are: + + 1 + | + | wake approximately every rollover seconds and roll. 
Rollover + is suppressed if no data at all has been written and + rollempty=0. + + 2 + | + | wake daily at rollover seconds after midnight (>=0) and roll. + Rollover is suppressed if no data at all has been written and + rollempty=0. + + 3 + | + | roll after approximately rollover records are written. + + 4 + roll after approximately rollover bytes are written. + + 5 + | + | wake at rollover seconds after midnight (>=0) and roll, then + repeat every rollagain (> rollover) seconds during the day. + For example "rollagain=3600 rollover=0 rolltype=5" rolls + files hourly. Rollover is suppressed if no data at all has + been written and rollempty=0. + + rollover= + | + | Rollover value controls the frequency of rollover (e.g., number + of bytes, number of records, time interval, seconds after + midnight). Note that these values are estimates. + + rollempty=0 + | + | Turn off rollover of empty files. Default value is 1 (create + extra empty files). + + create_perm= + | + | Only octal (e.g.0744) specifications are allowed. If unspecified + or 0 is given, then no change is made. The default permission is + 0600 for data files. The mode specified can include execute bits + which will apply to intermediate directories created but not + data files. For example 0755 will yield 0755 for new directories + and 0644 for data files. + + create_uid= + | + | Specify a new user id for data files. If unspecified, no change + in user ownership is made. Changes in ownership of the files do + not affect intermediate directories. + + create_gid= + | + | Specify a new group id for data files. If unspecified, no change + in group ownership is made. + + rename_template= + | + | This option relocates closed CSV files, typically to a + subdirectory, for processing by other tools that watch + directories. The metapath template is applied to define a new + name after file closure. The rename is limited to locations on + the same mount point, per the C rename(2) call. Substitutions + (%) in the provided template are performed as described in + METAPATH SUBSTITUTIONS below. Errors in template specification + will cause the rename to be skipped. As part of the renaming + process, the mode and ownership of the file may also be adjusted + by specifying rename_perm, rename_uid, and rename_gid. Missing + intermediate directories will be created if possible. To enable + greater flexibility than the renaming just described (e.g. + crossing file systems), an external program must monitor the + output directory and handle completed files. + + rename_perm= + | + | Only octal (e.g.0744) specifications are allowed. If unspecified + or 0 is given, then no change is made. The permissions are + changed before the rename and even if the rename fails. This + option is applied only if rename_template is applied. + + rename_uid= + | + | Specify a new user id for the file. If unspecified, no change in + user ownership is made. Changes in ownership of the files do not + affect intermediate directories that might be created following + the template. This option is applied only if rename_template is + applied. + + rename_gid= + | + | Specify a new group id for the file. If unspecified, no change + in group ownership is made. This option is applied only if + rename_template is applied. + + expand_array= + | + | The default is false. Each array element is stored in a column. + True means that all elements are stored in a single column. + + array_sep= + | + | Specify a character to separate array elements. 
If exand_array + is true, the value is ignored. + + array_lquote= + | + | Specify the left-quote character if expand_array is true. If + expand_array is false, the value is ignored. + + array_rquote= + | + | Specify the right-quote character if expand_array is true. If + expand_array is false, the value is ignored. + +OPTIONS FILE +========================= + +The plug-in options file or repeated scripted config calls replace the +LDMS v3 'action' keyword for defining instance specific settings. + +The options file recognizes lines starting with # as comments. +Continuation lines are allowed (end lines with a \\ to continue them). +Comment lines are continued if ended with a \\. See EXAMPLES below. + +When an option is needed for a plugin instance, the content of the +options file is searched beginning with the options line holding +"container=$c schema=$s". If the matching container/schema is not found +in the options file or the option is not defined among the options on +that line of the file, then the option value from the ldmsd script +'config' command line is used. If the option is not set on the command +line, the defaults are taken from the line of the options file +containing the keyword 'store_csv'. If the option is found in none of +these places, the compiled default is applied. + +STRGP_ADD ATTRIBUTE SYNTAX +======================================= + +The strgp_add sets the policies being added. This line determines the +output files via identification of the container and schema. + +**strgp_add** + | plugin=store_csv name= schema= + container= [decomposition=] + | ldmsd_controller strgp_add line + + plugin= + | + | This MUST be store_csv. + + name= + | + | The policy name for this strgp. + + container= + | + | The container and the schema determine where the output files + will be written (see path above). They also are used to match + any specific config lines. + + schema= + | + | The container and the schema determine where the output files + will be written (see path above). You can have multiples of the + same sampler, but with different schema (which means they will + have different metrics) and they will be stored in different + containers (and therefore files). + + decomposition= + | + | Optionally use set-to-row decomposition with the specified + configuration file in JSON format. See more about decomposition + in ldmsd_decomposition(7). + +STORE COLUMN ORDERING +================================== + +This store generates output columns in a sequence influenced by the +sampler data registration. Specifically, the column ordering is + + Time, Time_usec, ProducerName, \* + +where each is either + + .userdata, .value + +or if userdata has been opted not to include, just: + + + +The column sequence of is the order in which the +metrics are added into the metric set by the sampler (or the order they +are specifed by the user). + +Note that the sampler's number and order of metric additions may vary +with the kind and number of hardware features enabled on a host at +runtime or with the version of kernel. Because of this potential for +variation, down-stream tools consuming the CSV files should always +determine column names or column number of a specific metric by parsing +the header line or .HEADER file. 
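+
+As a hypothetical illustration, a meminfo set whose sampler added the
+metrics Active and Inactive (in that order), stored with userdata
+disabled, would yield a header line like:
+
+::
+
+   #Time,Time_usec,ProducerName,Active,Inactive
+
+and, with userdata enabled:
+
+::
+
+   #Time,Time_usec,ProducerName,Active.userdata,Active.value,Inactive.userdata,Inactive.value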
+ +METAPATH SUBSTITUTION +================================== + +The following % escape sequence replacements are performed on the +rename_template value for file renamings: + +%P + | + | plugin name + +%C + | + | container name + +%S + | + | schema name + +%T + | + | file type (DATA, HEADER, KIND, UNITS, CNAMES, PYNAMES) + +%B + | + | basename(closed-file-name) + +%D + | + | dirname(closed-file-name) + +%{ENV_VAR_NAME} + | + | getenv(ENV_VAR_NAME). The use of undefined or empty environment + vars yields an empty substitution, not an error. Characters in the + environment variable are restricted to: 'A-Za-z0-9%@()+-\_./:='; + other characters present will prevent the rename. + +%s + | + | timestamp suffix, if it exists. + +NOTES +================== + +- Please note the argument changes from v2 and v3. The notification of + file events has be removed, being redundant with renaming closed + files into a spool directory. + +- The 'sequence' option has been removed. The 'action' option has been + replaced; see "OPTIONS FILE" above. + +- In the opt_file passed by name to store_csv, including the line + prefix "config name=store_csv" is redundant and is disallowed. The + opt_file syntax is plugin specific and is not an ldmsd configuration + script. Scripts written in the store_csv opt_file syntax cannot be + used directly with the ldmsd include statement. + +BUGS +================= + +None known. + +IMPERFECT FEATURES +=============================== + +The rename and create options do not accept symbolic permissions, uid, +or gid. There is no metapath substitution for file creation. + +EXAMPLES +===================== + +Within ldmsd_controller or in a ldmsd command script file + +:: + + load name=store_csv + config name=store_csv opt_file=/etc/sysconfig/ldms.d/store-plugins/store_csv.conf + strgp_add name=csv_mem_policy plugin=store_csv container=loadavg_store schema=loadavg + +Or with interactive modifications to override file properties: + +:: + + load name=store_csv + config name=store_csv altheader=1 rolltype=2 rollover=0 path=/mprojects/ovis/ClusterData/${LDMSCLUSTER} create_gid=1000000039 create_perm=640 rename_template=%D/archive-spool/%{HOSTNAME}/%B rename_perm=444 + +And in the options file for store_csv +(/etc/sysconfig/ldms.d/store-plugins/store_csv.conf by convention) + +:: + + # defaults for csv, unless overridden on ldmsd script config line. + store_csv altheader=1 path=/XXX/storedir rolltype=2 rollover=0 + # tailored setting for loadavg instance + container=loadavg_store schema=loadavg altheader=0 path=/XXX/loaddir \ + create_gid=1000000039 create_perm=640 \ + rename_template=%D/archive-spool/%{HOSTNAME}/%B \ + rename_perm=444 + +Updating from v3: + +If in version 3 "config name=store_csv action=custom container=cstore +schema=meminfo" was used for a specific csv instance, then put the +additional options for that store instance in the store_csv options file +on a line: + +container=cstore schema=meminfo \* + +or use them interactively or in a script as: + +config name=store_csv container=cstore schema=meminfo \* + +after the store_csv defaults have been set. 
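+
+As a worked illustration of the metapath substitutions (host and file
+names are hypothetical): with
+rename_template=%D/archive-spool/%{HOSTNAME}/%B and HOSTNAME=node42, a
+closed data file /XXX/storedir/loadavg_store/loadavg.1700000000 would be
+renamed to:
+
+::
+
+   /XXX/storedir/loadavg_store/archive-spool/node42/loadavg.1700000000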
+ +SEE ALSO +===================== + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), +ldmsd_decomposition(7) diff --git a/rtd/docs/source/ldms_man/Plugin_store_kafka.rst b/rtd/docs/source/ldms_man/Plugin_store_kafka.rst new file mode 100644 index 000000000..08c58605c --- /dev/null +++ b/rtd/docs/source/ldms_man/Plugin_store_kafka.rst @@ -0,0 +1,86 @@ +================== +Plugin_store_kafka +================== + +:Date: 2 Jun 2022 + +.. contents:: + :depth: 3 +.. + +NAME +=================== + +Plugin_store_kafka - man page for the LDMS store_kafka plugin + +SYNOPSIS +======================= + +| Within ldmsd_controller script: +| ldmsd_controller> load name=store_kafka +| ldmsd_controller> config name=store_kafka + [path=] +| ldmsd_controller> strgp_add name= plugin=store_kafka + container= decomposition= + +DESCRIPTION +========================== + +**store_kafka** uses librdkafka to send rows from the decomposition to +the Kafka servers (specified by strgp's *container* parameter) in JSON +format. The row JSON objects have the following format: { "column_name": +COLUMN_VALUE, ... }. + +PLUGIN CONFIGURATION +=================================== + +**config** **name=**\ *store_kafka* [ **path=\ KAFKA_CONFIG_JSON_FILE** +] + +Configuration Options: + + **name=**\ *store_kafka* + | + | The name of the plugin. This must be **store_kafka**. + + **path=**\ *KAFKA_CONFIG_JSON_FILE* + The optional KAFKA_CONFIG_JSON_FILE contains a dictionary with + KEYS being Kafka configuration properties and VALUES being their + corresponding values. **store_kafka** usually does not require + this option. The properties in the KAFKA_CONFIG_JSON_FILE is + applied to all Kafka connections from store_kafka. Please see + `librdkafka CONFIGURATION + page `__ + for a list of supported properties. + +STRGP CONFIGURATION +================================== + +**strgp_add** **name=**\ *NAME* **plugin=**\ store_kafka +**container=**\ *KAFKA_SERVER_LIST* +**decomposition=**\ *DECOMP_CONFIG_JSON_FILE* + +strgp options: + + **name=**\ *NAME* + | + | The name of the strgp. + + **plugin=**\ store_kafka + | + | The plugin must be store_kafka. + + **container=**\ *KAFKA_SERVER_LIST* + | + | A comma-separated list of Kafka servers (host[:port]). For + example: container=localhost,br1.kf:9898. + + **decomposition=**\ *DECOMP_CONFIG_JSON_FILE* + | + | Set-to-row decomposition configuration file (JSON format). See + more about decomposition in **ldmsd_decomposition**\ (7). + +SEE ALSO +======================= + +ldmsd_decomposition(7) diff --git a/rtd/docs/source/ldms_man/Plugin_store_papi.rst b/rtd/docs/source/ldms_man/Plugin_store_papi.rst new file mode 100644 index 000000000..02f5e99c4 --- /dev/null +++ b/rtd/docs/source/ldms_man/Plugin_store_papi.rst @@ -0,0 +1,114 @@ +================= +Plugin_store_papi +================= + +:Date: 30 Sep 2019 + +.. contents:: + :depth: 3 +.. 
+
+NAME
+==================
+
+Plugin_store_papi - man page for the LDMSD store_papi plugin
+
+SYNOPSIS
+======================
+
+Within ldmsd_controller or a configuration file:
+
+**load** **name=store_papi**
+
+**config** **name=store_papi** **path=**\ *STORE_ROOT_PATH*
+
+**strgp_add** **name=**\ *STRGP_NAME* **plugin=store_papi**
+**container=**\ *CONTAINER* **schema=**\ *SCHEMA*
+
+**strgp_prdcr_add** **name=**\ *STRGP_NAME* **regex=**\ *PRDCR_REGEX*
+
+DESCRIPTION
+=========================
+
+**store_papi** is an LDMSD storage plugin for storing data from
+**papi_sampler**, specifically as it expects a collection of PAPI event
+metrics after a certain job metric (task_ranks) that only
+**papi_sampler** produces. **store_papi** stores data in a SOS container
+(specified by the **strgp** **container** option). Unlike **store_sos**
+(see **Plugin_store_sos**\ (7)), where an entire LDMS snapshot results
+in an SOS data entry, **store_papi** splits the PAPI events in the set
+into their own schemas and data points. For example, if we have
+PAPI_TOT_INS and PAPI_TOT_CYC as PAPI events in the **papi_sampler**
+set, we will have PAPI_TOT_INS and PAPI_TOT_CYC schemas in the SOS
+container storing the respective PAPI events. This allows storing
+flexible, user-defined schemas at run-time by user jobs (LDMS schemas of
+sets from **papi_sampler** are defined at run-time by user jobs). Please
+note that the schema name defined by the user job must match the
+**strgp**'s schema in order to store the data.
+
+CONFIG OPTIONS
+============================
+
+**name=store_papi**
+   This MUST be store_papi (the name of the plugin).
+
+**path=**\ *STORE_ROOT_PATH*
+   The path to the root of the store. The SOS container for each schema
+   specified by the storage policy (**strgp**) will be placed in the
+   *STORE_ROOT_PATH* directory.
+
+STORAGE POLICY
+============================
+
+An LDMSD storage plugin is like a storage driver that provides only the
+storing mechanism. A storage policy (**strgp**) is a glue binding data
+sets from various producers to a container of a storage plugin.
+
+The **strgp_add** command defines a new storage policy, identified by
+**name**. The **plugin** attribute tells the storage policy which
+storage plugin to work with. The **schema** attribute identifies the
+LDMS schema the data set of which is consumed by the storage policy. The
+**container** attribute identifies a container inside the storage plugin
+that will store data.
+
+**strgp_prdcr_add** is a command to specify producers that feed data to
+the storage policy.
+
+BUGS
+==================
+
+No known bugs.
+
+EXAMPLES
+======================
+
+Plugin configuration example:
+
+   ::
+
+      load name=store_papi
+      config name=store_papi path=/var/store
+      strgp_add name=papi_strgp plugin=store_papi container=papi schema=papi
+      strgp_prdcr_add name=papi_strgp regex=.*
+
+The following job script and PAPI JSON config combination is an example
+of submitting a PAPI-enabled job that will end up in the storage of the
+configuration above.
+
+Job script example:
+
+   ::
+
+      #!/bin/bash
+      export SUBSCRIBER_DATA='{"papi_sampler":{"file":"/tmp/papi.json"}}'
+      srun bash -c 'for X in {1..60}; do echo $X; sleep 1; done'
+
+PAPI JSON example (/tmp/papi.json):
+
+   ::
+
+      {
+         "schema": "papi",
+         "events": [
+            "PAPI_TOT_INS",
+            "PAPI_L1_DCM"
+         ]
+      }
+
+SEE ALSO
+=====================
+
+**Plugin_papi_sampler**\ (7), **ldmsd**\ (8), **ldms_quickstart**\ (7),
+**ldmsd_controller**\ (8), **ldms_sampler_base**\ (7).
diff --git a/rtd/docs/source/ldms_man/Plugin_store_rabbitkw.rst b/rtd/docs/source/ldms_man/Plugin_store_rabbitkw.rst new file mode 100644 index 000000000..b14499155 --- /dev/null +++ b/rtd/docs/source/ldms_man/Plugin_store_rabbitkw.rst @@ -0,0 +1,230 @@ +===================== +Plugin_store_rabbitkw +===================== + +:Date: 10 Jun 2018 + +.. contents:: + :depth: 3 +.. + +NAME +====================== + +Plugin_store_rabbitkw - man page for the LDMS store_rabbitkw plugin + +SYNOPSIS +========================== + +| Within ldmsd_controller or in a configuration file +| load name=store_rabbitkw +| config name=store_rabbitkw [ = ] +| strgp_add name=store_rabbitkw [ = ] + +DESCRIPTION +============================= + +The store_rabbitkw plugin is a rabbitmq producer. Actual storage of data +must be arranged separately by configuring some other amqp client. + +CONFIGURATION ATTRIBUTE SYNTAX +================================================ + +The configuration parameters routing_key, host, port, exchange, vhost, +user, and pwfile are shared across all metric sets. + +**config** + | name= exchange= routing_key= host= + port= vhost= user= pwfile= + [extraprops= logmsg= useserver=[y/n> heartbeat= + timeout= retry=] + | These parameters are: + + name= + | + | This MUST be store_rabbitkw. + + routing_key + | + | The routing key shared by all metric sets is . + + host= + | + | The rabbitmq server host. The default is localhost. + + port= + | + | The server port on the nearest rabbitmq host. The default is + 5672. + + exchange= + | + | The amqp exchange to publish with is . The default is + amq.topic. This must preexist; the plugin will no cause its + creation. + + vhost= + | + | The virtual host to be used is . The default is "/". + + user= + | + | The amqp username is . The default is "guest". + + pwfile= + | + | The file contains the amqp user password in the format + 'secretword=password. The default password "guest" is assumed if + no file is specified. + + retry= + | + | If amqp connection fails due to network or server issue, retry + every seconds. Default is 60. + + heartbeat= + | + | Heartbeat interval used to detect failed connections. + + timeout= + | + | Timeout to use for connections, in milliseconds. Default is + 1000. + + extraprops= + | + | Turn on (y) or off (n) the use of extra properties with all + messages. If AMQP-based filtering is not planned, 'n' will + reduce message sizes slightly. + + logmsg= + | + | Enable (y) or disable (n, the default) logging all message + metric content at the DEBUG level. This is a debugging option. + + useserver= + | + | Enable (y, the default) or disable (n) calls to the amqp server; + this is a debugging option. + +STORE ATTRIBUTE SYNTAX +======================================== + +**store** + | name= schema= container= + + name= + | + | This MUST be store_rabbitkw. + + schema= + | + | The name of the metric group, independent of the host name. The + schema will be used as a header in messages if extraprops is y. + + container= + | + | The container will be used as a header in messages if extraprops + is y. + +AMQ event contents +==================================== + +This store generates rabbitmq events containing the data from LDMS set +instances. All events are on the single queue that is configured. + +The properties follow the AMQP standard, with LDMS specific +interpretations: + + timestamp + | + | The sample collection time in MICROSECONDS UTC. Divide by + 1,000,000 to get seconds UTC. + + app_id + | + | The app_id is LDMS. 
+
Optional AMQ event contents
=============================================

These fields and headers are present if extraprops=y is configured.

content_type
   |
   | <"text/plain"> for all.

reply_to
   |
   | The metric set instance name.

container
   |
   | The container configuration name.

schema
   |
   | The schema configuration name.

PAYLOAD FORMAT
================================

Payloads are ASCII formatted, tab separated "label=val" lists.

Scalar metric values are formatted in obvious C ways to ensure full
precision is retained. Each is a tab-separated triplet 'metric=$name
type=$scalar_type value=$value'. Before the metric values on each line
are the keys and values: timestamp_us, producer, container, schema.

Array values are formatted as semicolon separated lists: each metric
appears as a tab-separated quartet 'metric=$name type=$scalar_type
length=$array_length value=$value'.

CHAR_ARRAY values are formatted as strings. Note these are terminated
at the first nul character.

NOTES
=======================

The semantics of LDMS messages are not an extremely close match to the
network mail and news messages targeted by AMQP. The interpretations of
message properties used here may be subject to change in future
releases.

The authentication to the AMQP server uses the SASL plaintext method. In
HPC environments this is normally secure. Additional options enabling
encryption are likely to appear in future work at a cost in CPU.
Normally, an amqp server federation member should be hosted on or very
near the LDMS aggregator host.

Presently each payload contains a single line (with tab separators).
Future versions may capture multiple set instances per message, where
each set is separated by newlines from the others.

The behavior of this AMQP client when faced with AMQP server
disappearance is to retry the connection later and to ignore any metric
data seen while disconnected.

BUGS
======================

String data containing tab characters is not compatible with this data
encoding. This may be fixed when a satisfactory alternate representation
is agreed for these special characters.

EXAMPLES
==========================

See the LDMS test script rabbitkw.

ADMIN HINTS
=============================

On Linux, this requires an amqp service (typically
rabbitmq-server.service) running in the network. That service may
require epmd.service.

SEE ALSO
==========================

ldmsd(8), rabbitmq-server(1), ldmsd_controller(8), store_rabbitv3(7)
diff --git a/rtd/docs/source/ldms_man/Plugin_store_rabbitv3.rst b/rtd/docs/source/ldms_man/Plugin_store_rabbitv3.rst
new file mode 100644
index 000000000..e1c59cc35
--- /dev/null
+++ b/rtd/docs/source/ldms_man/Plugin_store_rabbitv3.rst
@@ -0,0 +1,221 @@
+=====================
Plugin_store_rabbitv3
=====================

:Date: 03 Dec 2016

.. contents::
   :depth: 3
..

NAME
======================

Plugin_store_rabbitv3 - man page for the LDMS store_rabbitv3 plugin

SYNOPSIS
==========================

| Within ldmsd_controller or in a configuration file
| load name=store_rabbitv3
| config name=store_rabbitv3 [ <attr>=<value> ]
| strgp_add name=store_rabbitv3 [ <attr>=<value> ]

DESCRIPTION
=============================

The store_rabbitv3 plugin is a rabbitmq producer. Actual storage of data
must be arranged separately by configuring some other amqp client.
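Since the store only publishes, a matching consumer must be arranged
elsewhere. The following minimal sketch is an illustration, not part of
LDMS: it assumes the third-party **pika** Python client (>= 1.0), a
rabbitmq server on localhost, and the guest credentials and amq.topic
exchange defaults described below. It drains every routing key and
prints the message bodies; real storage logic would replace the print.

::

   # consume_rabbitv3.py -- hypothetical companion consumer (not part of LDMS).
   import pika

   params = pika.ConnectionParameters(
       host='localhost', port=5672, virtual_host='/',
       credentials=pika.PlainCredentials('guest', 'guest'))
   conn = pika.BlockingConnection(params)
   ch = conn.channel()

   # Bind a throwaway queue to the default exchange used by the store.
   q = ch.queue_declare(queue='', exclusive=True).method.queue
   ch.queue_bind(exchange='amq.topic', queue=q, routing_key='#')

   def on_msg(channel, method, properties, body):
       # A real client would persist the value and its properties here.
       print(method.routing_key, body.decode())

   ch.basic_consume(queue=q, on_message_callback=on_msg, auto_ack=True)
   ch.start_consuming()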
+
CONFIGURATION ATTRIBUTE SYNTAX
================================================

The configuration parameters root, host, port, exchange, vhost, user,
and pwfile are shared across all metric sets.

**config**
   | name=<plugin_name> root=<root> host=<host> port=<port>
     exchange=<exch> vhost=<vhost> user=<user> pwfile=<path>
     extraprops=<y/n> metainterval=<mint>
   | These parameters are:

   name=<plugin_name>
      |
      | This MUST be store_rabbitv3.

   root=<root>
      |
      | The routing key prefix shared by all metric sets will be
        <root>.

   host=<host>
      |
      | The rabbitmq server host. The default is localhost.

   port=<port>
      |
      | The server port on the nearest rabbitmq host. The default is
        5672.

   exchange=<exch>
      |
      | The amqp exchange to publish with is <exch>. The default is
        amq.topic.

   vhost=<vhost>
      |
      | The virtual host to be used is <vhost>. The default is "/".

   user=<user>
      |
      | The amqp username is <user>. The default is "guest".

   pwfile=<path>
      |
      | The file <path> contains the amqp user password in the format
        'secretword=password'. The default password "guest" is assumed
        if no file is specified.

   extraprops=<y/n>
      |
      | Turn on (y) or off (n) the use of extra properties with all
        messages.

   metainterval=<mint>
      |
      | The number of seconds between emission of time and host
        invariant (meta) metrics.

STORE ATTRIBUTE SYNTAX
========================================

**store**
   | name=<plugin_name> schema=<schema> container=<container>

   name=<plugin_name>
      |
      | This MUST be store_rabbitv3.

   schema=<schema>
      |
      | The name of the metric group, independent of the host name.

   container=<container>
      |
      | The container will be used in the routing key. The current
        routing key pattern is:
        <root>.<container>....

      Use a unique container parameter for different metric sets coming
      from different samplers (e.g., do not use the same container for
      procstat and meminfo); however, use the same container for the
      same metric set coming from all hosts (e.g., for all meminfo).

AMQ event contents
====================================

This store generates rabbitmq events. The message in each event is just
the metric value in string form. The message properties of each event
encode everything else.

The properties follow the AMQP standard, with LDMS specific
interpretations:

   timestamp
      |
      | The sample collection time in MICROSECONDS UTC. Divide by
        1,000,000 to get seconds UTC.

   type
      |
      | The ldms metric data type.

   app_id
      |
      | The app_id is the integer component_id, if it has been defined
        by the sampler.

Optional AMQ event contents
=============================================

These fields and headers are present if extraprops=y is configured.

content_type
   |
   | <"text/plain"> for all.

reply_to
   |
   | The producer name.

metric
   |
   | The label registered by the sampler plugin, which might be
     anything.

metric_name_amqp
   |
   | The label modified to work as a routing key, not necessarily
     easily read.

metric_name_least
   |
   | The label modified to work as a programming variable name, possibly
     shortened and including a hash suffix. Not expected to be fully
     human-readable in all cases. It will be the same across runs for
     metric sets whose content labels do not vary across runs.

container
   |
   | The container configuration name.

schema
   |
   | The schema configuration name.

PAYLOAD FORMAT
================================

Payloads are ASCII formatted.

Scalar values are formatted in obvious C ways to ensure full precision
is retained. Each is a doublet: type,value

Array values are formatted as comma separated lists:
type,array-length,value[,value]\*.
+
Char array values omit the commas in the value list, giving the
appearance of a string. Note however that there may be embedded nul
characters.

NOTES
=======================

The semantics of LDMS messages are not an extremely close match to the
network mail and news messages targeted by AMQP. The interpretations of
message properties used here may be subject to change in major releases
of LDMS.

The authentication to the AMQP server uses the SASL plaintext method. In
HPC environments this is normally secure. Additional options enabling
encryption are likely to appear in future work at a cost in CPU.
Normally, an amqp server federation member should be hosted on or very
near the LDMS aggregator host.

BUGS
======================

The periodic emission of meta metrics should be per (producer,metric)
pair, but the store API is not yet sufficient to make this a scalable
and efficient operation. In the meanwhile, meta metrics are emitted on
first definition and assumed to be identical for a metric set across all
producers. The special case of component_id (if present) is handled
correctly when extraprops=y is configured.

EXAMPLES
==========================

See the LDMS test script ldms_local_amqptest.sh.

SEE ALSO
==========================

ldmsd(8), rabbitmq-server(1), ldmsd_controller(8)
diff --git a/rtd/docs/source/ldms_man/Plugin_store_sos.rst b/rtd/docs/source/ldms_man/Plugin_store_sos.rst
new file mode 100644
index 000000000..f09e6e2d7
--- /dev/null
+++ b/rtd/docs/source/ldms_man/Plugin_store_sos.rst
@@ -0,0 +1,346 @@
+================
Plugin_store_sos
================

:Date: 21 Dec 2015

.. contents::
   :depth: 3
..

NAME
=================

Plugin_store_sos - man page for the LDMS store_sos plugin

SYNOPSIS
=====================

| Within ldmsd_controller script:
| ldmsd_controller> load name=store_sos
| ldmsd_controller> config name=store_sos path=<path>
| ldmsd_controller> strgp_add plugin=store_sos [ <attr>=<value> ]

DESCRIPTION
========================

With LDMS (Lightweight Distributed Metric Service), store plugins for
the ldmsd (ldms daemon) are configured via the ldmsd_controller. The
store_sos plugin is a sos store.

To build the store_sos, build with the following flag: **--enable-sos**

STORE_SOS INIT CONFIGURATION ATTRIBUTE SYNTAX
==========================================================

**config**
   | name=<plugin_name> path=<path>
   | ldmsd_controller configuration line

   name=<plugin_name>
      |
      | This MUST be store_sos.

   path=<path>
      |
      | The store will be put into a directory whose root is specified
        by the path argument. This directory must exist; the store will
        be created. The full path to the store will be
        <path>/<container>. The schema(s) determine the schema(s) of
        the database. Container and schema are set when the strgp is
        added.

STRGP_ADD ATTRIBUTE SYNTAX
=======================================

The strgp_add sets the policies being added. This line identifies the
container and schema for a store.

**strgp_add**
   | plugin=store_sos name=<policy_name> schema=<schema>
     container=<container> [decomposition=<path>]
   | ldmsd_controller strgp_add line

   plugin=<plugin_name>
      |
      | This MUST be store_sos.

   name=<policy_name>
      |
      | The policy name for this strgp.

   container=<container>
      |
      | The container and schema define the store as described above
        (see path).

   schema=<schema>
      |
      | The container and schema define the store as described above
        (see path).
You can have multiples of the same path and
        container, but with different schema (which means they will
        have different metrics), and they will be stored in the same
        store.

   decomposition=<path>
      |
      | Optionally use set-to-row decomposition with the specified
        configuration file in JSON format. See more about decomposition
        in ldmsd_decomposition(7).

USING SOS COMMANDS TO MANAGE PARTITIONS
====================================================

Some of the basic sos commands are given below. SOS tools will be built
into XXX. Any commands given with no argument will return usage info.

**sos_part_query**
   |
   | List the partitions defined in a container.

**sos_part_create**
   | -C <path> [-s <state>] part_name
   | Create a partition.

   **-C** *<path>*
      |
      | Path to the container

   **-s** *state*
      |
      | State of the new partition (case insensitive). Default is
        OFFLINE. Optional parameter. Valid options are:

   - PRIMARY: all new allocations go in this partition

   - ONLINE: objects are accessible, but the partition does not grow

   - OFFLINE: object references are invalid; the partition may be moved
     or deleted.

   **part_name**
      |
      | Name of the partition

**sos_part_delete**
   | -C <path> name
   | Delete a partition in a container. The partition must be in the
     OFFLINE state to be deleted.

   **-C** *<path>*
      |
      | Path to the container

   **name**
      |
      | Name of the partition

**sos_part_modify**
   | -C <path> [-s <state>] part_name
   | Modify the state of a partition.

   **-C** *<path>*
      |
      | Path to the container

   **-s** *state*
      |
      | State of the new partition (case insensitive). Default is
        OFFLINE. Optional parameter. Valid options are:

   - PRIMARY: all new allocations go in this partition

   - ONLINE: objects are accessible, but the partition does not grow

   - OFFLINE: object references are invalid; the partition may be moved
     or deleted.

   **part_name**
      |
      | Name of the partition

**sos_part_move**
   | -C <path> -p <new_path> part_name
   | Move a partition to another storage location.

   **-C** *<path>*
      |
      | Path to the container

   **-p** *<new_path>*
      |
      | The new path.

   **part_name**
      |
      | Name of the partition

USING SOS COMMANDS TO LOOK AT DATA IN A PARTITION
==============================================================

sos_cmd can be used to get data from an sos instance. Some relevant
command options are below. Example usage is in the example section.

**sos_cmd**
   | -C <path> -l
   | Print a directory of the schemas.

   **-C** *<path>*
      |
      | Path to the container

**sos_cmd**
   | -C <path> -i
   | Show debug information for the container

   **-C** *<path>*
      |
      | Path to the container

**sos_cmd**
   | -C <path> -q -S <schema> -X <index> -V <var> -V <var> ....
   | Print data from a container

   **-C** *<path>*
      |
      | Path to the container

   **-q**
      Used to query

   **-S** *<schema>*
      |
      | Schema querying against

   **-X** *<index>*
      |
      | Variable that is indexed to use in the query.

   **-V** *<var>*
      |
      | One or more vars to output.

NOTES
==================

- The configuration lines do not allow specification of the partition;
  that is done automatically (by default this is the epoch timestamp).

- Management of partitions is done outside of LDMS (e.g., a cron script
  that creates new partitions and changes old ones from PRIMARY to
  ONLINE).

BUGS
=================

No known bugs.
+
EXAMPLES
=====================

Configuring store_sos:
----------------------

::

   ldmsd_controller> load name=store_sos
   ldmsd_controller> config name=store_sos path=/XXX/storedir
   ldmsd_controller> strgp_add name=sos_mem_policy plugin=store_sos container=sos schema=meminfo

Querying a container's partitions:
----------------------------------

::

   $ sos_part /NVME/0/SOS_ROOT/Test
   Partition Name       RefCount Status           Size     Modified         Accessed         Path
   -------------------- -------- ---------------- -------- ---------------- ---------------- ----------------
   00000000                    3 ONLINE                 1M 2015/08/25 13:49 2015/08/25 13:51 /SOS_STAGING/Test
   00000001                    3 ONLINE                 2M 2015/08/25 11:54 2015/08/25 13:51 /NVME/0/SOS_ROOT/Test
   00000002                    3 ONLINE                 2M 2015/08/25 11:39 2015/08/25 13:51 /NVME/0/SOS_ROOT/Test
   00000003                    3 ONLINE PRIMARY         2M 2015/08/25 11:39 2015/08/25 13:51 /NVME/0/SOS_ROOT/Test

Looking at a container's directory:
-----------------------------------

Variables that are options for -X in the sos_cmd will have indexed = 1.

::

   $ sos_cmd -C /NVME/0/LDMS -l
   schema :
       name      : aries_nic_mmr
       schema_sz : 1944
       obj_sz    : 192
       id        : 129
       -attribute : timestamp
           type          : TIMESTAMP
           idx           : 0
           indexed       : 1
           offset        : 8
       -attribute : comp_time
           type          : UINT64
           idx           : 1
           indexed       : 1
           offset        : 16
       -attribute : job_time
           type          : UINT64
           idx           : 2
           indexed       : 1
           offset        : 24
       -attribute : component_id
           type          : UINT64
           idx           : 3
           indexed       : 0
           offset        : 32
       -attribute : job_id
           type          : UINT64
           idx           : 4
           indexed       : 0
           offset        : 40
       -attribute : AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_PKTS
           type          : UINT64
           idx           : 5
           indexed       : 0
           offset        : 48
       -attribute : AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_FLITS
           type          : UINT64
           idx           : 6
           indexed       : 0
           offset        : 56
       -attribute : AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_STALLED
           type          : UINT64
           idx           : 7
           indexed       : 0
           offset        : 64
   ...

Looking at variable values in a container:
------------------------------------------

::

   $ sos_cmd -C /NVME/0/LDMS -q -S aries_nic_mmr -X timestamp -V timestamp -V AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_PKTS
   timestamp                        AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_PKTS
   -------------------------------- ------------------
   1447449560.003480                1642207034
   1447449630.002155                1642213993
   1447449630.003115                88703749
   1447449630.003673                74768272
   1447449640.002818                74768367
   1447449640.003201                88703844
   1447449640.003249                1642214024
   1447449650.002885                74768402
   1447449650.003263                1642214059
   1447449650.003325                88703874
   1447449660.002954                74768511
   1447449660.003308                1642214174
   1447449660.003444                88703993
   1447449670.003015                74768547
   1447449670.003361                1642214205
   1447449670.003601                88704024
   1447449680.003081                74768582

SEE ALSO
=====================

ldms(7), Plugin_store_csv(7), ldmsd_decomposition(7)
diff --git a/rtd/docs/source/ldms_man/index.rst b/rtd/docs/source/ldms_man/index.rst
new file mode 100644
index 000000000..be58cdc9d
--- /dev/null
+++ b/rtd/docs/source/ldms_man/index.rst
@@ -0,0 +1,8 @@
+LDMS Man Pages
==============

.. toctree::
   :maxdepth: 1
   :glob:

   *
diff --git a/rtd/docs/source/ldms_man/ldms-csv-anonymize.rst b/rtd/docs/source/ldms_man/ldms-csv-anonymize.rst
new file mode 100644
index 000000000..fe14aebe4
--- /dev/null
+++ b/rtd/docs/source/ldms_man/ldms-csv-anonymize.rst
@@ -0,0 +1,183 @@
+==================
ldms-csv-anonymize
==================

:Date: 18 Apr 2019

.. contents::
   :depth: 3
..
+
NAME
===================

ldms-csv-anonymize - anonymize columns of csv files

SYNOPSIS
=======================

ldms-csv-anonymize -h

ldms-csv-anonymize [--input csv-file] [--out-dir OUT_DIR] [--col-sep
COL_SEP] [--seed SEED] [--save-maps SAVE_MAPS] [--imap IMAP] [--nmap
NMAP] [--pmap PMAP] [--hmap HMAP] [--debug] [M:C [M:C ...]]

ldms-csv-anonymize --gen-args GEN_ARGS

DESCRIPTION
==========================

The ldms-csv-anonymize command rewrites ldms and slurm data files
column-wise with filters specified by the M:C arguments. M:C is a
mapping:column number pair or filename. M is one of int, path, name, or
host. C is a nonzero number. Negative numbers count back from the last
column.

OPTIONS
======================

--input=FILE
   |
   | The argument is a file name or space-separated list of file names
     to be processed. Filenames cannot contain whitespace.

--out-dir=OUT_DIR
   |
   | The argument is a directory (it must pre-exist and should not be
     the same as any directory containing the input) which will be
     filled with the changed files. The original files are not changed.
     If an output file name coincides with one of the inputs, the input
     data may be lost or corrupted.

--col-sep=COL_SEP
   |
   | Split columns at this character. The default is comma.

--save-maps=SAVE_MAPS
   |
   | The path prefix for the generated map files. If the resulting map
     filenames coincide with an existing file, the existing file is
     overwritten.

--imap=IMAP
   |
   | An integer mapping file to preload. It must contain two columns of
     integers and magic. Normally it is the output of a prior run. See
     MAPS below.

--nmap=NMAP
   |
   | A name mapping file to preload. It must contain two columns of
     names and magic. Normally it is the output of a prior run. Each
     real name is replaced with 'n' and a sequential number. See MAPS
     below.

--pmap=PMAP
   |
   | A path element mapping file to preload. It must contain two
     columns of path elements and magic. Normally it is the output of a
     prior run. Path elements are unqualified subdirectory names. Each
     unique subdirectory name is replaced with 'p' and a sequential
     number, allowing directory hierarchy to be preserved without
     revealing application identities. See MAPS below.

--hmap=HMAP
   |
   | A host name mapping file to preload. It must contain columns of
     host elements and magic. It may be host name fragment information
     or the output of a prior run. Any hostname found in the input data
     which cannot be mapped to the host elements will cause an
     anonymization error. There is no default handling of unknown
     hosts. See MAPS below.

--gen-args=M:H[,M:H]\*,FILE
   |
   | Creating the M:C specification needed in a data transformation run
     can be done by first using the argument generation mode. Given a
     file starting with a header line of column names and the list of
     method:name pairs, this command displays the corresponding list of
     M:C arguments needed for the data transformation.

--debug
   |
   | Echo some details of the transformation as it runs.

--seed=SEED
   |
   | Supply a seed to the random number generator. No random values are
     used at this time in the processing, however.

MAPS and MAGIC
=============================

Map files all start with a line of the form "#anonymize-csv-map <kind>",
where kind is one of the supported M values. The columns of the file are
separated by whitespace. The first column is the item of input data to
be replaced and the second column is the replacement. Multiple items
from column 1 may have the same value in column 2.
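As an illustration of this format, a small integer (imap) file might
read as follows; the numbers are invented for this sketch, and note
that two inputs may share one replacement:

::

   #anonymize-csv-map int
   4001 1
   4002 2
   4017 2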
+ +By default, map files are saved in the output directory as +anonmap_Xmap.txt, where X is replaced with a kind indicator (i, p, n, +h). The prefix option is used to relocate these outputs. They cannot be +suppressed. + +In the special case of host names and host lists, name fragment +substitutions are supported. Any appearance of a host list, such as +gw[1,3-5] is expanded to single hostnames. Each host name is split at +"-", and each fragment is checked for a replacement from the hmap file. +Any fragment not found in the hmap has right-side digits 0-9 stripped +and mapping the remainder is again attempted; if successful, the +stripped number is appended to the result, otherwise an error occurs. +The fragments are rejoined with "-". When all hosts in the appearance +have been rewritten, the host list is collapsed before output. + +The special host map element 'netdomains' is used to remove fully +qualified domain suffixes. It is a comma separated list of suffixes, and +order matters (subdomains should come before their root if both appear). +Suffix removal occurs before substitution. + +NOTES +==================== + +There is no column delete option; use cut(1) to remove entire columns. + +To ensure map consistency across multiple runs, use the map outputs as +the map inputs to the second and subsequent runs. + +EXAMPLES +======================= + +In bash: + +:: + + colargs=$(ldms-csv-anonymize \ + --gen-args=host:ProducerName,int:uid,name:username,jobid.HEADER) + + ldms-csv-anonymize $colargs \ + --out-dir=/tmp \ + --save-maps=anonjob_ \ + --hmap=/home/anonjob_hmap.txt \ + --input=/home/jobid.csv + +and in a host map file: + +:: + + #anonymize-csv-map host + netdomains .ca.sandia.gov,.sandia.gov + compute node + admin svc + +will cause compute01 to be replaced with node01 and admin7 to be +replaced with svc7. The .sandia.gov and .ca.sandia.gov domains will be +stripped. + +BUGS +=================== + +There is no pipeline filtering mode. + +SEE ALSO +======================= + +cut(1) diff --git a/rtd/docs/source/ldms_man/ldms-csv-export-sos.rst b/rtd/docs/source/ldms_man/ldms-csv-export-sos.rst new file mode 100644 index 000000000..28864c4bc --- /dev/null +++ b/rtd/docs/source/ldms_man/ldms-csv-export-sos.rst @@ -0,0 +1,213 @@ +=================== +ldms-csv-export-sos +=================== + +:Date: 18 Apr 2019 + +.. contents:: + :depth: 3 +.. + +NAME +==================== + +ldms-csv-export-sos - generate helper files needed by sos-import-csv + +SYNOPSIS +======================== + +ldms-csv-export-sos -h + +ldms-csv-export-sos [--data DATA] [--blacklist BLACKLIST] [--whitelist +WHITELIST] [--exclude EXCLUDE] [--include INCLUDE] [--schema-name +SCHEMA_NAME] [--schema-file SCHEMA_FILE] [--map-file MAP_FILE] +[--strip-udata] [--guess] [--widen] [--maxlines MAXLINES] [--assume +ASSUME] [--verbose] + +DESCRIPTION +=========================== + +The ldms-csv-export-sos command parses LDMS CSV file information to +generate corresponding map (and optionally schema) files used by +sos-import-csv. + +OPTIONS +======================= + +--data= + | + | DATA is a file name of a LDMS .HEADER, .KIND, or data file. The + file name and at least the first line of the file are digested to + determine the content and the column types. LDMS CSV file name + conventions ($schema[.$date] is associated with + $schema.HEADER.$date or $schema.KIND.$date in the same directory). + The file may be gzipped; if so, the matching data/HEADER/KIND files + must also be gzipped. 
+
--blacklist=BLACKLIST
   |
   | BLACKLIST is the name of a file with column names to exclude from
     the schema, one per line. Leading # comments are allowed in the
     file.

--whitelist=WHITELIST
   |
   | WHITELIST is the name of a file with column names to include in
     the schema, one per line. Leading # comments are allowed in the
     file. Any other columns found are excluded.

--exclude=LIST
   |
   | LIST is a string of metric names separated by commas. Columns
     named are excluded from the generated schema.

--include=LIST
   |
   | LIST is a string of metric names separated by commas. Columns
     named are included in the generated schema and all other columns
     found are excluded.

--schema-name=SCHEMA_NAME
   |
   | SCHEMA_NAME overrides the default schema name determined from the
     data file name.

--schema-file=SCHEMA_FILE
   |
   | Use an existing schema file SCHEMA_FILE instead of generating a
     schema. When not specified, a schema file is always generated.
     Schema files may not be gzipped.

--map-file=MAP_FILE
   |
   | Override the output map file name derived from the data file name.

--alias-file=ALIAS_FILE
   |
   | Provide the list of metrics to rename when creating or matching a
     schema discovered from a header line.

--strip-udata
   |
   | Suppress output of .userdata fields and remove the .value suffix
     from schema element names.

--guess
   |
   | Guess the ldms data column types. (This can be slow on large
     files.)

--maxlines=MAXLINES
   |
   | Parse no more than MAXLINES to guess data types with the --guess
     option. The default if unspecified is 100000 lines.

--assume=ASSUME
   |
   | Assume all unknown data columns are of type ASSUME.

--verbose
   |
   | Show process debugging details.

--widen
   |
   | Widen numeric types discovered to 64 bits.

METRIC FILTERING
================================

When an include or whitelist is specified, exclude and blacklist
arguments are ignored entirely. An include option cannot be used to
prune a blacklist file.

When userdata is present in the CSV file, metric names given to these
filters should be written without the .value or .userdata suffix.

NOTES
=====================

The recommended export method is to use the .KIND file if available and
to use the options "--guess --widen --maxlines=2" for legacy LDMS files.
This tool is aware of the CSV conventions (up to LDMS v4) for columns
named Time, ProducerName, producer, compid, component_id, Time_usec,
DT_usec, jobid, job_id, app_id, uid, and names ending in .userdata.

Both the assume and guess options should be used judiciously. Know your
data before using SOS or any other database. The output schema file is
formatted for editability, and it should be adjusted before use with SOS
if any guess or assumption proves erroneous.

BUGS
====================

There is no pipeline filtering mode.
+ +EXAMPLES +======================== + +To test sos-import-csv with the resulting files: + +:: + + + ldms-csv-export-sos --data=renamecsv.1553744481 \ + --strip-udata --schema-name=meminfo \ + --blacklist=exclude.renamecsv + + mkdir container + sos-db --path container --create + sos-schema --path container \ + --add renamecsv.SCHEMASOS.1553744481 + sos-import-csv \ + --path container \ + --csv renamecsv.1553744481 \ + --map renamecsv.MAPSOS.1553744481 \ + --schema meminfo \ + --status + sos_cmd -C container -l + sos_cmd -C container -q -S meminfo -X Time + +Other examples + +:: + + + # make schema and map from *81 with schema rename from file + ldms-csv-export-sos --data=renamecsv.1553744481 \ + --strip-udata --schema-name=meminfo \ + --blacklist=exclude.renamecsv + + # reuse schema and make map from *90 + ldms-csv-export-sos --data=renamecsv.1553744490 \ + --schema-file=renamecsv.SCHEMASOS.1553744481 + + # reuse schema and make map from *90 with alternate output name + ldms-csv-export-sos --data=renamecsv.1553744490 \ + --strip-udata \ + --schema-file=renamecsv.SCHEMASOS.1553744481 \ + --map-file=mymap + + # translate array example (when supported) + ldms-csv-export-sos --data=fptrans.HEADER --strip-udata + + # translate array with old schema (when supported) + ldms-csv-export-sos --data=fptrans2.HEADER \ + --schema-file=fptrans.SCHEMASOS + + # test input guess when x.14 does not exist + ldms-csv-export-sos --data=x.HEADER.14 --guess + + # test input guess when y.KIND.14 does not exist but y.14 does + ldms-csv-export-sos --data=y.HEADER.14 \ + --guess --maxlines=4000 + + # test input guess and widen + ldms-csv-export-sos --data=y.HEADER.14 \ + --guess --widen --maxlines=4 + + # test assume + ldms-csv-export-sos --data=y.HEADER.14 --assume=u32 + +SEE ALSO +======================== + +sos-import-csv(1) diff --git a/rtd/docs/source/ldms_man/ldms-plugins.rst b/rtd/docs/source/ldms_man/ldms-plugins.rst new file mode 100644 index 000000000..ef9d57b0e --- /dev/null +++ b/rtd/docs/source/ldms_man/ldms-plugins.rst @@ -0,0 +1,63 @@ +============ +ldms-plugins +============ + +:Date: 28 Feb 2018 + +.. contents:: + :depth: 3 +.. + +NAME +============= + +ldms-plugins.sh - Display information on installed LDMSD plugins. + +SYNOPSIS +================= + +ldms-plugins.sh [OPTION] [NAME] + +DESCRIPTION +==================== + +The ldms-plugins.sh command is used to query ldmsd for information on +installed plugins. + +OPTIONS +================ + +If the NAME is specified, only information for that plugin is displayed. +The names all, store, and sampler are interpreted as described in +ldmsd(8). + +-b + | + | Produce brief output, omitting usages. + +-n + | + | Produce names only. + +EXAMPLES +================= + +ldms-plugins.sh -b + +ldms-plugins.sh vmstat + +ldms-plugins.sh -n sampler + +ldms-plugins.sh -n store + +NOTES +============== + +Error messages from attempting to load plugins may appear if +additionally needed libraries cannot be found. This is usually a bug in +the setting of LD_LIBRARY_PATH. + +SEE ALSO +================= + +ldmsd(8) diff --git a/rtd/docs/source/ldms_man/ldms-reverse-conf.rst b/rtd/docs/source/ldms_man/ldms-reverse-conf.rst new file mode 100644 index 000000000..5b6113d60 --- /dev/null +++ b/rtd/docs/source/ldms_man/ldms-reverse-conf.rst @@ -0,0 +1,38 @@ +================= +ldms-reverse-conf +================= + +:Date: 6 Jun 2022 + +.. contents:: + :depth: 3 +.. 
+
NAME
==================

ldms-reverse-conf.sh - generate a tear-down configuration file

SYNOPSIS
======================

ldms-reverse-conf.sh

DESCRIPTION
=========================

The ldms-reverse-conf.sh command parses an LDMS control script in the
key/value language which sets up samplers, stores, producers, updaters,
and subscriptions, and attempts to generate the matching tear-down
script to stdout. Invoking the ldmsd_controller or ldmsctl with the
teardown script should yield an almost idle daemon (listeners are still
active).

Typically, a daemon is configured and left to run. The intent of this
utility is to make it easy to deconfigure a running daemon in the proper
command order given the original scripted configuration.

SEE ALSO
======================

ldmsctl(8), ldmsd_controller(8)
diff --git a/rtd/docs/source/ldms_man/ldms-run-static-tests.rst b/rtd/docs/source/ldms_man/ldms-run-static-tests.rst
new file mode 100644
index 000000000..774b543f7
--- /dev/null
+++ b/rtd/docs/source/ldms_man/ldms-run-static-tests.rst
@@ -0,0 +1,133 @@
+=====================
ldms-run-static-tests
=====================

:Date: 21 Aug 2020

.. contents::
   :depth: 3
..

NAME
======================

run-static-tests.test - run ldms-static-test.sh for each enabled plugin

SYNOPSIS
==========================

run-static-tests.test -l

run-static-tests.test -h

run-static-tests.test [test_dir]

DESCRIPTION
=============================

The run-static-tests.test driver initiates the ldms-static-test.sh test
on each enabled plugin. The stdout/stderr of each ldms-static-test.sh
invocation will be redirected to a log file and its output tree. This
log file will then be tarred and compressed when ldms-static-test.sh
has finished. The return code of ldms-static-test.sh will then be
checked by this driver script. If the return value is 0, the script
will print "PASS $testname"; if the return value is 1, the script will
print "FAIL $testname", where testname is the name of each
ldms-static-test.sh invocation for the enabled plugins. Please see
ldms-static-test.man for more information.

OPTIONS
=========================

-l
   |
   | List the enabled plugins.

-h
   |
   | List help message.

LANGUAGE
==========================

The following macro language is provided as extensions on bash. Other
bash use is also possible, but not recommended.

ENVIRONMENT
=============================

Uses the currently set environment to run. The environment may need to
be configured before executing this test script.

input
   |
   | The name of the input file as specified when ldms-static-test.sh
     is invoked for each enabled plugin.

testname
   |
   | The base name (directories stripped) of the input file name. This
     variable makes it possible to use similar input across many test
     files when the name of the input file is the same as the plugin
     tested.

strict
   |
   | If the variable "strict" is used for KILL_LDMSD
     (ldms-static-test(8)), the script will output "FAIL $testname" and
     return an XFAIL to indicate an expected failure only if the test
     case plugin is listed in static-test-list. The stderr of
     ldms-static-test.sh will be redirected to the log file
     test.$testname.log under the default output location of test_dir.

file
   |
   | The file "static-test-list" located in ldms/scripts/ defines a
     list of samplers that are expected to fail.
If there is a failed test
     and the sampler is listed in this file, then run-static-tests.test
     will output an "XFAIL" and continue. Developers can modify this
     list to meet their needs.

bypass <1,0>
   |
   | This variable assignment is used to determine an expected failure
     (1) or normal failure (0) of a sampler plugin. This variable is
     set to (1) if the sampler is listed in $file and set to (0)
     otherwise. Used to test the successful and expected failures of
     each sampler plugin.

NOTES
=======================

Any other variable may be defined and exported for use in the
attribute/value expansion of values in plugin configuration.

FILES
=======================

*$input_file.$i*
   |
   | For each value of i specified to start an ldmsd, a configuration
     file named $input_file.$i must also exist. This configuration file
     is used when starting the daemon.

*test_dir*
   |
   | Test output directory of ldms-static-test.sh. The default output
     location is \`pwd`/ldmstest/$testname.

GENERATED FILES
=================================

*$test_dir/test.$testname.log*
   |
   | The log file containing stderr and stdout of ldms-static-test.sh.

*$test_dir/test.$testname.tgz*
   |
   | Location of the compressed log file.

SEE ALSO
==========================

ldms-static-test(8)
diff --git a/rtd/docs/source/ldms_man/ldms-static-test.rst b/rtd/docs/source/ldms_man/ldms-static-test.rst
new file mode 100644
index 000000000..486a91f55
--- /dev/null
+++ b/rtd/docs/source/ldms_man/ldms-static-test.rst
@@ -0,0 +1,348 @@
+================
ldms-static-test
================

:Date: 4 Oct 2020

.. contents::
   :depth: 3
..

NAME
=================

ldms-static-test.sh - Run a canned test scenario

SYNOPSIS
=====================

ldms-static-test.sh -l

ldms-static-test.sh -h

ldms-static-test.sh <input_file> [test_dir]

DESCRIPTION
========================

The ldms-static-test.sh command starts a canned test defined in the
input_file using a standard environment. The input file is written in a
simple bash macro language described in LANGUAGE below. Supporting
configuration file fragments will be used, as determined from the input
file. See FILES below. This tests ldmsd run with static configuration
files (as would normally happen as a system service) and shut down with
a signal.

OPTIONS
====================

-l
   |
   | List the canned tests available.

-h
   |
   | List help message.

LANGUAGE
=====================

The following macro language is provided as extensions on bash. Other
bash use is also possible, but not recommended.

DAEMONS <daemon-numbers>
   |
   | Give all the numbers that will be used in the LDMSD invocations
     anywhere in the test. This causes port variables to be defined so
     that any daemon can connect to any other by referencing $portN as
     explained in ENVIRONMENT below. If omitted, the ordering and
     aggregation relationships of LDMSD calls may be infeasible.

LDMSD [conf-options] <daemon-numbers>
   |
   | This starts a number of daemons described by daemon-numbers. The
     numbers can be a given list, such as "1 2 3". The environment of
     each daemon (and its config script) will contain the variable i
     set to one of the given values, as described in ENVIRONMENT. For
     each value of i, a configuration fragment $input_file.$i must also
     exist. Use seq(1) to generate large number sequences.

See CONFIGURATION OPTIONS below for the explanation of [conf-options].

MESSAGE [arguments]
   |
   | The expanded arguments are logged.
+
LDMS_LS <k> [ldms_ls_args]
   |
   | This invokes ldms_ls on the k-th ldmsd.

KILL_LDMSD [strict] <daemon-numbers>
   |
   | Kills the listed daemons. If the optional keyword strict is
     supplied, then missing daemons will cause the bypass variable to
     be set and the script to include an error code when it exits.

SLEEP <n>
   |
   | Sleeps n seconds and logs a message about it.

JOBDATA jobfile [daemon-numbers]
   |
   | Creates jobfile with data for the jobid plugin to parse. If daemon
     numbers are specified, creates a jobfile.$k for each value of k
     listed in daemon-numbers. Each file will have unique numeric
     values, sequentially increasing. This does not provide data in the
     slurm-plugin sampler binary format.

vgon
   |
   | Turns on use of valgrind for any ldmsd or ldms_ls subsequently
     started.

vgoff
   |
   | Turns off use of valgrind for any ldmsd or ldms_ls subsequently
     started.

file_created <filename>
   |
   | Verifies the existence and readability of filename.

rollover_created <filename>
   |
   | Verifies the existence and readability of rollover files matching
     the pattern filename.[0-9]\*.

bypass=<0,1>
   |
   | This variable assignment disables (1) or enables (0) all the
     macros described above. Typical use is to skip one or more
     operations while debugging a test script.

KILL_LDMSD_STRICT=<0,1>
   |
   | This variable allows the script author to control whether
     KILL_LDMSD is strict by default or not. If enabled (1), the script
     will exit with error code 1 following a failed KILL_LDMSD. If
     disabled (0), the script will suppress error codes from killing
     missing daemons. Typically used for debugging missing pid files
     and unexpectedly dead daemons. Supplying the keyword 'strict'
     before the numeric arguments to KILL_LDMSD also sets
     KILL_LDMSD_STRICT=1.

portbase=<K>
   |
   | The listening port numbers assigned to the daemons will be K+i,
     where i is as described for macro LDMSD. It is a good idea (to
     support automated testing) if portbase is set in <input_file> so
     that each test uses a unique range of ports. This enables tests to
     proceed in parallel.

CONFIGURATION OPTIONS
==================================

The LDMSD command supports the following options. Note that all -P
options are processed before all -p options in a single LDMSD call.

-p <prolog file>
   |
   | The prolog file is included before the usually expected input
     file. The location of prolog files is handled as are the test
     input files. See FILES below. Multiple -p options are allowed.

-P <looped-prolog-file,daemon-csl>
   |
   | The looped-prolog-file is included before the usually expected
     input file, once for each value in daemon-csl. Daemon-csl is a
     comma-separated list of daemon numbers, e.g. a complete argument
     example is "-P producer,3,4,5". The variable ${j} is substituted
     with a daemon number from the list for each inclusion.

The location of looped prolog files is handled as are the test input
files. See FILES below. Multiple -P options are allowed.

-c
   |
   | Where multiple daemon numbers are specified, the input generated
     for the first number is cloned to all subsequent daemons. See
     FILES. This allows a single file to serve many similar daemon
     instances in scale testing.

-s <wait_microseconds>
   |
   | After an ldmsd is started, wait wait_microseconds before checking
     for the daemon PID file to exist. The appropriate wait time is
     variable depending on the complexity of the configuration. If not
     specified, the default is 2 seconds wait time.
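Putting the macros and options together, a minimal hypothetical input
file (invented for illustration; each daemon still requires a matching
$input_file.$i fragment as described in FILES below) might read:

::

   portbase=10200
   DAEMONS 1 2
   LDMSD 1 2
   SLEEP 5
   LDMS_LS 1 -l
   MESSAGE done with queries
   KILL_LDMSD 1 2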
+
ENVIRONMENT
========================

The following variables can be set in the script to affect the launch
of ldmsd:

LDMSD_EXTRA
   |
   | If set, these arguments are appended to the ldmsd launch. Typical
     use is to specify "-m MEMSIZE" or other unusual arguments. The
     following flags are always determined for the user and must not be
     present in LDMSD_EXTRA: -x -c -l -v -r.

VG
   |
   | If valgrind is used (see vgon, vgoff), then $VG is the name of the
     debugging tool wrapped around the launch of ldmsd. The default is
     'valgrind'.

VGARGS
   |
   | If valgrind is used (see vgon, vgoff), then $VGARGS is appended to
     the default valgrind arguments.

VGTAG
   |
   | If valgrind is used (see vgon, vgoff), then $VGTAG is inserted in
     the valgrind output file name when defined. A good practice is for
     VGTAG to start with ".".

KILL_NO_TEARDOWN
   |
   | Set KILL_NO_TEARDOWN=1 to suppress attempting configuration
     cleanup during KILL_LDMSD. If set, the ldmsd internal cleanup()
     function will attempt partial cleanup, but possibly leave active
     data structures to be reported by valgrind.

The following variables are visible to the input file and the
configuration file:

i
   |
   | Daemon configuration files and commands can refer to ${i} where i
     is the integer daemon number supplied via LDMSD for the specific
     daemon using the script.

portN
   |
   | Daemon configuration files and commands can refer to ${portN}
     where N is any value of 'i' described above. portN is the data
     port number of the N-th daemon.

input
   |
   | The name of the input file as specified when invoking this
     command.

testname
   |
   | The base name (directories stripped) of the input file name. This
     variable makes it possible to use similar input across many test
     files when the name of the input file is the same as the plugin
     tested.

TESTDIR
   |
   | Root directory of the testing setup.

STOREDIR
   |
   | A directory that should be used for store output configuration.

LOGDIR
   |
   | A directory that should be used for log outputs.

LDMS_AUTH_FILE
   |
   | Secret file used for daemon communication.

XPRT
   |
   | The transport used. It may be specified in the environment to
     override the default 'sock', and it is exported to the executed
     daemon environment.

HOST
   |
   | The host name used for a specific interface. It may be specified
     in the environment to override the default 'localhost', and it is
     exported to the executed daemon environment.

NOTES
==================

Any other variable may be defined and exported for use in the
attribute/value expansion of values in plugin configuration.

EXIT CODES
=======================

Expected exit codes are 0 and 1. If the exit code is 0, the program
will proceed. If the exit code is 1, the script will stop and notify
the user.

FILES
==================

*$input_file.$i*
   |
   | For each value of i specified to start an ldmsd, a configuration
     file named $input_file.$i must also exist. This configuration file
     is used when starting the daemon.

Exception: For any single "LDMSD -c <daemon-numbers>", only
$input_file.$i for the first listed number is needed; the first file
will be used for all subsequent numbers and any matching files except
the first are ignored. Where prologs are also specified, the regular
prolog inclusion process is applied to the first file.

*[test_dir]*
   |
   | If test_dir is supplied, it is used as the test output directory.
     The default output location is \`pwd`/ldmstest/$testname.
+ +*$docdir/examples/static-test/$input_file* + | + | If input_file is not found in the current directory, it is checked + for in $docdir/examples/static-test/$input_file. + +GENERATED FILES +============================ + +*$test_dir/logs/vg.$k$VGTAG.%p* + | *$test_dir/logs/vgls.$k$VGTAG.%p* + | The valgrind log for the kth daemon with PID %p or the valgrind log + for ldms_ls of the kth daemon with PID %p, if valgrind is active. + +*$test_dir/logs/$k.txt* + | + | The log for the kth daemon. + +*$test_dir/logs/teardown.$k.txt* + | + | The teardown log for the kth daemon. + +*$test_dir/run/conf.$k* + | + | The input for the kth daemon. + +*$test_dir/run/revconf.$k* + | + | The input for the kth daemon teardown. + +*$test_dir/run/env.$k* + | + | The environment present for the kth daemon. + +*$test_dir/run/start.$k* + | + | The start command of the kth daemon. + +*$test_dir/store/* + | + | The root of store output locations. + +*$test_dir/run/ldmsd/secret* + | + | The secret file for authentication. + +SEE ALSO +===================== + +seq(1) diff --git a/rtd/docs/source/ldms_man/ldms_auth_munge.rst b/rtd/docs/source/ldms_man/ldms_auth_munge.rst new file mode 100644 index 000000000..2c6cd9714 --- /dev/null +++ b/rtd/docs/source/ldms_man/ldms_auth_munge.rst @@ -0,0 +1,33 @@ +=============== +ldms_auth_munge +=============== + +:Date: 10 May 2018 + +.. contents:: + :depth: 3 +.. + +NAME +================ + +ldms_auth_munge - LDMS authentication using munge + +SYNOPSIS +==================== + +*ldms_app* **-a munge [-A socket=**\ *PATH*\ **]** + +DESCRIPTION +======================= + +**ldms_auth_munge** relies on **munge** service (see **munge**\ (7)) to +authenticate users. Munge daemon (**munged**) must be up and running. +The optional **socket** option can be used to specify the path to munged +unix domain socket in the case that munged wasn't using the default +path. + +SEE ALSO +==================== + +**munge**\ (7), **munged**\ (8) diff --git a/rtd/docs/source/ldms_man/ldms_auth_naive.rst b/rtd/docs/source/ldms_man/ldms_auth_naive.rst new file mode 100644 index 000000000..2fc6288f1 --- /dev/null +++ b/rtd/docs/source/ldms_man/ldms_auth_naive.rst @@ -0,0 +1,29 @@ +=============== +ldms_auth_naive +=============== + +:Date: 28 Feb 2018 + +.. contents:: + :depth: 3 +.. + +NAME +================ + +ldms_auth_naive - naive LDMS authentication implementation FOR TESTING + +SYNOPSIS +==================== + +*ldms_app* **-a naive** **[-A uid=**\ *UID*\ **]** **[-A +gid=**\ *GID*\ **]** + +DESCRIPTION +======================= + +**ldms_auth_naive** LDMS authentication plugin naively believes the +peer's credential declaration. The purpose of this plugin is purely for +testing the permission control of various objects in **ldmsd**. The +**uid** and **gid** options are used to specify the user credential. If +**uid** and/or **gid** are not specified, the default is -1. diff --git a/rtd/docs/source/ldms_man/ldms_auth_none.rst b/rtd/docs/source/ldms_man/ldms_auth_none.rst new file mode 100644 index 000000000..f1da1e2ee --- /dev/null +++ b/rtd/docs/source/ldms_man/ldms_auth_none.rst @@ -0,0 +1,29 @@ +============== +ldms_auth_none +============== + +:Date: 28 Feb 2018 + +.. contents:: + :depth: 3 +.. + +NAME +=============== + +ldms_auth_none - LDMS authentication disabled + +SYNOPSIS +=================== + +*ldms_app* **-a none [Default]** + +DESCRIPTION +====================== + +**ldms_auth_none** enables running without authentication of query +sources. 
Since "-a none" is the default it need not be specified (e.g., +running "ldmsd -x sock:1024 -a none" is equivalent to simply running +"ldmsd -x sock:1024"). Using this authentication type there will be NO +checks on identities associated with data and/or meta-data information +accesses. diff --git a/rtd/docs/source/ldms_man/ldms_auth_ovis.rst b/rtd/docs/source/ldms_man/ldms_auth_ovis.rst new file mode 100644 index 000000000..4d622eb2c --- /dev/null +++ b/rtd/docs/source/ldms_man/ldms_auth_ovis.rst @@ -0,0 +1,67 @@ +============== +ldms_auth_ovis +============== + +:Date: 28 Feb 2018 + +.. contents:: + :depth: 3 +.. + +NAME +=============== + +ldms_auth_ovis - LDMS authentication using ovis_auth library + +SYNOPSIS +=================== + +*ldms_app* **-a ovis [-A conf=**\ *PATH*\ **]** + +DESCRIPTION +====================== + +**ldms_auth_ovis** uses shared secret to authenticate the connection. +The secret is a text file containing the line: + + secretword=X + +where X is a string at least 8 characters long. Lines starting with # in +the file are ignored. + +Four locations are checked in order for the secret: + +1) the full file path given on the command line via "-A conf=authfile", + +2) the full file path given in environment variable LDMS_AUTH_FILE, + +3) $HOME/.ldmsauth.conf, and + +4) $SYSCONFDIR/ldmsauth.conf (e.g. /etc/ldmsauth.conf). + +where $HOME is taken from */etc/password* and $SYSCONFDIR is determined +at ldms compile time. If one of these is not set, the search continues +with the next location. A failure in reading one, if the file exists, +ends the search and is a failure to authenticate. + +The secret file permissions must be set to 600 or more restrictive. + +ENVIRONMENT +====================== + +"LDMS_AUTH_FILE" is a full file path for a secretword file. It is not +necessary, if the file is in one of the other checked locations. + +NOTES +================ + +Authentication can be disabled at ldms build time by configuring your +ldms build with --disable-ovis_auth. Then no secretword file is required +or checked. + +BUGS +=============== + +Networked file system users should verify the privacy of their secret +files, as various access control list schemes might be more permissive +than the standard permissions bits. diff --git a/rtd/docs/source/ldms_man/ldms_authentication.rst b/rtd/docs/source/ldms_man/ldms_authentication.rst new file mode 100644 index 000000000..f8b73736b --- /dev/null +++ b/rtd/docs/source/ldms_man/ldms_authentication.rst @@ -0,0 +1,71 @@ +=================== +ldms_authentication +=================== + +:Date: 28 Feb 2018 + +.. contents:: + :depth: 3 +.. + +NAME +==================== + +ldms_authentication - Authentication in LDMS transports + +DESCRIPTION +=========================== + +LDMS applications use authentication plugins in LDMS transports to +authenticate the peers. In other words, not only **ldmsd** authenticates +the client connections, the clients (**ldms_ls**, **ldmsctl**, +**ldmsd_controller**, and other **ldmsd**) authenticate the **ldmsd** +too. + +**ldmsd**, **ldms_ls**, **ldmsd_controller**, and **ldmsctl** use the +following options for authentication purpose: + +**-a** *AUTH_PLUGIN* + Specifying the name of the authentication plugin. The default is + "none" (no authentication). + +**-A** *NAME*\ **=**\ *VALUE* + Specifying options to the authentication plugin. This option can be + given multiple times. + +**auth** configuration object has been introduced in **ldmsd** version +4.3.4. 
It describes an authentication domain in the configuration file
with the **auth_add** command. The **listen** and **prdcr_add** config
commands can refer to an **auth** object created by the **auth_add**
command to specify the authentication domain a listening port or a
producer connection belongs to. If no **auth** option is specified, the
**listen** and **prdcr_add** commands fall back to the authentication
method specified by the **-a, -A** CLI options (which default to
**none**).

Please consult the manual of the plugin for more details.

LIST OF LDMS_AUTH PLUGINS
=========================================

**none**
   Authentication will NOT be used (allow all connections) (see
   **ldms_auth_none**\ (7)).

**ovis**
   The shared secret authentication using ovis_ldms (see
   **ldms_auth_ovis**\ (7)).

**naive**
   The naive authentication for testing. (see
   **ldms_auth_naive**\ (7)).

**munge**
   User credential authentication using Munge. (see
   **ldms_auth_munge**\ (7)).

SEE ALSO
========================

**ldms_auth_none**\ (7), **ldms_auth_ovis**\ (7),
**ldms_auth_naive**\ (7), **ldms_auth_munge**\ (7), **ldmsctl**\ (8),
**ldmsd**\ (8), **ldms_ls**\ (8), **ldmsd_controller**\ (8),
**ldms_quickstart**\ (7), **ldms_build_install**\ (7)
diff --git a/rtd/docs/source/ldms_man/ldms_build_install.rst b/rtd/docs/source/ldms_man/ldms_build_install.rst
new file mode 100644
index 000000000..e212f2ca7
--- /dev/null
+++ b/rtd/docs/source/ldms_man/ldms_build_install.rst
@@ -0,0 +1,315 @@
+==================
ldms_build_install
==================

:Date: 22 Dec 2016

.. contents::
   :depth: 3
..

NAME
===================

ldms_build_install - Instructions for building and installing ldms

INTRODUCTION
===========================

OVIS is a modular system for HPC data collection, transport, storage,
analysis, visualization, and response. The Lightweight Distributed
Metric Service (LDMS) is the OVIS data collection and transport system.
LDMS provides capabilities for lightweight run-time collection of
high-fidelity data. Data can be accessed on-node or transported off
node. Additionally, LDMS can store data in a variety of storage options.

This entire source encompasses a number of the modular components of
OVIS. The top level subdirectory ldms contains the ldms source. This
document covers building only the ldms component from the top level
directory.

DESCRIPTION
==========================

This document covers building only the ldms component from the top level
directory.

ldms is built via the following steps:

   ::

      build prerequisites
      cd top_level_directory
      mkdir build
      cd build
      ../configure [configure-options]
      make
      make install


   This document describes the steps involved in building the prerequisites and in doing the configure.
   A description of the arguments for configure can be found by invoking

   ./configure --help

   at BOTH the top level and in the ldms subdirectory.

PREREQUISITES
============================

- libevent-2.0 is a requirement. It can be built from source obtained
  from libevent.org or it can be installed from rpm or similar on your
  system via a utility like yum. If you do the latter, then you need to
  install both the libevent and libevent-devel packages.

- If you intend to use the aries_mmr sampler, then you will need to
  install Cray's gpcd library. More information on this can be found in
  the Plugin.aries_mmr man page. (This is the recommended method for
  getting HSN metrics for the Aries.)
+
- If you intend to use the hsn metrics in the cray_aries_r_sampler or
  the cray_gemini_r_sampler, you will need to configure gpcdr. More
  information on this can be found in the Plugin.cray_sampler_variants
  man page. (This is the recommended method for the Gemini.)

- Use the gnu compiler for building ldms. (This may necessitate a
  module change on some platforms.)

The remaining instructions will include paths to where the headers and
libraries of these prerequisites are installed.

CONFIGURATION OPTIONS
====================================

There are configuration options at the top level, in ldms, and in the
ovis_ldms support directories. This section is thus split into these
three sections; however, the configuration arguments are all combined
as arguments to the top level configure. The list of configuration
options given here is not comprehensive; rather, it refers to the most
common arguments.

TOP LEVEL OPTIONS
-----------------

A number of top level "enable|disable-feature" options exist. The
defaults are chosen for a generic linux build to work by default.

**--enable|disable-rpath**
   |
   | Disable this. Do not hardcode runtime library paths.

**--enable|disable-ldms**
   |
   | Enable this. Default enabled.

**--enable|disable-sos**
   |
   | Used to enable or disable sos. Enable only if you are going to use
     the store_sos plugin. Default disabled.

**--enable|disable-ocm|baler|me|komondor**
   |
   | Disable all of these. All default disabled.

OVIS_LIB LEVEL OPTIONS
----------------------

A number of top level "enable|disable-feature" options exist. The
defaults are chosen for a generic linux build to work by default.

**--enable|disable-auth**
   |
   | Enables or disables authentication. Default enabled.

**--enable|disable-sock**
   |
   | Enables or disables the sock transport. Default enabled.

**--enable|disable-rdma**
   |
   | Enables or disables the rdma transport. Default disabled.

**--enable|disable-ugni**
   |
   | Enables or disables the ugni transport. This is cray-specific for
     rdma over gemini or aries. Default disabled.

LDMS LEVEL OPTIONS
------------------

A number of "enable|disable-feature" options exist. In addition, a
number of "with" options exist to specify paths to
files/libraries/etc. The defaults are chosen for a generic linux build
to work by default.

General Options
---------------

**--enable|disable-ovis_auth**
   |
   | If --enable, then disable/enable authentication. Default enabled.

**--enable|disable-python**
   |
   | Enable the ldms python api and the configuration tools that depend
     on the API. Default: enabled if python and cython detected.

**--enable|disable-readline**
   |
   | Enable or disable the readline module. It is necessary to enable
     if you want to use the configuration tools interactively; if you
     are going to use a script interface to the configuration tools
     (the usual method), then this can be disabled.

**--with-libevent**\ *[=path]*
   |
   | Specify libevent path [default=/usr]

Generic Sampler Options
-----------------------

**--enable|disable-meminfo|procinterrupts|procnfs|procnetdev|vmstat**
   |
   | Enable or disable generic linux samplers for data in /proc.
     Default enabled.

**--enable|disable-lustre**
   |
   | Enable or disable the lustre module. Default enabled.

Cray-specific Sampler Options
-----------------------------

**--enable|disable-kgnilnd**
   |
   | Enable the kgnilnd sampler. Default disabled.
+
+**--enable|disable-cray_system_sampler**
+   |
+   | Enable or disable the cray_system_sampler module. Default disabled.
+     If you enable this, then consider the following options:
+
+   **--enable-gemini-gpcdr**
+      |
+      | Enable the gemini-gpcdr version of the cray_system_sampler.
+        Default disabled. Both the gemini and aries versions can be
+        built simultaneously.
+
+   **--enable-aries-gpcdr**
+      |
+      | Enable the aries-gpcdr version of the cray_system_sampler.
+        Default disabled. For the Aries, we recommend getting the HSN
+        metrics via aries-mmr, instead of the aries-gpcdr sampler. Still
+        build the aries-gpcdr sampler, but run it without the HSN part
+        of the metric collection. Both the gemini and aries versions can
+        be built simultaneously.
+
+   **--enable-cray-nvidia**\ OR\ **--with-cray-nvidia-inc**\ [=path]
+      |
+      | For Gemini systems with GPUs, enable the cray-nvidia metric
+        sampling in the cray_gemini_r_sampler. You need not specify
+        --enable-cray-nvidia if you are instead specifying the path to
+        the include file via --with-cray-nvidia-inc.
+
+   **--enable|disable-lustre**
+      |
+      | Enable or disable the lustre module for use in the
+        cray_system_sampler. Default enabled.
+
+   **--with-rca**\ *[=path]*
+      |
+      | Specify the path to the rca includes via --with-rca
+        [default=/usr].
+
+   **--with-krca**\ *[=path]*
+      |
+      | Specify the path to the krca includes via --with-krca
+        [default=/usr].
+
+   **--with-cray-hss-devel**\ *[=path]*
+      |
+      | Specify the path to the hss-devel includes via
+        --with-cray-hss-devel [default=/usr].
+
+**--enable|disable-aries-mmr**
+   |
+   | Enable or disable the aries-mmr module. Default disabled. If you
+     enable this, then consider the following option:
+
+   **--with-aries-libgpcd**\ *LIBDIR,INCDIR*
+      |
+      | Locations of gpcd library and headers for aries_mmr sampler.
+        E.g. --with-aries-libgpcd=/special/libs,/private/headerdir
+
+Store Options
+-------------
+
+**--enable|disable-csv**
+   |
+   | Enable the csv stores (store_csv and store_function_csv). Default
+     enabled.
+
+**--enable|disable-sos**
+   |
+   | Enable or disable the sos stores. Enable this only if you are going
+     to use the store_sos plugin. Default disabled.
+
+INSTALL DIRECTORY SETUP
+======================================
+
+The build will go into prefix (/XXX/Build/build_ovis in the examples
+section below).
+
+- bin - python-based utility commands, such as ldmsd_controller. Also
+  test scripts.
+
+- include - subdirectories with header files
+
+- lib - libraries. At the top level are libraries for the ldms
+  infrastructure (e.g., libldms.so, libzap.so, etc). There is a
+  subdirectory, which will be called either ovis-ldms or ovis-lib, which
+  contains all the libraries for the plugins (samplers, such as
+  libmeminfo.so; stores, such as libstore_csv.so; and transports, such
+  as libzap_sock.so).
+
+- lib64 - python library
+
+- sbin - C-based utility commands, such as ldms_ls and ldmsd.
+
+- share - documentation, including man pages.
+
+NOTES
+====================
+
+This document does not cover putting the install into a cray-system
+image. Nor does it cover setting up init scripts to run ldms as a system
+service (for any type of linux platform).
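+
+For a generic Linux cluster build (no Cray-specific samplers), a
+minimal invocation might look like the following sketch; the prefix and
+libevent path are placeholders, and the option set is only one
+reasonable combination of the flags described above:
+
+::
+
+   PREFIX=/tmp/opt/ovis
+   mkdir build
+   cd build
+   ../configure --prefix=$PREFIX --disable-rpath --enable-ldms \
+       --disable-sos --disable-ocm --disable-baler --disable-me \
+       --disable-komondor --with-libevent=/usr
+   make
+   make install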
+ +EXAMPLES +======================= + +configure.sh script for a Cray XC install with the cray-specific +samplers only: + +:: + + PREFIX=/XXX/Build/build_ovis + LIBDIR=${PREFIX}/lib + + # add --enable-FEATURE here + ENABLE="--enable-ugni --enable-ldms-python --enable-kgnilnd --enable-lustre --enable-aries_mmr --enable-cray_system_sampler --enable-aries-gpcdr" + + # add --disable-FEATURE here + DISABLE="--disable-rpath --disable-readline --disable-mmap --disable-baler --disable-sos" + + # libevent2 prefix + LIBEVENT_PREFIX=/XXX/Build/libevent-2.0_build + + WITH="--with-rca=/opt/cray/rca/default/ --with-krca=/opt/cray/krca/default --with-cray-hss-devel=/opt/cray-hss-devel/default/ --with-pkglibdir=ovis-ldms --with-aries-libgpcd=/XXX/Build/gpcd/lib/,/XXX/Build/gpcd/include/" + + + if [ -n "$LIBEVENT_PREFIX" ]; then + WITH="$WITH --with-libevent=$LIBEVENT_PREFIX" + fi + + CFLAGS='-g -O0' + +SEE ALSO +======================= + +ldms_authentication(8), ldms_quickstart(7), ldmsd(8), +Plugin_cray_sampler_variants(7), Plugin_aries_mmr(7), +Plugin_store_csv(7), Plugin_store_function_csv(7) diff --git a/rtd/docs/source/ldms_man/ldms_ls.rst b/rtd/docs/source/ldms_man/ldms_ls.rst new file mode 100644 index 000000000..39dfc5024 --- /dev/null +++ b/rtd/docs/source/ldms_man/ldms_ls.rst @@ -0,0 +1,224 @@ +======= +ldms_ls +======= + +:Date: 28 Feb 2018 + +.. contents:: + :depth: 3 +.. + +NAME +======== + +ldms_ls - Query an ldmsd for metric set values + +SYNOPSIS +============ + +ldms_ls [OPTION...] [NAME] + +DESCRIPTION +=============== + +The ldms_ls command is used to query an ldmsd (ldms daemon) for metric +set values. + +ENVIRONMENT +=============== + +The following environment variables must be set: +------------------------------------------------ + +LD_LIBRARY_PATH + include the path to ovis/lib and libevent2. On some system, lib64 + rather than lib is required. + +PATH + include the path to ovis/sbin + +The following environment variables may be set to override compiled defaults: +----------------------------------------------------------------------------- + +ZAP_LIBPATH + path to ovis/lib/ovis-ldms + +LDMSD_PLUGIN_LIBPATH + path to ovis/lib/ovis-ldms + +The following environment variables are optional: +------------------------------------------------- + +LDMS_LS_MEM_SZ + The size of memory reserved for metric sets. See the -m option. + +OPTIONS +=========== + +If the NAME is specified on the command line without -E/-S/-I, only information for that instance = NAME is displayed. + +**-E** *NAME* + | + | Indicates that the NAME is a regular expression. + +**-S** *NAME* + | + | Indicates that the NAME is a schema name. + +**-I** *NAME* + | + | Indicates that the NAME is an instance name. This is the default. + +**-h** *HOST* + | + | HOST to query. Default is localhost. + +**-x** *TRANSPORT* + TRANSPORT to use for the query. values are sock, rdma, or ugni (Cray + XE/XK/XC). Default is sock. + +**-p** *PORT* + PORT of the HOST to use for the query. Default is LDMS_DEFAULT_PORT. + +**-l** + Display long listing. Outputs details of the metric set, including + timestamp, metric names, metric types, and values. + +**-a** *AUTH* + The name of the LDMS Authentication plugin. Please see + **ldms_authentication**\ (7) for more details. (default: "none"). + +**-A** *NAME*\ **=**\ *VALUE* + The name-value options for the LDMS Authentication plugin. This + option can be given multiple times. Please see + **ldms_authentication**\ (7) for more information and consult the + plugin manual for the option details. 
+ +**-m** *MEMORY_SIZE* + | + | MEMORY_SIZE is the size of memory reserved for metric sets. This + value has precedence over the value of the LDMS_LS_MEM_SZ + environment variable. The given size must be less than 1 petabytes. + For example, 20M or 20mb are 20 megabytes. Unless a specific set is + being queried, this should usually match the size of pre-allocated + memory specified when starting the remote ldmsd being queried. + +**-u** + Display the user data for the metrics. (Usually compid) + +**-v** + Display metadata information. Specifying this option multiple times + increases the verbosity. + +**-V** + Display LDMS version information and then exit. + +**-w** *WAIT_SEC* + WAIT_SEC is the time to wait before giving up on the server. Default + is 10 sec. + +DEFAULTS +============ + +**ldms_ls** with no arguments defaults to **ldms_ls -p** *XXX* **-h** +*localhost* **-x** *sock* where XXX is the LDMS_DEFAULT_PORT. + +NOTES +========= + +None. + +BUGS +======== + +No known bugs. + +EXAMPLES +============ + +:: + + 1) $ldms_ls -h vm1 -x sock -p 60000 + vm1_1/meminfo + vm1_1/vmstat + + + + 2) $ldms_ls -h vm1 -x sock -p 60000 -l + vm1_1/meminfo: consistent, last update: Thu Oct 29 08:04:44 2015 [202552us] + D u64 MemTotal 132165188 + D u64 MemFree 129767048 + D u64 Buffers 0 + D u64 Cached 46780 + D u64 SwapCached 0 + D u64 Active 16116 + D u64 Inactive 8596 + D u64 Active(anon) 10440 + D u64 Inactive(anon) 220 + D u64 Active(file) 5676 + D u64 Inactive(file) 8376 + D u64 Unevictable 35400 + D u64 Mlocked 6032 + + + + + The output format of the data is as follows: + M/D + indicates metadata vs data values + Metrictype + in the example above, unsigned int 64. + Value + Value of the metric + + 3) For a non-existent set: + $ldms_ls -h vm1 -x sock -p 60000 -l vm1_1/foo + ldms_ls: No such file or directory + ldms_ls: lookup failed for set 'vm1_1/foo' + + 4) Display metadata: + ldms_ls -h vm1 -x sock -p 60000 -v + vm1_1/meminfo: consistent, last update: Fri Dec 16 17:12:08 2016 [5091us] + METADATA -------- + Producer Name : vm1_1 + Instance Name : vm1_1/meminfo + Schema Name : meminfo + Size : 1816 + Metric Count : 43 + GN : 2 + DATA ------------ + Timestamp : Fri Dec 16 17:12:08 2016 [5091us] + Duration : [0.000072s] + Consistent : TRUE + Size : 384 + GN : 985 + ----------------- + + 5) Regular Expression: + $ldms_ls -h vm1 -x sock -p 60000 -E vm1 + vm1_1/meminfo + vm1_1/vmstat + + $ldms_ls -h vm1 -x sock -p 60000 -E vms + vm1_1/vmstat + + $ldms_ls -h vm1 -x sock -p 60000 -E -I memin + vm1_1/meminfo + + $ldms_ls -h vm1 -x sock -p 60000 -E -S ^vmstat$ + vm1_1/vmstat + + $ldms_ls -h vm1 -x sock -p 60000 -E -S cpu + ldms_ls: No metric sets matched the given criteria + +If the -E option is not given, the given string will be taken literally, +i.e., it is equivalent to giving -E ^foo$. + +The regular expression option can be used with the -v and -l options. In +this case ldms_ls will display only the information of the metric sets +that matched the given regular expression. + +SEE ALSO +============ + +ldms_authentication(7), ldmsd(8), ldms_quickstart(7), diff --git a/rtd/docs/source/ldms_man/ldms_quickstart.rst b/rtd/docs/source/ldms_man/ldms_quickstart.rst new file mode 100644 index 000000000..0cff2344d --- /dev/null +++ b/rtd/docs/source/ldms_man/ldms_quickstart.rst @@ -0,0 +1,836 @@ +=============== +ldms_quickstart +=============== + +:Date: 12 Dec 2016 + +.. contents:: + :depth: 3 +.. 
+ +NAME +================ + +LDMS_QuickStart - man page for Quick Start of LDMS + +INTRODUCTION +======================== + +LDMS is the Lightweight Distributed Metric Service. LDMS is a +distributed data collection, transport, and storage tool that supports a +wide variety of configuration options. There are three main functional +components described below. + +*Samplers* run one or more plugins that periodically sample data of +interest. Each plugin defines a group of metrics called a metric set. +The sampling frequency is user defined and can be dynamically changed. A +host can simultaneously run multiple plugins. Configuration flags +determine whether the sampling plugins run synchronously or +asynchonously (both on a host and across hosts). Memory allocated for a +particular metric set is overwritten by each successive sampling. The +host daemon does not retain sample history; plugins do not typically +retain history, but can be written to do so. + +*Aggregators* collect data in a pull fashion from samplers and/or other +aggregators. The collection frequency is user defined and operates +independently of other collection operations and sampling operations. +Distinct metric sets can be collected at different frequencies. Once +started, the aggregation schedule cannot be altered without restarting +the aggregator. Fan-in refers to the number of hosts collected from by a +single aggregator. Maximum fan-in varies by transport but is roughly +9,000:1 for the socket transport and for the RDMA transport over +Infiniband. It is > 15000:1 for RDMA over the Cray Gemini transport. +Daisy chaining is not limited to two levels; multiple aggregators may +aggregate from the same sampler or aggregator ldmsd. Fan-in at higher +levels is limited by the aggregator host capabilities (CPU, memory, +network bandwidth, and storage bandwidth). + +*Storage* plugins write in a variety of formats. Comma Separated Value +(CSV) file storage of metric sets plugins are provided. Storage occurs +when a valid updated metric set data is collected by an aggregator that +has been configured to write that data to storage. Collection of a +metric set whose data has not been updated or is incomplete does not +result in a write to storage in any format. + +The host daemon is the same base code in all cases; differentiation is +based on configuration of plugins for sampling or storage and on +configuring aggregation of data from other host daemons. + +DESCRIPTION +======================= + +Quick Start instructions for LDMS (Lightweight Distributed Metric +Service). + +This man page describes how to configure and run LDMS daemons (ldmsd) to +perform the following tasks: + +- collect data + +- aggregate data from multiple ldmsds + +- store collected data to files. + +There are three basic configurations that will be addressed: + +- configuring an ldmsd with collector plugins + +- configuring a ldmsd to aggregate information from other ldmsds + +- configuring a store_csv storage plugin on an ldmsd. + +The order in which these configurations should be performed does not +matter with respect to collectors and aggregators. + +While a complete listing of flags and parameters can be seen by running +ldmsd and the configuration tools with the --help directive, this +document describes the flags and parameters required for running a basic +setup. + +There are no run scripts provided in the current release; the examples +here can be used in the creation of such. 
+ +Arrangement of this document +======================================== + +This document is arranged as follows: + + 1) Prerequisites + + 2) Build and install + + 3) Configuring and Starting an ldmsd (general) + + 4) through 8) Example ldmsd configurations and queries + + 9) Protection Domain Tags (Cray Only) + + 10) Troubleshooting + + 11) About the man pages + +1) PREREQUISITES +============================ + +- All sections below assume the build directory is /tmp/opt/ovis. + +- libevent-2.0 is a requirement. + +- Python 2.7 or Python 2.6 with the argparse module is required for + ldmsd_controller + +2) BUILD/INSTALL: +============================= + +There is a separate document with build/install instructions. + +The default ldms build in v3 has authentication turned on. This document +does not include use of the authentication flags; the instructions here +are as if you had built with --disable_ovis_auth. For more information +on authentication, see the ldms_authentication man page. + +3) CONFIGURING AND STARTING AN LDMSD +================================================ + +3-1) Environment Variables for LDMS +----------------------------------- + +You will need to set the following environment variables when running +LDMS daemons. This assumes that ldms has been installed in to +/tmp/opt/ovis. + +:: + + export LD_LIBRARY_PATH=/tmp/opt/ovis/lib/:/tmp/opt/ovis/lib/ovis-ldms/:/lib:$LD_LIBRARY_PATH + export ZAP_LIBPATH=/tmp/opt/ovis/lib/ovis-ldms + export LDMSD_PLUGIN_LIBPATH=/tmp/opt/ovis/lib/ovis-ldms + export PATH=/tmp/opt/ovis/sbin/:/tmp/opt/ovis/bin:$PATH + export LDMSD_SOCKPATH=/tmp/run/ldmsd + +LDMSD_SOCKPATH determines the location for the unix domain socket +(described in the ldmsd args below). The default is /var/run/ldmsd. Make +sure you use a location that is writeable if you are running as +non-root. + +3-2) Options for Configuring Plugins of an ldmsd +------------------------------------------------ + +Plugins for an ldmsd can be configured via a configuration file +specified as an argument to the "-c" flag. Also, ldmsd_controller is a +configuration tool that can work in interactive mode and can also can be +directed commands/scripts to a socket. The plugin configuration commands +are the same in all cases. + +In the instructions below, we briefly illustrate use of the +configuration script to ldmsd vs ldmsd_controller. Some environmental +variables have been supressed in this section for clarity. In all +subsequent examples (Sections 4+), we provide versbose detail for the +ldmsd configuration script method only. Altering this to use the other +methods should then be obvious. + +3-2a) Configuring an ldmsd via a configuration script +----------------------------------------------------- + +This is the most usual mode of configuring ldms in production scenarios +and can also be used for test scenarios. + +Example commands for configuring a sampler: + +:: + + > more config.file + + load name=meminfo + config name=meminfo producer=vm1_1 instance=vm1_1/meminfo + start name=meminfo interval=1000000 + +The path to the configuration script is then provided to the ldmsd via +the "-c" flag when it is started: + +Example ldmsd start command with a configuration script: + +:: + + ldmsd -x sock:60000 -S tmp/ldmsd/sock1 -l /tmp/log/logfile -v DEBUG -c ./config.file + +3-2b) Configuring ldmsd via ldmsd_controller +-------------------------------------------- + +You can use ldmsd_controller to connect to the ldmsd at any time to +issue plugin commands. 
This is most often used for dynamically issuing
+commands to a running ldmsd.
+
+Example ldmsd start command without a configuration script:
+
+::
+
+   ldmsd -x sock:60000 -S tmp/ldmsd/sock1 -l /tmp/log/logfile -v DEBUG
+
+Call the ldmsd_controller interactively and enter the same commands as
+you would in the configuration script.
+
+::
+
+   ldmsd_controller --host vm1_1 --port=60000
+   ldmsd_controller> load name=meminfo
+   ldmsd_controller> config name=meminfo producer=vm1_1 instance=vm1_1/meminfo
+   ldmsd_controller> start name=meminfo interval=1000000
+   ldmsd_controller> quit
+
+Relatedly, you can run ldmsd_controller with the commands in script
+form. For example:
+
+::
+
+   > more config.sh
+
+   #!/bin/bash
+   echo "load name=meminfo"
+   echo "config name=meminfo producer=vm1_1 instance=vm1_1/meminfo"
+   echo "start name=meminfo interval=1000000"
+
+Call the ldmsd_controller with the script:
+
+::
+
+   ldmsd_controller --host vm1_1 --port=60000 --script ./config.sh
+
+ldmsd_controller may be executed multiple times to issue different
+commands to the same ldmsd.
+
+3-3) Starting an ldmsd
+----------------------
+
+3-3a) Set environment variables, as described above.
+
+3-3b) Run ldmsd:
+
+::
+
+   <path to ldmsd>/ldmsd -x <transport>:<listen port> -S <unix domain socket path> -l <log file> -v <LOG_LEVEL> -c config.file
+
+Notes:
+
+- Transport is one of: sock, rdma, ugni (ugni is Cray-specific for
+  using RDMA over the Gemini/Aries network)
+
+- The configuration file contains the commands to configure the
+  plugins.
+
+- The unix domain socket can be used to communicate configuration
+  information to an ldmsd. The default path for this is
+  /var/run/ldmsd/. To change this the environment variable
+  LDMSD_SOCKPATH must be set to the desired path (e.g. export
+  LDMSD_SOCKPATH=/tmp/run/ldmsd)
+
+- No log can be obtained by using LOG_LEVEL QUIET, or specifying
+  /dev/null for the log file, or using command line redirection.
+
+- The default is to run as a background process, but the -F flag can be
+  specified for foreground.
+
+- A script can be made to start ldmsd and collectors on a host where
+  that script contains the information to execute the command.
+
+3-3c) Examples for launching ldmsd:
+
+- Start an ldmsd on the socket transport with a log file and a
+  configuration file.
+
+::
+
+   /tmp/opt/ovis/sbin/ldmsd -x sock:60000 -S /var/run/ldmsd/metric_socket -l /tmp/opt/ovis/logs/1 -c config.file
+
+   Same but with log level QUIET
+   /tmp/opt/ovis/sbin/ldmsd -x sock:60000 -S /var/run/ldmsd/metric_socket -l /tmp/opt/ovis/logs/1 -c config.file -v QUIET
+
+- Start 2 instances of ldmsd on host vm1
+
+::
+
+   Note: Make sure to use different socket names and listen on different ports if you are on the same host.
+   /tmp/opt/ovis/sbin/ldmsd -x sock:60000 -S /var/run/ldmsd/metric_socket_vm1_1 -l /tmp/opt/ovis/logs/vm_1 -c config.file
+   /tmp/opt/ovis/sbin/ldmsd -x sock:60001 -S /var/run/ldmsd/metric_socket_vm1_2 -l /tmp/opt/ovis/logs/vm_2 -c config.file
+
+4) EXAMPLE: CONFIGURE AN LDMSD WITH SAMPLER PLUGINS
+===============================================================
+
+4-1) Create the configuration file for the sampler plugins:
+-----------------------------------------------------------
+
+Configure a "meminfo" collector plugin to collect every second.
+
+::
+
+   load name=meminfo
+   config name=meminfo producer=vm1_1 instance=vm1_1/meminfo
+   start name=meminfo interval=1000000
+
+
+   Notes:
+   For synchronous operation include "offset=<#usec>" in start line (e.g. start name=meminfo interval=xxx offset=yyy).
+   This will cause the sampler to target collection times of interval + yyy, aligned to the second and microsecond
+   (e.g. every 5 seconds with an offset of 0 usec would ideally result in collections at 00:00:00, 00:00:05, 00:00:10, etc.,
+   whereas with an offset of 100,000 usec it would be 00:00:00.1, 00:00:05.1, 00:00:10.1, etc).
+   Different plugins may have additional configuration parameters.
+
+4-2) Set environment variables, as described above.
+---------------------------------------------------
+
+4-3) Start the ldmsd with the config file, as described above. e.g.,
+--------------------------------------------------------------------
+
+   ldmsd -x sock:60000 -S tmp/ldmsd/sock1 -l /tmp/log/logfile -v DEBUG
+   -c ./config.file
+
+4-4) Verifying the collector
+----------------------------
+
+At this point the ldmsd collector should be checked using the utility
+ldms_ls (see Using ldms_ls below).
+
+5) EXAMPLE: CONFIGURE AN AGGREGATOR USING LDMSD_CONTROLLER
+======================================================================
+
+5-1) Start 2 separate ldmsds, one on host vm1_1 and one on host vm1_2, with sampler plugins, as described above
+---------------------------------------------------------------------------------------------------------------
+
+5-2) Write a script to add producers and start collecting from them:
+--------------------------------------------------------------------
+
+This adds vm1_1 as a producer with its sets collected at 2 second
+intervals and vm1_2 as a producer with its sets collected at 5 second
+intervals. Here the "name" of the producer must match the "producer"
+name given to the sampler.
+
+The first set of lines adds the producers. The second set of lines
+establishes the aggregation from them at the specified intervals.
+
+::
+
+   > more add_prdcr.config
+   prdcr_add name=vm1_2 host=vm1 type=active xprt=sock port=60001 interval=20000000
+   prdcr_start name=vm1_2
+   prdcr_add name=vm1_1 host=vm1 type=active xprt=sock port=60000 interval=20000000
+   prdcr_start name=vm1_1
+   updtr_add name=policy2_h1 interval=2000000 offset=0
+   updtr_prdcr_add name=policy2_h1 regex=vm1_1
+   updtr_start name=policy2_h1
+   updtr_add name=policy5_h2 interval=5000000 offset=0
+   updtr_prdcr_add name=policy5_h2 regex=vm1_2
+   updtr_start name=policy5_h2
+
+5-3) Set environment variables, as described above
+--------------------------------------------------
+
+5-4) Start an ldmsd on your host to aggregate using the configuration file
+--------------------------------------------------------------------------
+
+   /tmp/opt/ovis/sbin/ldmsd -x sock:60002 -S
+   /var/run/ldmsd/metric_socket_agg -l /tmp/opt/ovis/logs/vm1_agg -c
+   ./add_prdcr.config
+
+Notes:
+
+- There is no requirement that aggregator intervals match collection
+  intervals.
+
+- Because the collection and aggregation processes operate
+  asynchronously there is the potential for duplicate data collection
+  as well as missed samples. The first is handled by the storage
+  plugins by comparing generation numbers and not storing duplicates.
+  The second implies either a loss in fidelity (if collecting counter
+  data) or a loss of data points here and there (if collecting
+  differences of counter values or non counter values). This can be
+  handled using the synchronous option on both collector and aggregator
+  but is not covered here.
+
+5-5) At this point the ldmsd collector should be checked using the utility ldms_ls
+----------------------------------------------------------------------------------
+
+(See Using ldms_ls below).
In this case you should see metric sets for
+both vm1_1 and vm1_2 displayed when you query the aggregator ldmsd using
+ldms_ls.
+
+6) EXAMPLE: CONFIGURE AN LDMS AGGREGATOR WITH A STORAGE PLUGIN
+==========================================================================
+
+6-1) Add storage configuration lines to the configuration file described above.
+-------------------------------------------------------------------------------
+
+This adds a store_csv to store sets whose schema is meminfo or vmstat
+and whose instance name matches the regex. A set's schema and instance
+names will be seen in the output of ldms_ls (described below).
+
+::
+
+   > more add_store.config
+   load name=store_csv
+   config name=store_csv path=<> action=init altheader=0 rollover=30 rolltype=1
+   strgp_add name=policy_mem plugin=store_csv container=csv schema=meminfo
+   strgp_prdcr_add name=policy_mem regex=vm*
+   strgp_start name=policy_mem
+   strgp_add name=policy_vmstat plugin=store_csv container=csv schema=vmstat
+   strgp_prdcr_add name=policy_vmstat regex=vm*
+   strgp_start name=policy_vmstat
+
+Notes:
+
+- For the csv store, the whole path must pre-exist.
+
+- See the Plugin_store_csv man page for more info on the plugin
+  configuration arguments.
+
+- If you want to collect on a host and store that data on the same
+  host, run two ldmsd's: one with a collector plugin only and one as an
+  aggregator with a store plugin only.
+
+6-2) Set environment variables, as described above
+--------------------------------------------------
+
+6-3) Start the aggregator with the full configuration file (both aggregator and store lines), as described above
+----------------------------------------------------------------------------------------------------------------
+
+6-4) Verify the store
+---------------------
+
+Go to the data store and verify files have been created and are being
+written to:
+
+::
+
+   cd <>/
+   ls -ltr
+
+You can now utilize this data.
+
+Data will flush to the store when the OS flushes data unless an advanced
+flag is used. Thus, in a default configuration, if you have a small
+number of nodes and/or a long interval, you may not see data appear in
+the store for a few minutes.
+
+7) EXAMPLES: USING LDMS_LS TO DISPLAY SETS/METRICS FROM AN LDMSD
+============================================================================
+
+7-1) Set environment variables, as described above
+--------------------------------------------------
+
+7-2a) Query ldmsd on host vm1 listening on port 60000 (sampler) using the sock transport for metric sets being served by that ldmsd
+-------------------------------------------------------------------------------------------------------------------------------------
+
+::
+
+   ldms_ls -h vm1 -x sock -p 60000
+   Should return:
+   vm1_1/meminfo
+   vm1_1/vmstat
+
+7-2b) Query ldmsd on host vm1 listening on port 60002 (aggregator) using the sock transport for metric sets being served by that ldmsd
+---------------------------------------------------------------------------------------------------------------------------------------
+
+::
+
+   ldms_ls -h vm1 -x sock -p 60002
+   Should return:
+   vm1_1/meminfo
+   vm1_1/vmstat
+   vm1_2/meminfo
+   vm1_2/vmstat
+
+7-2c) Query ldmsd on host vm1 listening on port 60000 using the sock transport for the names and contents of metric sets being served by that ldmsd.
+---------------------------------------------------------------------------------------------------------------------------------------------------- + +Should return: Set names (vm1_1/meminfo and vm1_1/vmstat in this case) +as well as all names and values associated with each set respectively. +Only vm1_1/meminfo shown here. + +:: + + > ldms_ls -h vm1 -x sock -p 60000 -l + vm1_1/meminfo: consistent, last update: Wed Jul 31 21:51:08 2013 [246540us] + U64 33084652 MemTotal + U64 32092964 MemFree + U64 0 Buffers + U64 49244 Cached + U64 0 SwapCached + U64 13536 Active + U64 39844 Inactive + U64 5664 Active(anon) + U64 13540 Inactive(anon) + U64 7872 Active(file) + U64 26304 Inactive(file) + U64 2996 Unevictable + U64 2988 Mlocked + U64 0 SwapTotal + U64 0 SwapFree + U64 0 Dirty + U64 0 Writeback + U64 7164 AnonPages + U64 6324 Mapped + U64 12544 Shmem + U64 84576 Slab + U64 3948 SReclaimable + U64 80628 SUnreclaim + U64 1608 KernelStack + U64 804 PageTables + U64 0 NFS_Unstable + U64 0 Bounce + U64 0 WritebackTmp + U64 16542324 CommitLimit + U64 73764 Committed_AS + U64 34359738367 VmallocTotal + U64 3467004 VmallocUsed + U64 34356268363 VmallocChunk + U64 0 HugePages_Total + U64 0 HugePages_Free + U64 0 HugePages_Rsvd + U64 0 HugePages_Surp + U64 2048 Hugepagesize + U64 565248 DirectMap4k + U64 5726208 DirectMap2M + U64 27262976 DirectMap1G + +7-2d) Query for a non-existent set: +=============================================== + +:: + + ldms_ls -h vm1 -x sock -p 60000 -l vm1_1/foo + ldms_ls: No such file or directory + ldms_ls: lookup failed for set 'vm1_1/foo' + +7-2e) Display metadata about sets contained by vm1 ldmsd listening on port 60000 +============================================================================================ + +:: + + ldms_ls -h vm1 -x sock -p 60000 -v + vm1_1/meminfo: consistent, last update: Fri Dec 16 17:12:08 2016 [5091us] + METADATA -------- + Producer Name : vm1_1 + Instance Name : vm1_1/meminfo + Schema Name : meminfo + Size : 1816 + Metric Count : 43 + GN : 2 + DATA ------------ + Timestamp : Fri Dec 16 17:12:08 2016 [5091us] + Duration : [0.000072s] + Consistent : TRUE + Size : 384 + GN : 985 + ----------------- + +8) STOP AN LDMSD +============================ + +To kill all ldmsd on a host +--------------------------- + +:: + + killall ldmsd + +9) PROTECTION DOMAIN TAGS (Cray) +============================================ + +9-1) Cray XE/XK: +---------------- + +If you are going to be using the "ugni" transport (RDMA over Gemini) you +will need to run with either system (as root) or user (as user) ptags. +While root CAN run using any ptag the fact that its use is unknown to +ALPS could cause collisions with applications. + +To see current ptags: +--------------------- + +:: + + > apstat -P + PDomainID Type Uid PTag Cookie + LDMS system 0 84 0xa9380000 + +To create a userspace ptag: +--------------------------- + +:: + + apmgr pdomain -c + + Example: + > apmgr pdomain -c foo + > apstat -P + PDomainID Type Uid PTag Cookie + LDMS system 0 84 0xa9380000 + foo user 12345 233 0xa1230000 + +Note: A system administrator will have to setup system ptags and/or +enable users to set up ptags. 
+
+To remove a userspace ptag:
+---------------------------
+
+::
+
+   apmgr pdomain -r <name>
+
+Note: The userid of the ptag being removed must match that of the user
+running the command, or root.
+
+PTAG-Related Environment variables for ldms (XE/XK)
+---------------------------------------------------
+
+Set the following environment variables for either user or system ptags
+(example shows user ptag values):
+
+::
+
+   export ZAP_UGNI_PTAG=233
+   export ZAP_UGNI_COOKIE=0xa1230000
+
+Starting ldms from aprun with ptags
+-----------------------------------
+
+When running with user space ptags you must specify the ptag name when
+using aprun:
+
+::
+
+   aprun <> -p foo ldmsd <>
+   or
+   aprun <> -p foo ldms_ls <>
+
+Note: On some systems you will run aprun after a qsub -I or within a
+script specified in qsub or similar.
+
+9-2) Cray XC, CLE <= 5.2:
+-------------------------
+
+If you are going to be using the "ugni" transport (RDMA over Aries) you
+will need to run with either system (as root) or user (as user) ptags.
+While root CAN run using any ptag, the fact that its use is unknown to
+ALPS could cause collisions with applications.
+
+To see current ptags:
+---------------------
+
+::
+
+   > apstat -P
+   PDomainID   Type     Uid     Cookie       Cookie2
+   LDMS        system   0       0x86b80000   0
+
+To create a userspace ptag:
+---------------------------
+
+::
+
+   apmgr pdomain -c <name>
+
+   Example:
+   > apmgr pdomain -c foo
+   > apstat -P
+   PDomainID   Type     Uid     Cookie       Cookie2
+   LDMS        system   0       0x86b80000   0
+   foo         user     20596   0x86bb0000   0x86bc0000
+
+Note: A system administrator will have to set up system ptags and/or
+enable users to set up ptags.
+
+To remove a userspace ptag:
+---------------------------
+
+::
+
+   apmgr pdomain -r <name>
+
+Note: The userid of the ptag being removed must match that of the user
+running the command, or root.
+
+PTAG-Related Environment variables for ldms (XC)
+------------------------------------------------
+
+Set the following environment variables. On XC the ptag value doesn't
+matter but ZAP_UGNI_PTAG must be defined. Set the Cookie (not Cookie2)
+for either user or system ptag.
+
+::
+
+   export ZAP_UGNI_PTAG=0
+   export ZAP_UGNI_COOKIE=0x86bb0000
+
+Starting ldms from aprun with ptags
+-----------------------------------
+
+When running with user space ptags you must specify the ptag name when
+using aprun:
+
+::
+
+   aprun <> -p foo ldmsd <>
+   or
+   aprun <> -p foo ldms_ls <>
+
+Note: On some systems you will run aprun after a qsub -I or within a
+script specified in qsub or similar.
+
+10) TROUBLESHOOTING
+===============================
+
+What causes the following error: libibverbs: Warning: RLIMIT_MEMLOCK is 32768 bytes?
+------------------------------------------------------------------------------------
+
+Running as a user with "max locked memory" set too low. The following is
+an example of trying to run ldms_ls as a user with "max locked memory"
+set to 32k:
+
+::
+
+   ldms_ls -h <host> -x rdma -p <port>
+   libibverbs: Warning: RLIMIT_MEMLOCK is 32768 bytes.
+   This will severely limit memory registrations.
+   RDMA: recv_buf reg_mr failed: error 12
+   ldms_ls: Cannot allocate memory
+
+Why doesn't my ldmsd start?
+---------------------------
+
+Possible options:
+
+- Check for existing /var/run/ldms/metric_socket or similar. Sockets
+  can be left if an ldmsd did not clean up upon termination. kill -9
+  may leave the socket hanging around.
+
+- The port you are trying to use may already be in use on the node.
The + following shows the logfile output of such a case: + +:: + + Tue Sep 24 08:36:54 2013: Started LDMS Daemon version 2.1.0 + Tue Sep 24 08:36:54 2013: Process 123456 listening on transport ugni:60020 + Tue Sep 24 08:36:54 2013: EV_WARN: Can't change condition callbacks once they have been initialized. + Tue Sep 24 08:36:54 2013: Error 12 listening on the 'ugni' transport. + Tue Sep 24 08:36:54 2013: LDMS Daemon exiting...status 7 + If using the -l flag make sure that your log directory exists prior to running + If writing to a store with this particular lmdsd make sure that your store directory exists prior to running + If you are running on a Cray with transport ugni using a user space PTag, check that you called aprun with the -p flag + aprun -N 1 -n -p run_my_ldmsd.sh + +How can I find what process is using the port? +---------------------------------------------- + + netstat -abno + +Why arent all my hosts/sets adding to the aggregator? +----------------------------------------------------- + +Possible options: + +- use -m flag on the aggregator to use more memory when adding a lot of + hosts + +- use -p on the aggregator to use more processors + +Why isn't my ldmsd storing its own set to the store? +---------------------------------------------------- + +Currently, this is not supported. You can use a separate ldmsd on the +same host to gather another ldmsd's data for that host. + +Why is my aggregator not responding (CRAY XE/XK)? +------------------------------------------------- + +Running a ldmsd aggregator as a user but trying to aggregate from a +ldmsd that uses a system ptag can result in the aggregator hanging +(alive but not responding and not writing to the store). The following +is the logfile output of such an aggregator: + +:: + + Tue Sep 24 08:42:40 2013: Connected to host 'nid00081:60020' + Tue Sep 24 08:42:42 2013: cq_thread_proc: Error 11 monitoring the CQ. + +11) MAN PAGES +========================= + +ldms comes with man pages. In the build process these will be installed +in /ovis/share/man. Man pages are in the following +catagories: + +General +------- + +General pages address information, such as ldms_build_install, +ldms_quickstart, and ldms_authentication. + +Utilities +--------- + +Utilities pages address the various utilities and commands such as +ldmsd, ldmsd_controller, and ldms_ls. + +Plugins +------- + +Plugin pages address all plugins, both samplers and stores. Naming +convention for these pages is Plugin_XXX. For example: Plugin_aries_mmr, +Plugin_cray_system_sampler_variants, Plugin_kgnilnd, Plugin_meminfo, +Plugin_procinterrupts, Plugin_procnetdev, Plugin_procnfs, +Plugin_store_csv, Plugin_store_function_csv, Plugin_store_sos, and +Plugin_vmstat. + +NOTES +================= + +As part of the install, test scripts are placed in /tmp/opt/ovis/bin. +These scripts may serve as additional examples. These are being +converted from using the obsolete ldmsctl tool to the ldmsd_controller +tool, so they may not be fully updated at any given time. + +BUGS +================ + +No known bugs. 
+
+SEE ALSO
+====================
+
+ldms_build_install(7), ldmsd(8), ldmsd_controller(8),
+ldms_authentication(7), ldms_ls(8)
diff --git a/rtd/docs/source/ldms_man/ldms_sampler_base.rst b/rtd/docs/source/ldms_man/ldms_sampler_base.rst
new file mode 100644
index 000000000..8af324bb7
--- /dev/null
+++ b/rtd/docs/source/ldms_man/ldms_sampler_base.rst
@@ -0,0 +1,132 @@
+=================
+ldms_sampler_base
+=================
+
+:Date: 04 Feb 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+==================
+
+sampler_base - man page for the LDMS sampler_base which is the base
+class for samplers
+
+SYNOPSIS
+======================
+
+Configuration variable base class for LDMS samplers.
+
+DESCRIPTION
+=========================
+
+With LDMS (Lightweight Distributed Metric Service), sampler plugins for
+the ldmsd (ldms daemon) should inherit from the sampler_base base class.
+This class defines variables that should be common to all samplers. It
+also adds them to the sampler's metric set and handles their value
+assignment.
+
+To configure a plugin, consult both the plugin-specific man page, for
+the information and configuration arguments specific to the plugin, and
+this man page, for the arguments provided by sampler_base.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+============================================
+
+**config**
+   name=<plugin_name> producer=<pname> instance=<set_name>
+   [component_id=<compid>] [schema=<sname>] [job_set=<set_name> job_id=<metric_name>
+   app_id=<metric_name> job_start=<metric_name> job_end=<metric_name>]
+
+|
+| configuration line
+
+   name=<plugin_name>
+      |
+      | This will be the name of the plugin being loaded.
+
+   producer=<pname>
+      |
+      | A unique name for the host providing the data.
+
+   instance=<set_name>
+      |
+      | A unique name for the metric set.
+
+   schema=<sname>
+      |
+      | Optional schema name. It is intended that the same sampler on
+        different nodes with different metrics have a different schema.
+        Defaults to the sampler name.
+
+   component_id=<compid>
+      |
+      | Optional unique number for the component being monitored.
+        Defaults to zero.
+
+   job_set=<set_name>
+      |
+      | The instance name of the set containing the job data, default is
+        'job_info'.
+
+   job_id=<metric_name>
+      |
+      | The name of the metric containing the Job Id, default is
+        'job_id'.
+
+   app_id=<metric_name>
+      |
+      | The name of the metric containing the Application Id, default is
+        'app_id'.
+
+   job_start=<metric_name>
+      |
+      | The name of the metric containing the Job start time, default is
+        'job_start'.
+
+   job_end=<metric_name>
+      |
+      | The name of the metric containing the Job end time, default is
+        'job_end'.
+
+NOTES
+===================
+
+- This man page does not cover usage of the base class for plugin
+  writers.
+
+- Not all plugins may have been converted to use the base class. The
+  plugin-specific man page should refer to the sampler_base where this
+  has occurred.
+
+BUGS
+==================
+
+No known bugs.
+ +EXAMPLES +====================== + +Within ldmsd_controller or a configuration file: + +:: + + load name=meminfo + config name=meminfo producer=vm1_1 instance=vm1_1/meminfo + start name=meminfo interval=1000000 + +SEE ALSO +====================== + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), +Plugin_all_example(7), Plugin_aries_linkstatus(7), Plugin_aries_mmr(7), +Plugin_array_example(7), Plugin_clock(7), +Plugin_cray_sampler_variants(7), Plugin_cray_dvs_sampler(7), +Plugin_procdiskstats(7), Plugin_fptrans(7), Plugin_kgnilnd(7), +Plugin_lnet_stats(7), Plugin_meminfo(7), Plugin_msr_interlagos(7), +Plugin_perfevent(7), Plugin_procinterrupts(7), Plugin_procnetdev(7), +Plugin_procnfs(7), Plugin_rapl(7), Plugin_sampler_atasmart(7), +Plugin_sysclassib(7), Plugin_synthetic(7), Plugin_vmstat(7) diff --git a/rtd/docs/source/ldms_man/ldmsctl.rst b/rtd/docs/source/ldms_man/ldmsctl.rst new file mode 100644 index 000000000..fcb2d25d1 --- /dev/null +++ b/rtd/docs/source/ldms_man/ldmsctl.rst @@ -0,0 +1,791 @@ +======= +ldmsctl +======= + +:Date: 19 Nov 2019 + +.. contents:: + :depth: 3 +.. + +NAME +======== + +ldmsctl - Issue control commands to ldmsd. + +SYNOPSIS +============ + +ldmsctl [OPTION...] + +DESCRIPTION +=============== + +After LDMS (lightweight Distributed Metric Service) version 3.4, ldmsctl +is an LDMS daemon C-interface that can be used to dynamically configure +an LDMS daemon instead of ldmsd_controller when Python is not available. +After the ldmsctl is started commands can be entered at the prompt or +(usually) a command script can be created and piped into the ldmsctl. + +LDMS version 4 requires ldmsctl to use LDMS transport (data channel) to +connect to **ldmsd** to levarage LDMS Authentication plugin in the +transport. Please note that the **ldmsd** may have multiple data +channels, one of which can be dedicated for management use. + +ENVIRONMENT +=============== + +The following environment variables must be set (includes environment +variables needed for the actions, for example, paths to the sampler +libraries to be added): + +LD_LIBRARY_PATH + path_to_ovis_build/lib:path_to_ovis_build/lib/ovis-ldms:path_to_libevent_2.0_build/lib + +ZAP_LIBPATH + path_to_ovis_build/lib/ovis-ldms + +LDMSD_PLUGIN_LIBPATH + path_to_ovis_build/lib/ovis-ldms + +PATH + path_to_ovis_build/sbin:path_to_ovis_build/bin + +OPTIONS +=========== + +**-h** *HOST* + HOST is the hostname to connect to the LDMS daemon + +**-p** *PORT* + PORT is the port to connect to the LDMS daemon + +**-x** *XPRT* + XPRT is the transport one of sock, ugni, or rdma. Only use with the + option -i + +**-a** *AUTH* + AUTH is the name of the LDMS Authentication plugin to be used for the + connection. Please see **ldms_authentication**\ (7) for more + information. If this option is not given, the default is "none" (no + authentication). + +**-A** *NAME*\ **=**\ *VALUE* + Passing the *NAME*\ =\ *VALUE* option to the LDMS Authentication + plugin. This command line option can be given multiple times. Please + see **ldms_authentication**\ (7) for more information, and consult + the plugin manual page for plugin-specific options. + +**-s** *SOURCE* + SOURCE is the path to a configuration file + +**-X** *COMMAND* + COMMAND is a shell command to be executed. The output will be sent to + ldmsd. + +**-V** + Display LDMS version information and then exit. + +REGULAR EXPRESSION +====================== + +The regular expression specified in *regex=* option of the commands is a +POSIX Extended (modern) Regular Expression. 
In short, "\*+?{}|^$." are +special regular expression characters. Please see **regex(7)** for more +information. + +PLUGIN COMMAND SYNTAX +========================= + +Load a plugin +------------- + +| **load** attr= + + **name** *name* + | + | The plugin name + +List the usage of the loaded plugins +------------------------------------ + +**usage** + +unload a plugin +--------------- + +| **term** attr= + + **name** *name* + | + | The plugin name + +Send a configuration command to the specified plugin. +----------------------------------------------------- + +**config** attr= + + **name** *name* + | + | The plugin name + + **attr=value** + | + | Plugin specific attr=value tuples + + **Attributes specific for sampler plugins (Some sampler plugins + may have additional** attributes) + + **producer** *producer* + | + | A unique name for the host providing the data + + **instance** *instance* + | + | The set instance name. The name must be unique among all + metric sets in all LDMS daemons. + + **[component_id** *component_id*\ **]** + | + | A unique number for the comopnent being monitored. The + default is zero. + + **[schema** *schema*\ **]** + | + | The name of the metric set schema. + + **[job_set** *job_set*\ **]** + | + | The set instance name of the set containing the job data. The + default is 'job_info'. + + **[uid** *uid*\ **]** + | + | The user id of the set's owner. The default is the returned + value of geteuid(). + + **[gid** *gid*\ **]** + | + | The group id of the set's owner. The default is the returned + value of getegid(). + + **[perm** *perm*\ **]** + | + | The sampler plugin instance access permission. The default is + 0440. + +Start a sampler plugin +---------------------- + +**start** attr= + + **name** *name* + | + | The plugin name. + + **interval** *interval* + | + | The sample interval in microseconds. + + **[offset** *offset*\ **]** + | + | Offset (shift) from the sample mark in microseconds. Offset can + be positive or negative with magnitude up to 1/2 the sample + interval. If this offset is specified, including 0, collection + will be synchronous; if the offset is not specified, collection + will be asynchronous. Optional. + +Stop a sampler plugin +--------------------- + +**stop** attr= + + **name** *name* + | + | The plugin name. + +PRODUCER COMMAND SYNTAX +=========================== + +Add a producer to the aggregator +-------------------------------- + +| **prdcr_add** attr= + + **name** *name* + | + | The producer name. The producer name must be unique in an + aggregator. It is independent of any attributes specified for + the metric sets or hosts. + + **xprt** *xprt* + | + | The transport name [sock, rdma, ugni] + + **host** *host* + | + | The hostname of the host + + **type** *conn_type* + | + | The connection type [active, passive] + + **interval** *interval* + | + | The connection retry interval + + **[perm** *permission*\ **]** + | + | The permission to modify the producer in the future + +Delete a producer from the aggregator +------------------------------------- + +| The producer cannot be in use or running +| **prdcr_del** attr= + + **name** *name* + | + | The producer name + +Start a producer +---------------- + +**prdcr_start** attr= + + **name** *name* + | + | The producer name + + **[interval** *interval*\ **]** + | + | The connection retry interval in microsec. If unspecified, the + previously configured value will be used. Optional. 
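+
+For example, a minimal producer setup at the ldmsctl prompt might look
+like the following sketch; the producer name, host, port, and retry
+interval are illustrative only, mirroring the quickstart example:
+
+::
+
+   ldmsctl> prdcr_add name=vm1_2 host=vm1 type=active xprt=sock port=60001 interval=20000000
+   ldmsctl> prdcr_start name=vm1_2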
+ +Start all producers matching a regular expression +------------------------------------------------- + +**prdcr_start_regex** attr= + + **regex** *regex* + | + | A regular expression + + **[interval** *interval*\ **]** + | + | The connection retry interval in microsec. If unspecified, the + previously configured value will be used. Optional. + +Stop a producer +--------------- + +**prdcr_stop** attr= + + **name** *name* + | + | The producer name + +Stop all producers matching a regular expression +------------------------------------------------ + +**prdcr_stop_regex** attr= + + **regex** *regex* + | + | A regular expression + +Query producer status +--------------------- + +**prdcr_status** attr= + + **[name** *name*\ **]** + | + | The producer name. If none is given, the statuses of all + producers are reported. + +Subscribe for stream data from all matching producers +----------------------------------------------------- + +**prdcr_subsribe** + + **regex** *regex* + | + | The regular expression matching producer name + + **stream** *stream* + | + | The stream name + +UPDATER COMMAND SYNTAX +========================== + +Add an updater process that will periodically sample producer metric sets +------------------------------------------------------------------------- + +**updtr_add** attr= + + **name** *name* + | + | The update policy name. The policy name should be unique. It is + independent of any attributes specified for the metric sets or + hosts. + + **interval** *interval* + | + | The update/collect interval + + **[offset** *offset*\ **]** + | + | Offset for synchronized aggregation. Optional. + + **[push** *onchange|true*\ **]** + | + | Push mode: 'onchange' and 'true'. 'onchange' means the Updater + will get an update whenever the set source ends a transaction or + pushes the update. 'true' means the Updater will receive an + update only when the set source pushes the update. If \`push\` + is used, \`auto_interval\` cannot be \`true\`. + + **[auto_interval** *true|false* **]** + If true, the updater will schedule set updates according to the + update hint. The sets with no hints will not be updated. If false, + the updater will schedule the set updates according to the given + interval and offset values. If not specified, the value is + *false*. + + **[perm** *permission*\ **]** + | + | The permission to modify the updater in the future + +Remove an updater from the configuration +---------------------------------------- + +**updtr_del** attr= + + **name** *name* + | + | The update policy name + +Add a match condition that specifies the sets to update. +-------------------------------------------------------- + +**updtr_match_add** attr= + + **name** *name* + | + | The update policy name + + **regex** *regex* + | + | The regular expression + + **match** *match (inst|schema)* + | + | The value with which to compare; if match=inst, the expression + will match the set's instance name, if match=schema, the + expression will match the set's schema name. + +Remove a match condition from the Updater. +------------------------------------------ + +**updtr_match_del** attr= + + **name** *name* + | + | The update policy name + + **regex** *regex* + | + | The regular expression + + **match** *match (inst|schema)* + | + | The value with which to compare; if match=inst, the expression + will match the set's instance name, if match=schema, the + expression will match the set's schema name. 
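+
+Putting these together, an updater is typically created, pointed at
+producers with **updtr_prdcr_add** (described below), and then started;
+the policy name, regex, and interval here are illustrative only:
+
+::
+
+   ldmsctl> updtr_add name=policy2_h1 interval=2000000 offset=0
+   ldmsctl> updtr_prdcr_add name=policy2_h1 regex=vm1_1
+   ldmsctl> updtr_start name=policy2_h1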
+ +Add matching producers to an updater policy +------------------------------------------- + +This is required before starting the updater. + +**updtr_prdcr_add** attr= + + **name** *name* + | + | The update policy name + + **regex** *regex* + | + | A regular expression matching zero or more producers + +Remove matching producers to an updater policy +---------------------------------------------- + +**updtr_prdcr_del** attr= + + **name** *name* + | + | The update policy name + + **regex** *regex* + | + | A regular expression matching zero or more producers + +Start updaters. +--------------- + +**updtr_start** attr= + + **name** *name* + | + | The update policy name + + **[interval** *interval*\ **]** + | + | The update interval in micro-seconds. If this is not specified, + the previously configured value will be used. Optional. + + **[offset** *offset*\ **]** + | + | Offset for synchronized aggregation. Optional. + +Stop an updater. +---------------- + +The Updater must be stopped in order to change it's configuration. + +**updtr_stop** attr= + + **name** *name* + | + | The update policy name + +Query the updater status +------------------------ + +**updtr_status** attr= + + **[name** *name*\ **]** + | + | The updater name. If none is given, the statuses of all updaters + are reported. + +STORE COMMAND SYNTAX +======================== + +Create a Storage Policy and open/create the storage instance. +------------------------------------------------------------- + +**strgp_add** attr= + + **name** *name* + | + | The unique storage policy name. + + **plugin** *plugin* + | + | The name of the storage backend. + + **container** *container* + | + | The storage backend container name. + + **schema** *schema* + | + | The schema name of the metric set to store. + + **[perm** *permission*\ **]** + | + | The permission to modify the storage in the future + +Remove a Storage Policy +----------------------- + +| All updaters must be stopped in order for a storage policy to be + deleted +| **strgp_del** attr= + + **name** *name* + | + | The storage policy name + +Add a regular expression used to identify the producers this storage policy will apply to. +------------------------------------------------------------------------------------------ + +| If no producers are added to the storage policy, the storage policy + will apply on all producers. +| **strgp_prdcr_add** attr= + + **name** *name* + | + | The storage policy name + + **regex** *name* + | + | A regular expression matching metric set producers. + +Remove a regular expression from the producer match list +-------------------------------------------------------- + +**strgp_prdcr_del** attr= + + | **name** *name* + | The storage policy name + + **regex** *regex* + | + | The regex of the producer to remove. + +Add the name of a metric to store +--------------------------------- + +**strgp_metric_add** attr= + + | **name** *name* + | The storage policy name + + **metric** *metric* + | + | The metric name. If the metric list is NULL, all metrics in the + metric set will be stored. + +Remove a metric from the set of stored metrics. +----------------------------------------------- + +**strgp_metric_del** attr= + + | **name** *name* + | The storage policy name + + **metric** *metric* + | + | The metric to remove + +Start a storage policy. +----------------------- + +**strgp_start** attr= + + | **name** *name* + | The storage policy name + +Stop a storage policy. 
+----------------------
+
+A storage policy must be stopped in order to change its configuration.
+
+**strgp_stop** attr=
+
+   | **name** *name*
+   | The storage policy name
+
+Query the storage policy status
+-------------------------------
+
+**strgp_status** attr=
+
+   **[name** *name*\ **]**
+      |
+      | The storage policy name. If none is given, the statuses of all
+        storage policies are reported.
+
+FAILOVER COMMAND SYNTAX
+===========================
+
+Please see **ldmsd_failover**\ (7).
+
+SETGROUP COMMAND SYNTAX
+===========================
+
+Please see **ldmsd_setgroup**\ (7).
+
+STREAM COMMAND SYNTAX
+=========================
+
+Publish data to the named stream
+--------------------------------
+
+**publish** attr=
+
+   **name** *name*
+      |
+      | The stream name
+
+   **data** *data*
+      |
+      | The data to publish
+
+Subscribe to a stream
+---------------------
+
+**subscribe** attr=
+
+   **name** *name*
+      |
+      | The stream name
+
+LDMS DAEMON COMMAND SYNTAX
+==============================
+
+Changing the verbosity level of ldmsd
+-------------------------------------
+
+**loglevel** attr=
+
+   | **level** *level*
+   | Verbosity levels [DEBUG, INFO, ERROR, CRITICAL, QUIET]
+
+Exit the connected LDMS daemon gracefully
+-----------------------------------------
+
+**daemon_exit**
+
+Query the connected LDMS daemon status
+--------------------------------------
+
+**daemon_status**
+
+Tell the daemon to dump its internal state to the log file.
+------------------------------------------------------------
+
+**status** [*type*] [name=\ *value*]
+
+   | [*type*]
+   | Reports only the specified objects. The choices are prdcr, updtr
+     and strgp.
+
+   | prdcr: list the state of all producers.
+   | updtr: list the state of all update policies.
+   | strgp: list the state of all storage policies.
+
+   [name=\ *value*]
+      The object name of which the status will be reported.
+
+MISC COMMAND SYNTAX
+=======================
+
+Display the list of available commands
+--------------------------------------
+
+|
+| **help**
+
+   | [*command*]
+   | If a command is given, the help of the command will be printed.
+     Otherwise, only the available command names are printed.
+
+Set the user data value for a metric in a metric set.
+-----------------------------------------------------
+
+|
+| **udata** attr=
+
+   **set** *set*
+      |
+      | The metric set name
+
+   **metric** *metric*
+      |
+      | The metric name
+
+   **udata** *udata*
+      |
+      | The desired user-data. This is a 64b unsigned integer.
+
+Set the user data of multiple metrics using regular expression.
+---------------------------------------------------------------
+
+| The user data of the first matched metric is set to the base value.
+  The base value is then incremented by the given 'incr' value and
+  assigned to the next matched metric, and so on.
+| **udata_regex** attr=
+
+   **set** *set*
+      |
+      | The metric set name.
+
+   **regex** *regex*
+      |
+      | A regular expression to match metric names to be set
+
+   **base** *base*
+      |
+      | The base value of user data (uint64)
+
+   **[incr** *incr*\ **]**
+      |
+      | Increment value (int). The default is 0. If incr is 0, the user
+        data of all matched metrics are set to the base value. Optional.
+
+Get the LDMS version the running LDMSD is based on.
+---------------------------------------------------
+
+**version**
+
+NOTES
+=========
+
+- ldmsctl is currently kept for backwards compatibility purposes with
+  LDMS v2 commands. ldmsctl still works in version 3; however, with
+  ldmsctl, some capabilities use v2 pathways as opposed to v3.
+
+- ldmsctl will be removed in a future release. It is not recommended
+  that you use this with v2.
+
+BUGS
+========
+
+No known bugs.
+
+EXAMPLES
+============
+
+1) Run ldmsctl
+
+::
+
+   $/tmp/opt/ovis/sbin/ldmsctl -h vm1_2 -p 10001 -x sock
+   ldmsctl>
+
+2) After starting ldmsctl, configure the "meminfo" collector plugin to
+collect every second.
+
+::
+
+   Note: interval=<# usec>, e.g. interval=1000000 defines a one-second interval.
+   ldmsctl> load name=meminfo
+   ldmsctl> config name=meminfo component_id=1 set=vm1_1/meminfo
+   ldmsctl> start name=meminfo interval=1000000
+   ldmsctl> quit
+
+3) Configure collectors on host "vm1" via a bash script called collect.sh
+
+::
+
+   #!/bin/bash
+   # Configure "meminfo" collector plugin to collect every second (1000000 usec) on vm1_2
+   echo "load name=meminfo"
+   echo "config name=meminfo component_id=2 set=vm1_2/meminfo"
+   echo "start name=meminfo interval=1000000"
+   # Configure "vmstat" collector plugin to collect every second (1000000 usec) on vm1_2
+   echo "load name=vmstat"
+   echo "config name=vmstat component_id=2 set=vm1_2/vmstat"
+   echo "start name=vmstat interval=1000000"
+
+   Make collect.sh executable
+   chmod +x collect.sh
+
+   Execute collect.sh (Note: When executing this across many nodes you would use pdsh to execute the script on all nodes
+   in parallel)
+   > ldmsd -x sock:11111 -l ldmsd.log
+   > ldmsctl -x sock -p 11111 -h localhost -X collect.sh
+
+SEE ALSO
+============
+
+ldms_authentication(7), ldmsd(8), ldms_ls(8), ldmsd_controller(8),
+ldms_quickstart(7)
diff --git a/rtd/docs/source/ldms_man/ldmsd.rst b/rtd/docs/source/ldms_man/ldmsd.rst
new file mode 100644
index 000000000..567b354aa
--- /dev/null
+++ b/rtd/docs/source/ldms_man/ldmsd.rst
@@ -0,0 +1,391 @@
+=====
+ldmsd
+=====
+
+:Date: 28 Feb 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+======
+
+ldmsd - Start an ldms daemon
+
+SYNOPSIS
+==========
+
+ldmsd [OPTION...]
+
+DESCRIPTION
+=============
+
+The ldmsd command can be used to start an ldms daemon. Plugin
+configuration of the ldmsd can be done via a configuration file or
+the ldmsd_controller.
+
+Starting ldmsd with the configuration file option enables you to
+statically configure a sampler without requiring python. Dynamically
+configuring samplers with ldmsd_controller requires python. Currently,
+v2's ldmsctl can still be used to dynamically configure a sampler
+without requiring python. This capability will be replaced, and its use
+is not recommended.
+
+ENVIRONMENT
+=============
+
+The ldmsd-check-env program will dump currently set environment variables that may influence ldmsd and plugin behavior.
+-----------------------------------------------------------------------------------------------------------------------
+
+The following environment variables must often be set:
+
+LD_LIBRARY_PATH
+   Path to ovis/lib and libevent2/lib, if not in a system default path.
+   Depending on the system these may be lib64 instead of lib.
+
+PATH
+   Include the path to the sbin directory containing ldmsd.
+
+The following environment variables may be set to override compiled-in defaults:
+--------------------------------------------------------------------------------
+
+ZAP_LIBPATH
+   Path to ovis/lib/ovis-ldms
+
+LDMSD_PLUGIN_LIBPATH
+   Path to ovis/lib/ovis-ldms
+
+LDMSD_PIDFILE
+   Full path name of pidfile overriding the default /var/run/ldmsd.pid
+   unless the command line argument "-r pidfilepath" is present.
+
+LDMSD_LOG_TIME_SEC
+   If present, log messages are stamped with the epoch time rather than
+   the date string. This is useful when sub-second information is
+   desired or when correlating log messages with other epoch-stamped
+   data.
+
+LDMSD_SOCKPATH
+   Path to the unix domain socket for the ldmsd. Default is created
+   within /var/run. If you must change the default (e.g., not running as
+   root and hence /var/run is not writeable), set this variable (e.g.,
+   /tmp/run/ldmsd) or specify "-S socketpath" to ldmsd.
+
+LDMSD_MEM_SZ
+   The size of memory reserved for metric sets. Set this variable or
+   specify "-m" to ldmsd. See the -m option for further details. If both
+   are specified, the -m option takes precedence over this environment
+   variable.
+
+LDMSD_UPDTR_OFFSET_INCR
+   The increment to the offset hint in microseconds. This is only for
+   updaters that determine the update interval and offset automatically.
+   For example, the offset hint is 100000, which is 100 milliseconds
+   into the second. The updater offset will be 100000 +
+   LDMSD_UPDTR_OFFSET_INCR. The default is 100000 (100 milliseconds).
+
+CRAY Specific Environment variables for ugni transport
+------------------------------------------------------
+
+ZAP_UGNI_PTAG
+   For XE/XK, the PTag value as given by apstat -P. For XC, the value
+   does not matter but the environment variable must be set.
+
+ZAP_UGNI_COOKIE
+   For XE/XK, the Cookie value corresponding to the PTag value as given
+   by apstat -P. For XC, the Cookie value (not Cookie2) as given by
+   apstat -P.
+
+ZAP_UGNI_CQ_DEPTH
+   Optional value for the CQ depth. Default is 2048.
+
+ZAP_UGNI_STATE_INTERVAL
+   Optional. If set, then ldmsd will check all nodes' states via the rca
+   interface. States for all nodes are checked and stored at intervals
+   determined by this environment variable. The stored values are
+   checked against before contacting a node. If you choose to use this
+   option, then the rule of thumb is to set ZAP_UGNI_STATE_INTERVAL and
+   ZAP_UGNI_STATE_OFFSET such that the node states are checked before
+   the metric set update occurs (see interval and offset in
+   ldmsd_controller)
+
+ZAP_UGNI_STATE_OFFSET
+   Optional. Only relevant if ZAP_UGNI_STATE_INTERVAL is set. Defaults
+   to zero. Offset from zero for checking the nodes state (see
+   ZAP_UGNI_STATE_INTERVAL, above).
+
+OPTIONS
+=========
+
+General/Configuration Options:
+------------------------------
+
+**-F**
+   Run in foreground mode; don't daemonize the program. Default is
+   false.
+
+**-B, --banner** *version-file-mode [0, 1, 2]*
+   When run in daemon mode, controls the existence of the banner file.
+   Mode 0 suppresses the version file. Mode 1 deletes it at daemon exit.
+   Mode >= 2 leaves it in place for debugging after daemon exit. Default
+   mode is 1. The banner contains the software and protocol version
+   information, which is also logged at the INFO level. The banner file
+   name is always the pidfile name with .version appended.
+
+**-c** *CONFIG_PATH*
+   The path to the configuration file (optional, default: *none*). The
+   configuration file contains a batch of ldmsd controlling commands,
+   such as \`load\` for loading a plugin, and \`prdcr_add\` for defining
+   an ldmsd producer to aggregate from (see **ldmsd_controller**\ (8) for
+   a complete list of commands, or simply run **ldmsd_controller** then
+   **help**). The commands in the configuration file are executed
+   sequentially, except for **prdcr_start**, **updtr_start**,
+   **strgp_start**, and **failover_start**, which are deferred.
+If **failover_start** is present, the failover service will start first
+   (among the deferred). Then, upon failover pairing success or failure,
+   the other deferred configuration objects will be started. Please also
+   note that while the failover service is in use, prdcr, updtr, and
+   strgp cannot be altered (start, stop, or reconfigure) over the
+   in-band configuration. See also REORDERED COMMANDS below.
+
+**-m, --set_memory** *MEMORY_SIZE*
+   |
+   | MEMORY_SIZE is the maximum size of pre-allocated memory for metric
+     sets. The given size must be less than 1 petabyte. For example,
+     20M or 20mb are 20 megabytes. The default is adequate for most
+     ldmsd acting in the collector role. For an aggregating ldmsd, a
+     rough estimate of the preallocated memory needed is (Number of
+     nodes aggregated) x (Number of metric sets per node) x 4k. Data
+     sets containing arrays may require more. The estimate can be
+     checked by enabling DEBUG logging and examining the mm_stat
+     bytes_used+holes value at ldmsd exit.
+
+**-n, --daemon_name** *NAME*
+   |
+   | The name of the daemon. By default, it is "*HOSTNAME:PORT*". The
+     failover feature uses the daemon name to verify the buddy name, and
+     the producer name of kernel metric sets is the daemon name.
+
+**-r, --pid_file** *pid_file*
+   The path to the pid file and prefix of the .version banner file for
+   daemon mode.
+
+**-V**
+   Display LDMS version information and then exit.
+
+**-u** *plugin_name*
+   Display the usage for the named plugin. The special names all,
+   sampler, and store match all plugins, sampler type plugins, and
+   store type plugins, respectively.
+
+Communication Options:
+----------------------
+
+**-x** *XPRT:PORT:HOST*
+   |
+   | Specifies the transport type to listen on. May be specified more
+     than once for multiple transports. The XPRT string is one of
+     'rdma', 'sock', or 'ugni' (CRAY XE/XK/XC). A transport specific
+     port number must be specified following a ':', e.g. rdma:10000. An
+     optional host or address may be specified after the port, e.g.
+     rdma:10000:node1-ib, to listen to a specific address.
+
+The listening transports can also be specified in the configuration file
+using the **listen** command, e.g. \`listen xprt=sock port=1234
+host=node1-ib\`. Please see **ldmsd_controller**\ (8) section **LISTEN
+COMMAND SYNTAX** for more details.
+
+**-a, --default_auth** *AUTH*
+   Specify the default LDMS Authentication method for the LDMS
+   connections in this daemon (when the connections do not specify an
+   authentication method/domain). Please see
+   **ldms_authentication**\ (7) for more information. If this option is
+   not given, the default is "none" (no authentication). Also see
+   **ldmsd_controller**\ (8) section **AUTHENTICATION COMMAND SYNTAX**
+   for how to define an authentication domain.
+
+**-A, --default_auth_args** *NAME*\ **=**\ *VALUE*
+   Passes the *NAME*\ =\ *VALUE* option to the LDMS Authentication
+   plugin. This command line option can be given multiple times. Please
+   see **ldms_authentication**\ (7) for more information, and consult
+   the plugin manual page for plugin-specific options.
+
+Log Verbosity Options:
+----------------------
+
+**-l, --log_file** *LOGFILE*
+   |
+   | LOGFILE is the path to the log file for status messages. Default is
+     stdout unless given. The syslog facility is used if LOGFILE is
+     exactly "syslog". Silence can be obtained by specifying /dev/null
+     for the log file or using command line redirection as illustrated
+     below.
+
+**-v, --log_level** *LOG_LEVEL*
+   |
+   | LOG_LEVEL can be one of DEBUG, INFO, ERROR, CRITICAL or QUIET.
+The default level is ERROR. QUIET produces only user-requested output.
+     (Note: this has changed from the previous release, where q
+     designated no (QUIET) logging).
+
+**-t**
+   Truncate the log file if it already exists.
+
+**-L,**\ *--log_config* *CINT* \| *CINT:PATH* \| *PATH*
+   |
+   | Append configuration replay messages or configuration debugging
+     messages to the log indicated by -l (when PATH is omitted) or to
+     the file named PATH. Bit values of CINT correspond to:
+
+::
+
+     0: no messages
+     1: debug messages from the generic 'request' handler
+     2: config history messages in replayable format
+     4: query history messages in replayable format
+     8: failover debugging messages
+    16: include delta time prefix when using PATH
+    32: include epoch timestamp prefix when using PATH
+
+These values may be added together to enable multiple outputs. All
+messages are logged at the user-requested level, LDMSD_LALL. CINT values
+2, 26 and 27 are often interesting. When CINT is omitted, 1 is the
+default. When PATH is used, the log messages are flushed to PATH as they
+are generated.
+
+Kernel Metric Options:
+----------------------
+
+**-k, --publish_kernel**
+   Publish kernel metrics.
+
+**-s, --kernel_set_file** *SETFILE*
+   Text file containing kernel metric sets to publish. Default:
+   /proc/sys/kldms/set_list
+
+Thread Options:
+---------------
+
+**-P, --worker_threads** *THR_COUNT*
+   |
+   | THR_COUNT is the number of event threads to start.
+
+SPECIFYING COMMAND-LINE OPTIONS IN CONFIGURATION FILES
+========================================================
+
+Users can use the 'option' command to specify some command-line options
+in a configuration file.
+
+   option <COMMAND-LINE OPTIONS>
+
+Command-line options supported by the 'option' command and the corresponding attributes
+---------------------------------------------------------------------------------------
+
+**-a,**\ *--default_auth*
+
+**-A,**\ *--default_auth_args*
+
+**-B,**\ *--banner*
+
+**-k,**\ *--publish_kernel*
+
+**-l,**\ *--log_file* **PATH**
+
+**-m,**\ *--set_memory*
+
+**-n,**\ *--daemon_name*
+
+**-P,**\ *--worker_threads*
+
+**-r,**\ *--pid_file*
+
+**-s,**\ *--kernel_set_path*
+
+**-v,**\ *--log_level*
+
+**-L,**\ *--log_config* *CINT* \| *CINT:PATH* \| *PATH*
+
+Specifying the listen endpoints in configuration files
+------------------------------------------------------
+
+Users can use the 'listen' command to define the listen endpoints. For
+example,
+
+   listen xprt=sock port=411
+
+Example
+-------
+
+> cat ldmsd.conf
+
+::
+
+   # cmd-line options
+   option --log_file /opt/ovis/var/ldmsd.log --log_level ERROR
+   option -m 2GB -P 16
+   option -a munge
+   listen xprt=ugni port=411
+   # meminfo
+   load name=meminfo
+   config name=meminfo producer=nid0001 instance=nid0001/meminfo
+   start name=meminfo interval=1000000 offset=0
+
+RUNNING LDMSD ON CRAY XE/XK/XC SYSTEMS USING APRUN
+====================================================
+
+ldmsd can be run either as a user or as root, using the appropriate PTag
+and cookie.
+
+Check (or set) the PTag and cookie.
+
+   Cray XE/XK Systems:
+
+   ::
+
+      > apstat -P
+      PDomainID           Type    Uid   PTag     Cookie
+      LDMS              system      0     84 0xa9380000
+      foo               user    22398    243  0x2bb0000
+
+   Cray XC Systems:
+
+   ::
+
+      > apstat -P
+      PDomainID   Type   Uid     Cookie    Cookie2
+      LDMS      system     0 0x86b80000          0
+      foo         user 20596 0x86bb0000 0x86bc0000
+
+   Set the environment variables ZAP_UGNI_PTAG and ZAP_UGNI_COOKIE with
+   the appropriate ptag and cookie.
+
+   Run ldmsd directly or as part of a script launched from aprun. In
+   either case, use aprun with the correct -p when running.
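+
+For example, a minimal launch sequence might look like the following
+sketch. The PTag and cookie are the hypothetical values from the apstat
+output above, and the aprun flags (PE counts and the -p protection
+domain) are illustrative assumptions, not prescriptive settings:
+
+::
+
+   # Cray XE/XK: export the protection-domain credentials (hypothetical values)
+   export ZAP_UGNI_PTAG=84
+   export ZAP_UGNI_COOKIE=0xa9380000
+   # run one ldmsd per node (4 nodes) inside the "LDMS" protection domain
+   aprun -n 4 -N 1 -p LDMS ldmsd -F -x ugni:10001 -l /tmp/ldmsd.log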
+
+REORDERED COMMANDS
+====================
+
+Certain commands are reordered when processing input scripts specified
+with -c. Items related to failover are handled as described in the '-c'
+section above. Other commands are promoted to run before any
+non-promoted commands from the loaded script. In particular, env,
+loglevel, listen, auth, and option are promoted.
+
+NOTES
+=======
+
+OCM flags are unsupported at this time.
+
+BUGS
+======
+
+None known.
+
+EXAMPLES
+==========
+
+::
+
+   $/tmp/opt/ovis/sbin/ldmsd -x sock:60000 -p unix:/var/run/ldmsd/metric_socket -l /tmp/opt/ovis/logs/1
+
+
+   $/tmp/opt/ovis/sbin/ldmsd -x sock:60000 -p sock:61000 -p unix:/var/run/ldmsd/metric_socket
+
+SEE ALSO
+==========
+
+ldms_authentication(7), ldmsctl(8), ldms_ls(8), ldmsd_controller(8),
+ldms_quickstart(7)
diff --git a/rtd/docs/source/ldms_man/ldmsd_controller.rst b/rtd/docs/source/ldms_man/ldmsd_controller.rst
new file mode 100644
index 000000000..8451c0775
--- /dev/null
+++ b/rtd/docs/source/ldms_man/ldmsd_controller.rst
@@ -0,0 +1,866 @@
+================
+ldmsd_controller
+================
+
+:Date: 19 Nov 2019
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+=================
+
+ldmsd_controller - a python program to configure an ldms daemon.
+
+SYNOPSIS
+=====================
+
+**ldmsd_controller** [OPTIONS]
+
+ldmsd_controller> <cmd> [ <attr>=<value> ]
+
+DESCRIPTION
+========================
+
+With LDMS (Lightweight Distributed Metric Service), the ldmsd can be
+configured via the ldmsd_controller.
+
+If ldms is built with --enable-readline, one can invoke the
+ldmsd_controller from the command line and obtain an input interface
+with feedback. In many instances, however, it is preferred to
+execute scripts and send the output commands to an ldmsd instead.
+
+ENVIRONMENT
+========================
+
+Note: python2.6 with the additional installation of the argparse module
+OR python2.7 (which has the argparse module) is required.
+
+PYTHONPATH
+   <path_to_ovis_install>/lib[64]/pythonX.Y/site-packages/
+
+PATH
+   <path_to_ovis_install>/bin
+
+LDMSD_CONTROLLER OPTIONS
+=====================================
+
+**-h,--host** *HOST*
+   Hostname of **ldmsd** to connect to
+
+**-p,--port** *PORT*
+   The port of **ldmsd** to connect to
+
+**-x,--xprt** *XPRT*
+   The transport type (**sock**, **rdma**, **ugni**).
+
+**-a,--auth** *AUTH*
+   The LDMS authentication plugin. Please see
+   **ldms_authentication**\ (7) for more information.
+
+**-A,--auth-arg** *NAME=VALUE*
+   Options *NAME*\ =\ *VALUE* passing the *NAME*\ =\ *VALUE* option to
+   the LDMS Authentication plugin. This command line option can be given
+   multiple times. Please see **ldms_authentication**\ (7) for more
+   information, and consult the plugin manual page for plugin-specific
+   options.
+
+**--source** *SOURCE*
+   |
+   | Path to the config file
+
+**--script** *SCRIPT*
+   |
+   | Execute the script and send the output commands to the connected
+     ldmsd
+
+**-?**
+   Display help
+
+**--help**
+   Display help
+
+REGULAR EXPRESSION
+===============================
+
+The regular expression specified in the *regex=* option of the commands
+is a POSIX Extended (modern) Regular Expression. In short, "\*+?{}|^$."
+are special regular expression characters. Please see **regex(7)** for
+more information.
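+
+As a quick illustration, the producer commands described later in this
+page (see **PRODUCER COMMAND SYNTAX** below) accept such expressions;
+the producer names here are hypothetical:
+
+::
+
+   # start every producer whose name is "nid" followed by four digits
+   prdcr_start_regex regex=^nid[0-9]{4}$
+   # stop only the producers nid0001 through nid0003
+   prdcr_stop_regex regex=^nid000[1-3]$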
+
+PLUGIN COMMAND SYNTAX
+==================================
+
+Load a plugin
+-------------
+
+| **load** attr=
+
+   **name** *name*
+      |
+      | The plugin name
+
+List the usage of the loaded plugins
+------------------------------------
+
+**usage**
+
+Unload a plugin
+---------------
+
+| **term** attr=
+
+   **name** *name*
+      |
+      | The plugin name
+
+Send a configuration command to the specified plugin.
+-----------------------------------------------------
+
+**config** attr=
+
+   **name** *name*
+      |
+      | The plugin name
+
+   **attr=value**
+      |
+      | Plugin specific attr=value tuples
+
+   **Attributes specific for sampler plugins (some sampler plugins
+   may have additional attributes)**
+
+      **producer** *producer*
+         |
+         | A unique name for the host providing the data
+
+      **instance** *instance*
+         |
+         | The set instance name. The name must be unique among all
+           metric sets in all LDMS daemons.
+
+      **[component_id** *component_id*\ **]**
+         |
+         | A unique number for the component being monitored. The
+           default is zero.
+
+      **[schema** *schema*\ **]**
+         |
+         | The name of the metric set schema.
+
+      **[job_set** *job_set*\ **]**
+         |
+         | The set instance name of the set containing the job data. The
+           default is 'job_info'.
+
+      **[uid** *uid*\ **]**
+         |
+         | The user id of the set's owner. The default is the returned
+           value of geteuid().
+
+      **[gid** *gid*\ **]**
+         |
+         | The group id of the set's owner. The default is the returned
+           value of getegid().
+
+      **[perm** *perm*\ **]**
+         |
+         | The sampler plugin instance access permission. The default is
+           0440.
+
+Start a sampler plugin
+----------------------
+
+**start** attr=
+
+   **name** *name*
+      |
+      | The plugin name.
+
+   **interval** *interval*
+      |
+      | The sample interval in microseconds.
+
+   **[offset** *offset*\ **]**
+      |
+      | Offset (shift) from the sample mark in microseconds. Offset can
+        be positive or negative with magnitude up to 1/2 the sample
+        interval. If this offset is specified, including 0, collection
+        will be synchronous; if the offset is not specified, collection
+        will be asynchronous. Optional.
+
+Stop a sampler plugin
+---------------------
+
+**stop** attr=
+
+   **name** *name*
+      |
+      | The plugin name.
+
+AUTHENTICATION COMMAND SYNTAX
+==========================================
+
+Add an authentication domain
+----------------------------
+
+**auth_add** **name**\ =\ *NAME* **plugin**\ =\ *PLUGIN* [ ... *PLUGIN
+ATTRIBUTES* ... ]
+
+   **name**\ =\ *NAME*
+      |
+      | The name of the authentication domain. This is the name referred
+        to by **listen** and **prdcr_add** commands.
+
+   **plugin**\ =\ *none*\ \|\ *ovis*\ \|\ *munge*
+      |
+      | The LDMS Authentication Plugin for this domain.
+
+   [ ... *PLUGIN ATTRIBUTES* ... ]
+      |
+      | Arbitrary plugin attributes. Please consult the manual of the
+        authentication plugin for more information.
+
+LISTEN COMMAND SYNTAX
+==================================
+
+Instruct ldmsd to listen to a port
+----------------------------------
+
+**listen** **port**\ =\ *PORT*
+**xprt**\ =\ *sock*\ \|\ *rdma*\ \|\ *ugni*\ \|\ *fabric*
+[**host**\ =\ *HOST*] [**auth**\ =\ *AUTH_REF*]
+
+   **port**\ =\ *PORT*
+      |
+      | The port to listen to. Also, please be sure not to use an
+        ephemeral port (ports in the range of
+        **/proc/sys/net/ipv4/ip_local_port_range**).
+
+   **xprt**\ =\ *sock*\ \|\ *rdma*\ \|\ *ugni*\ \|\ *fabric*
+      |
+      | The type of the transport.
+
+   **host**\ =\ *HOST*
+      |
+      | An optional hostname or IP address to bind. If not given, listen
+        to all addresses (0.0.0.0 or PORT).
+
+   **auth**\ =\ *AUTH_REF*
+      |
+      | Instruct **ldmsd** to use *AUTH_REF* (a name reference to an
+        **auth** object created by the **auth_add** command) to
+        authenticate connections on this port. If not given, the port
+        uses the default authentication method specified on the CLI
+        options (see **ldmsd**\ (8) option **-a**).
+
+PRODUCER COMMAND SYNTAX
+====================================
+
+Add a producer to the aggregator
+--------------------------------
+
+| **prdcr_add** attr=
+
+   **name** *name*
+      |
+      | The producer name. The producer name must be unique in an
+        aggregator. It is independent of any attributes specified for
+        the metric sets or hosts.
+
+   **xprt** *xprt*
+      |
+      | The transport name [sock, rdma, ugni]
+
+   **host** *host*
+      |
+      | The hostname of the host
+
+   **type** *conn_type*
+      |
+      | The connection type [active, passive]
+
+   **interval** *interval*
+      |
+      | The connection retry interval
+
+   **[perm** *permission*\ **]**
+      |
+      | The permission to modify the producer in the future
+
+   **[auth** *AUTH_REF*\ **]**
+      |
+      | Instruct **ldmsd** to use *AUTH_REF* (a name reference to an
+        **auth** object created by the **auth_add** command) with the
+        connections to this producer. If not given, the default
+        authentication method specified on the CLI options (see
+        **ldmsd**\ (8) option **-a**) is used.
+
+Delete a producer from the aggregator
+-------------------------------------
+
+| The producer cannot be in use or running.
+| **prdcr_del** attr=
+
+   **name** *name*
+      |
+      | The producer name
+
+Start a producer
+----------------
+
+**prdcr_start** attr=
+
+   **name** *name*
+      |
+      | The producer name
+
+   **[interval** *interval*\ **]**
+      |
+      | The connection retry interval in microseconds. If unspecified,
+        the previously configured value will be used. Optional.
+
+Start all producers matching a regular expression
+-------------------------------------------------
+
+**prdcr_start_regex** attr=
+
+   **regex** *regex*
+      |
+      | A regular expression
+
+   **[interval** *interval*\ **]**
+      |
+      | The connection retry interval in microseconds. If unspecified,
+        the previously configured value will be used. Optional.
+
+Stop a producer
+---------------
+
+**prdcr_stop** attr=
+
+   **name** *name*
+      |
+      | The producer name
+
+Stop all producers matching a regular expression
+------------------------------------------------
+
+**prdcr_stop_regex** attr=
+
+   **regex** *regex*
+      |
+      | A regular expression
+
+Query producer status
+---------------------
+
+**prdcr_status** attr=
+
+   **[name** *name*\ **]**
+      |
+      | The producer name. If none is given, the statuses of all
+        producers are reported.
+
+Subscribe for stream data from all matching producers
+-----------------------------------------------------
+
+**prdcr_subscribe**
+
+   **regex** *regex*
+      |
+      | The regular expression matching producer names
+
+   **stream** *stream*
+      |
+      | The stream name
+
+UPDATER COMMAND SYNTAX
+===================================
+
+Add an updater process that will periodically sample producer metric sets
+-------------------------------------------------------------------------
+
+**updtr_add** attr=
+
+   **name** *name*
+      |
+      | The update policy name. The policy name should be unique. It is
+        independent of any attributes specified for the metric sets or
+        hosts.
+
+   **interval** *interval*
+      |
+      | The update/collect interval
+
+   **[offset** *offset*\ **]**
+      |
+      | Offset for synchronized aggregation. Optional.
+
+   **[push** *onchange|true*\ **]**
+      |
+      | Push mode: 'onchange' and 'true'.
+'onchange' means the Updater
+        will get an update whenever the set source ends a transaction or
+        pushes the update. 'true' means the Updater will receive an
+        update only when the set source pushes the update. If \`push\`
+        is used, \`auto_interval\` cannot be \`true\`.
+
+   **[auto_interval** *true|false* **]**
+      If true, the updater will schedule set updates according to the
+      update hint. The sets with no hints will not be updated. If false,
+      the updater will schedule the set updates according to the given
+      interval and offset values. If not specified, the value is
+      *false*.
+
+   **[perm** *permission*\ **]**
+      |
+      | The permission to modify the updater in the future
+
+Remove an updater from the configuration
+----------------------------------------
+
+**updtr_del** attr=
+
+   **name** *name*
+      |
+      | The update policy name
+
+Add a match condition that specifies the sets to update.
+--------------------------------------------------------
+
+**updtr_match_add** attr=
+
+   **name** *name*
+      |
+      | The update policy name
+
+   **regex** *regex*
+      |
+      | The regular expression
+
+   **match** *match (inst|schema)*
+      |
+      | The value with which to compare; if match=inst, the expression
+        will match the set's instance name, if match=schema, the
+        expression will match the set's schema name.
+
+Remove a match condition from the Updater.
+------------------------------------------
+
+**updtr_match_del** attr=
+
+   **name** *name*
+      |
+      | The update policy name
+
+   **regex** *regex*
+      |
+      | The regular expression
+
+   **match** *match (inst|schema)*
+      |
+      | The value with which to compare; if match=inst, the expression
+        will match the set's instance name, if match=schema, the
+        expression will match the set's schema name.
+
+Add matching producers to an updater policy
+-------------------------------------------
+
+This is required before starting the updater.
+
+**updtr_prdcr_add** attr=
+
+   **name** *name*
+      |
+      | The update policy name
+
+   **regex** *regex*
+      |
+      | A regular expression matching zero or more producers
+
+Remove matching producers from an updater policy
+------------------------------------------------
+
+**updtr_prdcr_del** attr=
+
+   **name** *name*
+      |
+      | The update policy name
+
+   **regex** *regex*
+      |
+      | A regular expression matching zero or more producers
+
+Start updaters.
+---------------
+
+**updtr_start** attr=
+
+   **name** *name*
+      |
+      | The update policy name
+
+   **[interval** *interval*\ **]**
+      |
+      | The update interval in microseconds. If this is not specified,
+        the previously configured value will be used. Optional.
+
+   **[offset** *offset*\ **]**
+      |
+      | Offset for synchronized aggregation. Optional.
+
+Stop an updater.
+----------------
+
+The Updater must be stopped in order to change its configuration.
+
+**updtr_stop** attr=
+
+   **name** *name*
+      |
+      | The update policy name
+
+Query the updater status
+------------------------
+
+**updtr_status** attr=
+
+   **[name** *name*\ **]**
+      |
+      | The updater name. If none is given, the statuses of all updaters
+        are reported.
+
+STORE COMMAND SYNTAX
+=================================
+
+Create a Storage Policy and open/create the storage instance.
+-------------------------------------------------------------
+
+**strgp_add** attr=
+
+   **name** *name*
+      |
+      | The unique storage policy name.
+
+   **plugin** *plugin*
+      |
+      | The name of the storage backend.
+
+   **container** *container*
+      |
+      | The storage backend container name.
+
+   **schema** *schema*
+      |
+      | The schema name of the metric set to store.
+
+   **[perm** *permission*\ **]**
+      |
+      | The permission to modify the storage in the future
+
+Remove a Storage Policy
+-----------------------
+
+| All updaters must be stopped in order for a storage policy to be
+  deleted.
+| **strgp_del** attr=
+
+   **name** *name*
+      |
+      | The storage policy name
+
+Add a regular expression used to identify the producers this storage policy will apply to.
+------------------------------------------------------------------------------------------
+
+| If no producers are added to the storage policy, the storage policy
+  will apply to all producers.
+| **strgp_prdcr_add** attr=
+
+   **name** *name*
+      |
+      | The storage policy name
+
+   **regex** *name*
+      |
+      | A regular expression matching metric set producers.
+
+Remove a regular expression from the producer match list
+---------------------------------------------------------
+
+**strgp_prdcr_del** attr=
+
+   | **name** *name*
+   | The storage policy name
+
+   **regex** *regex*
+      |
+      | The regex of the producer to remove.
+
+Add the name of a metric to store
+---------------------------------
+
+**strgp_metric_add** attr=
+
+   | **name** *name*
+   | The storage policy name
+
+   **metric** *metric*
+      |
+      | The metric name. If the metric list is NULL, all metrics in the
+        metric set will be stored.
+
+Remove a metric from the set of stored metrics.
+-----------------------------------------------
+
+**strgp_metric_del** attr=
+
+   | **name** *name*
+   | The storage policy name
+
+   **metric** *metric*
+      |
+      | The metric to remove
+
+Start a storage policy.
+-----------------------
+
+**strgp_start** attr=
+
+   | **name** *name*
+   | The storage policy name
+
+Stop a storage policy.
+----------------------
+
+A storage policy must be stopped in order to change its configuration.
+
+**strgp_stop** attr=
+
+   | **name** *name*
+   | The storage policy name
+
+Query the storage policy status
+-------------------------------
+
+**strgp_status** attr=
+
+   **[name** *name*\ **]**
+      |
+      | The storage policy name. If none is given, the statuses of all
+        storage policies are reported.
+
+FAILOVER COMMAND SYNTAX
+====================================
+
+Please see **ldmsd_failover**\ (7).
+
+SETGROUP COMMAND SYNTAX
+====================================
+
+Please see **ldmsd_setgroup**\ (7).
+
+STREAM COMMAND SYNTAX
+==================================
+
+Publish data to the named stream
+--------------------------------
+
+**publish** attr=
+
+   **name** *name*
+      |
+      | The stream name
+
+   **data** *data*
+      |
+      | The data to publish
+
+Subscribe to a stream
+---------------------
+
+**subscribe** attr=
+
+   **name** *name*
+      |
+      | The stream name
+
+LDMS DAEMON COMMAND SYNTAX
+=======================================
+
+Changing the verbosity level of ldmsd
+-------------------------------------
+
+**loglevel** attr=
+
+   | **level** *level*
+   | Verbosity levels [DEBUG, INFO, ERROR, CRITICAL, QUIET]
+
+Exit the connected LDMS daemon gracefully
+-----------------------------------------
+
+**daemon_exit**
+
+Query the connected LDMS daemon status
+--------------------------------------
+
+**daemon_status**
+
+Tell the daemon to dump its internal state to the log file.
+------------------------------------------------------------
+
+**status** [name=]
+
+   | **[**\ *type*\ **]**
+   | Reports only the specified objects. The choices are prdcr, updtr
+     and strgp.
+
+   | prdcr: list the state of all producers.
+   | updtr: list the state of all update policies.
+   | strgp: list the state of all storage policies.
+
+   [name *value*]
+      The object name of which the status will be reported.
+
+MISC COMMAND SYNTAX
+================================
+
+Display the list of available commands
+--------------------------------------
+
+|
+| **help**
+
+   | [*command*]
+   | If a command is given, the help of the command will be printed.
+     Otherwise, only the available command names are printed.
+
+Set the user data value for a metric in a metric set.
+-----------------------------------------------------
+
+|
+| **udata** attr=
+
+   **set** *set*
+      |
+      | The metric set name
+
+   **metric** *metric*
+      |
+      | The metric name
+
+   **udata** *udata*
+      |
+      | The desired user-data. This is a 64b unsigned integer.
+
+Set the user data of multiple metrics using regular expression.
+---------------------------------------------------------------
+
+| The user data of the first matched metric is set to the base value.
+  The base value is incremented by the given 'incr' value and then set
+  as the user data of the next matched metric, and so on.
+| **udata_regex** attr=
+
+   **set** *set*
+      |
+      | The metric set name.
+
+   **regex** *regex*
+      |
+      | A regular expression to match metric names to be set
+
+   **base** *base*
+      |
+      | The base value of user data (uint64)
+
+   **[incr** *incr*\ **]**
+      |
+      | Increment value (int). The default is 0. If incr is 0, the user
+        data of all matched metrics are set to the base value. Optional.
+
+Get the LDMS version the running LDMSD is based on.
+---------------------------------------------------
+
+**version**
+
+Launch a subshell to do arbitrary commands
+------------------------------------------
+
+**!**\ shell-command
+
+Comment (a skipped line)
+------------------------
+
+**#**\ comment-string
+
+BUGS
+=================
+
+No known bugs.
+
+EXAMPLES
+=====================
+
+Example of a script to add producers to updaters
+------------------------------------------------
+
+::
+
+   > more add_prdcr.sh
+   #!/bin/bash
+
+   SOCKDIR=/XXX/run/ldmsd
+   portbase=61100
+   port1=`expr $portbase + 1`
+   port2=`expr $portbase + 2`
+   port3=`expr $portbase + 3`
+
+   echo "prdcr_add name=localhost2 host=localhost type=active xprt=sock port=$port2 interval=20000000"
+   echo "prdcr_start name=localhost2"
+   echo "prdcr_add name=localhost1 host=localhost type=active xprt=sock port=$port1 interval=20000000"
+   echo "prdcr_start name=localhost1"
+   echo "updtr_add name=policy5_h1 interval=2000000 offset=0"
+   echo "updtr_prdcr_add name=policy5_h1 regex=localhost1"
+   echo "updtr_start name=policy5_h1"
+   echo "updtr_add name=policy5_h2 interval=5000000 offset=0"
+   echo "updtr_prdcr_add name=policy5_h2 regex=localhost2"
+   echo "updtr_start name=policy5_h2"
+
+Example of a script to add and start stores
+-------------------------------------------
+
+::
+
+   > more add_store.sh
+   #!/bin/bash
+
+   # whole path must exist
+   STORE_PATH=/XXX/ldmstest/store
+   mkdir -p $STORE_PATH
+   sleep 1
+
+   # CSV
+   echo "load name=store_csv"
+   echo "config name=store_csv path=$STORE_PATH action=init altheader=0 rollover=30 rolltype=1"
+   echo "config name=store_csv action=custom container=csv schema=cray_aries_r altheader=1 userdata=0"
+
+   echo "strgp_add name=policy_mem plugin=store_csv container=csv schema=meminfo"
+   echo "strgp_start name=policy_mem"
+
+   #echo "strgp_add name=csv_memfoo_policy plugin=store_csv container=meminfo schema=meminfo_foo"
+   #echo "strgp_prdcr_add name=csv_memfoo_policy regex=localhost*"
+   #echo "strgp_start name=csv_memfoo_policy"
+
+Example to start an ldmsd and use ldmsd_controller to call a script
+-------------------------------------------------------------------
+
+::
+
+   > ldmsd -x sock:11111 -l log.txt
+   > ldmsd_controller --host localhost --port 11111 --xprt sock --script myscript.sh
+
+SEE ALSO
+=====================
+
+ldmsd(8), ldmsctl(8), ldms_quickstart(7), ldmsd_failover(7),
+ldmsd_setgroup(7)
diff --git a/rtd/docs/source/ldms_man/ldmsd_decomposition.rst b/rtd/docs/source/ldms_man/ldmsd_decomposition.rst
new file mode 100644
index 000000000..0204971f1
--- /dev/null
+++ b/rtd/docs/source/ldms_man/ldmsd_decomposition.rst
@@ -0,0 +1,317 @@
+===================
+ldmsd_decomposition
+===================
+
+:Date: 2 Jun 2022
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+====================
+
+ldmsd_decomposition - manual for LDMSD decomposition
+
+DESCRIPTION
+===========================
+
+A decomposition is a routine that converts an LDMS set into one or more
+rows before feeding them to the store. Currently, only **store_sos**,
+**store_csv**, and **store_kafka** support decomposition. To use
+decomposition, simply specify the
+**decomposition=**\ *DECOMP_CONFIG_JSON_FILE* option in the
+**strgp_add** command. There are three types of decompositions:
+**static**, **as_is**, and **flex**. **static** decomposition statically
+and strictly decomposes an LDMS set according to the definitions in the
+*DECOMP_CONFIG_JSON_FILE*. **as_is** decomposition, on the other hand,
+takes all metrics and converts them as-is into rows. **flex**
+decomposition applies various decompositions by LDMS schema digest
+mapping from the configuration.
+
+Please see section **STATIC DECOMPOSITION**, **AS_IS DECOMPOSITION**,
+and **FLEX DECOMPOSITION** for more information.
+
+More decomposition types may be added in the future. The decomposition
+mechanism is pluggable.
Please see **as_is**, **static**, and **flex** +decomposition implementation in \`ldms/src/decomp/\` directory in the +source tree for more information. + +STATIC DECOMPOSITION +==================================== + +The **static** decomposition statically and strictly converts LDMS set +to one or more rows according to the *DECOMP_CONFIG_JSON_FILE*. The +format of the JSON configuration file is as follows: + +:: + + { + "type": "static", + "rows": [ + { + "schema": "OUTPUT_ROW_SCHEMA", + "cols": [ + { "src":"LDMS_METRIC_NAME", "dst":"OUTPUT_COL_NAME","type":"TYPE", + "array_len": ARRAY_LEN_IF_TYPE_IS_ARRAY, + "rec_member": "REC_MEMBER_NAME_IF_SRC_IS_RECORD", + "fill": "FILL_VALUE" + }, + ... + ], + "indices": [ + { "name":"INDEX_NAME", "cols":[ OUTPUT_COLUMNS, ... ] }, + ... + ] + }, + ... + ] + } + +The "rows" is an array of row definition object, each of which defines +an output row. The "schema" attribute specifies the output schema name, +which is the schema name used by the storage plugin to identify the row +schema. Each row definition contains "cols" which is a list of column +definitions, and "indices" which is a list of index definitions. Each +column definition is an object with at least "src" describing the metric +name, "dst" describing the output column name, and "type" describing the +value type of the column. If the type is an array, "array_len" is +required. If the "src" is a list of record, "rec_member" is required to +specify the record member for the output column. The "fill" value is +used to fill in the output column in the case that the "src" metric is +not present in the LDMS set (e.g. in the case of meminfo). + +Each index definition object contains "name" (the name of the index) and +"cols" which is the names of the OUTPUT columns comprising the index. + +The **"timestamp"**, **"producer"**, and **"instance"** are special +"src" that refer to update timestamp, producer name and instance name of +the set respectively. + +The following is an example of a static decomposition definition +converting meminfo set into two schemas, "meminfo_filter" (select a few +metrics) and "meminfo_directmap" (select a few direct map metrics with +"fill" since DirectMap varies by CPU architecture). + +:: + + { + "type": "static", + "rows": [ + { + "schema": "meminfo_filter", + "cols": [ + { "src":"timestamp", "dst":"ts", "type":"ts" }, + { "src":"producer", "dst":"prdcr", "type":"char_array", "array_len":64 }, + { "src":"instance", "dst":"inst", "type":"char_array", "array_len":64 }, + { "src":"component_id", "dst":"comp_id", "type":"u64" }, + { "src":"MemFree", "dst":"free", "type":"u64" }, + { "src":"MemActive", "dst":"active", "type":"u64" } + ], + "indices": [ + { "name":"time_comp", "cols":["ts", "comp_id"] }, + { "name":"time", "cols":["ts"] } + ] + }, + { + "schema": "meminfo_directmap", + "cols": [ + { "src":"timestamp", "dst":"ts", "type":"ts" }, + { "src":"component_id", "dst":"comp_id", "type":"u64" }, + { "src":"DirectMap4k", "dst":"directmap4k", "type":"u64", "fill": 0 }, + { "src":"DirectMap2M", "dst":"directmap2M", "type":"u64", "fill": 0 }, + { "src":"DirectMap4M", "dst":"directmap4M", "type":"u64", "fill": 0 }, + { "src":"DirectMap1G", "dst":"directmap1G", "type":"u64", "fill": 0 } + ], + "indices": [ + { "name":"time_comp", "cols":["ts", "comp_id"] }, + { "name":"time", "cols":["ts"] } + ] + } + ] + } + +The following is an example of a static decomposition with "rec_member" +usage. 
+
+::
+
+   {
+     "type": "static",
+     "rows": [
+       {
+         "schema": "netdev2_small",
+         "cols": [
+           { "src":"timestamp", "dst":"ts", "type":"ts" },
+           { "src":"producer", "dst":"prdcr", "type":"char_array", "array_len":64 },
+           { "src":"instance", "dst":"inst", "type":"char_array", "array_len":64 },
+           { "src":"component_id", "dst":"comp_id", "type":"u64" },
+           { "src":"netdev_list", "rec_member":"name",
+             "dst":"netdev.name", "type":"char_array", "array_len":16 },
+           { "src":"netdev_list", "rec_member":"rx_bytes",
+             "dst":"netdev.rx_bytes", "type":"u64" },
+           { "src":"netdev_list", "rec_member":"tx_bytes",
+             "dst":"netdev.tx_bytes", "type":"u64" }
+         ],
+         "indices": [
+           { "name":"time_comp", "cols":["ts", "comp_id"] },
+           { "name":"time", "cols":["ts"] }
+         ]
+       }
+     ]
+   }
+
+In this case, if the "netdev_list" has N members, the decomposition will
+expand the set into N rows.
+
+AS_IS DECOMPOSITION
+===================================
+
+The **as_is** decomposition generates rows as-is according to metrics in
+the LDMS set. To avoid schema conflicts, such as meminfo collected from
+heterogeneous CPU architectures, **as_is** decomposition appends the
+short LDMS schema digest (7 characters) to the row schema name before
+submitting the rows to the storage plugin. For example, the "meminfo"
+LDMS schema may turn into the "meminfo_8d2b8bd" row schema. The **as_is**
+decomposition configuration takes only the "indices" attribute, which
+defines indices for the output rows. When encountering a list of
+primitives, the as_is decomposition expands the set into multiple rows
+(the non-list metrics' values are repeated). When encountering a list of
+records, in addition to expanding rows, the decomposition also expands
+the record into multiple columns with the name formatted as
+"LIST_NAME.REC_MEMBER_NAME". The "timestamp" is not a metric in the set
+but it is used in all storage plugins. So, the "timestamp" column is
+prepended to each of the output rows.
+
+The format of the JSON configuration is as follows:
+
+::
+
+   {
+     "type": "as_is",
+     "indices": [
+       { "name": "INDEX_NAME", "cols": [ COLUMN_NAMES, ... ] },
+       ...
+     ]
+   }
+
+The following is an **as_is** decomposition configuration example with
+two indices:
+
+::
+
+   {
+     "type": "as_is",
+     "indices": [
+       { "name": "time", "cols": [ "timestamp" ] },
+       { "name": "time_comp", "cols": [ "timestamp", "component_id" ] }
+     ]
+   }
+
+FLEX DECOMPOSITION
+==================================
+
+The **flex** decomposition applies various decompositions by LDMS schema
+digests specified in the configuration. The configurations of the
+applied decompositions are also specified in the \`flex\` decomposition
+file as follows:
+
+::
+
+   {
+     "type": "flex",
+     /* defining decompositions to be applied */
+     "decomposition": {
+       "<DECOMP_NAME>": {
+         "type": "<DECOMP_TYPE>",
+         ...
+       },
+       ...
+     },
+     /* specifying digests and the decompositions to apply */
+     "digest": {
+       "<LDMS_DIGEST>": "<DECOMP_NAME>",
+       "<LDMS_DIGEST>": [ "<DECOMP_NAME>", "<DECOMP_NAME>" ],
+       ...
+       "*": "<DECOMP_NAME>" /* optional : the unmatched */
+     }
+   }
+
+**Example:** In the following example, the "meminfo" LDMS sets have 2
+digests due to different metrics from different architectures. The
+configuration then maps those digests to the "meminfo" static
+decomposition (producing "meminfo_filter" rows). It also showcases the
+ability to apply multiple decompositions to a matching digest. The
+procnetdev2 sets with digest
+"E8B9CC8D83FB4E5B779071E801CA351B69DCB9E9CE2601A0B127A2977F11C62A" will
+have the "netdev2" static decomposition and "the_default" as-is
+decomposition applied to them.
+The sets that do not match any specific
+digest will match the "\*" digest. In this example, "the_default" as-is
+decomposition is applied.
+
+::
+
+   {
+     "type": "flex",
+     "decomposition": {
+       "meminfo": {
+         "type": "static",
+         "rows": [
+           {
+             "schema": "meminfo_filter",
+             "cols": [
+               { "src":"timestamp", "dst":"ts", "type":"ts" },
+               { "src":"producer", "dst":"prdcr", "type":"char_array", "array_len":64 },
+               { "src":"instance", "dst":"inst", "type":"char_array", "array_len":64 },
+               { "src":"component_id", "dst":"comp_id", "type":"u64" },
+               { "src":"MemFree", "dst":"free", "type":"u64" },
+               { "src":"MemActive", "dst":"active", "type":"u64" }
+             ],
+             "indices": [
+               { "name":"time_comp", "cols":["ts", "comp_id"] },
+               { "name":"time", "cols":["ts"] }
+             ]
+           }
+         ]
+       },
+       "netdev2" : {
+         "type" : "static",
+         "rows": [
+           {
+             "schema": "procnetdev2",
+             "cols": [
+               { "src":"timestamp", "dst":"ts","type":"ts" },
+               { "src":"component_id", "dst":"comp_id","type":"u64" },
+               { "src":"netdev_list", "rec_member":"name", "dst":"dev.name",
+                 "type":"char_array", "array_len": 16 },
+               { "src":"netdev_list", "rec_member":"rx_bytes", "dst":"dev.rx_bytes",
+                 "type":"u64" },
+               { "src":"netdev_list", "rec_member":"tx_bytes", "dst":"dev.tx_bytes",
+                 "type":"u64" }
+             ],
+             "indices": [
+               { "name":"time_comp", "cols":["ts", "comp_id"] }
+             ]
+           }
+         ]
+       },
+       "the_default": {
+         "type": "as_is",
+         "indices": [
+           { "name": "time", "cols": [ "timestamp" ] },
+           { "name": "time_comp", "cols": [ "timestamp", "component_id" ] }
+         ]
+       }
+     },
+     "digest": {
+       "71B03E47E7C9033E359DB5225BC6314A589D8772F4BC0866B6E79A698C8799C0": "meminfo",
+       "59DD05D768CFF8F175496848486275822A6A9795286FD9B534FDB9434EAF4D50": "meminfo",
+       "E8B9CC8D83FB4E5B779071E801CA351B69DCB9E9CE2601A0B127A2977F11C62A": [ "netdev2", "the_default" ],
+       "*": "the_default"
+     }
+   }
+
+SEE ALSO
+========================
+
+Plugin_store_sos(7), Plugin_store_csv(7), Plugin_store_kafka(7)
diff --git a/rtd/docs/source/ldms_man/ldmsd_failover.rst b/rtd/docs/source/ldms_man/ldmsd_failover.rst
new file mode 100644
index 000000000..650c08a3d
--- /dev/null
+++ b/rtd/docs/source/ldms_man/ldmsd_failover.rst
@@ -0,0 +1,289 @@
+==============
+ldmsd_failover
+==============
+
+:Date: 13 Aug 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+===============
+
+ldmsd_failover - explanation, configuration, and commands for ldmsd
+failover
+
+SYNOPSIS
+===================
+
+failover_config
+   host=\ *HOST* port=\ *PORT* xprt=\ *XPRT* [peer_name=\ *NAME*]
+   [interval=\ *USEC*] [timeout_factor=\ *FLOAT*] [auto_switch=\ *0|1*]
+
+failover_start
+
+failover_stop
+
+failover_status
+
+failover_peercfg_start
+
+failover_peercfg_stop
+
+DESCRIPTION
+======================
+
+**ldmsd** can be configured to form a failover pair with another
+**ldmsd**. In a nutshell, when a failover pair is formed, the daemons
+exchange their updater and producer configuration so that when one goes
+down, the other will take over the LDMS set aggregation load
+(**failover**).
+
+A **ping-echo** mechanism is used to detect service unavailability.
+Each ldmsd in the pair sends ping requests to the other; the peer echoes
+back along with its status. When the echo has not been received within
+the timeout period (see below), the peer configuration is automatically
+started (failover).
+
+The following paragraphs explain ldmsd configuration commands relating
+to the ldmsd failover feature.
+
+**failover_config** configures the failover feature in an ldmsd. The
+failover service must be stopped before configuring it.
+The following list
+describes the command parameters.
+
+   host=HOST
+      The hostname of the failover partner. This is optional in
+      re-configuration.
+
+   port=PORT
+      The LDMS port of the failover partner. This is optional in
+      re-configuration.
+
+   xprt=XPRT
+      The LDMS transport type (sock, rdma, or ugni) of the failover
+      partner. This is optional in re-configuration.
+
+   peer_name=NAME
+      (Optional) The ldmsd name of the failover partner (please see
+      option **-n** in **ldmsd**\ (8)). If this is specified, the ldmsd
+      will only accept a pairing with another ldmsd with a matching
+      name. Otherwise, the ldmsd will pair with any ldmsd requesting a
+      failover pairing.
+
+   interval=USEC
+      (Optional) The interval (in micro-seconds) for ping and transport
+      re-connecting. The default is 1000000 (1 sec).
+
+   timeout_factor=FLOAT
+      (Optional) The echo timeout factor. The echo timeout is calculated
+      by **timeout_factor \* interval**. The default is 2.
+
+   auto_switch=0|1
+      (Optional) If this is on (1), ldmsd will start **peercfg** or stop
+      **peercfg** automatically. Otherwise, the user needs to issue
+      **failover_peercfg_start** or **failover_peercfg_stop** manually.
+      By default, this value is 1.
+
+**failover_start** is a command to start the (configured) failover
+service. After the failover service has started, it will pair with the
+peer, retrieving peer configurations and starting peer configurations
+when it believes that the peer is not in service (with
+\`auto_switch=1\`, otherwise it does nothing).
+
+Please also note that when the failover service is in use (after
+**failover_start**), prdcr, updtr, and strgp cannot be altered over the
+in-band configuration (start, stop, or reconfigure). The failover
+service must be stopped (**failover_stop**) before altering those
+configuration objects.
+
+**failover_stop** is a command to stop the failover service. When the
+service is stopped, the peer configurations will also be stopped and
+removed from the local memory. The peer also won't be able to pair with
+the local ldmsd when the failover service is stopped. Issuing
+**failover_stop** after the pairing process succeeded will stop the
+failover service on both daemons in the pair.
+
+**failover_status** is a command to report (via **ldmsd_controller**)
+the failover statuses.
+
+**failover_peercfg_start** is a command to manually start peer
+configuration. Please note that if the **auto_switch** is 1, the ldmsd
+will automatically stop peer configuration when it receives the echo
+from the peer.
+
+**failover_peercfg_stop** is a command to manually stop peer
+configuration. Please note that if the **auto_switch** is 1, the ldmsd
+will automatically start peercfg when the echo has timed out.
+
+FAILOVER: AUTOMATIC PEERCFG ACTIVATION
+=================================================
+
+The peer configuration is automatically activated when an echo-timeout
+event occurs (with \`auto_switch=1\`). The echo-timeout is calculated
+based on the ping interval, the ping-echo round-trip time,
+\`timeout_factor\`, and the moving standard deviation of the ping-echo
+round-trip time as follows:
+
+rt_time[N] is an array of the last N ping-echo round-trip times.
+
+::
+
+   base     = max( max(rt_time), ping_interval )
+   timeout1 = base + 4 * SD(rt_time)
+   timeout2 = base * timeout_factor
+
+   timeout  = max( timeout1, timeout2 )
+
+EXAMPLES
+===================
+
+Let's consider the following setup:
+
+::
+
+   .-------.
+ | a20 | + |-------| + | s00/a | + | s00/b | + | s01/a | + | s01/b | + | s02/a | + | s02/b | + | s03/a | + | s03/b | + '-------' + ^ + | + .-----------'-----------. + | | + .-------. .-------. + | a10 | | a11 | + |-------| |-------| + | s00/a | pair | s02/a | + | s00/b |...............| s02/b | + | s01/a | | s03/a | + | s01/b | | s03/b | + '-------' '-------' + ^ ^ + | | + .----'---. .-'------. + | | | | + .-------..-------. .-------..-------. + | s00 || s01 | | s02 || s03 | + |-------||-------| |-------||-------| + | s00/a || s01/a | | s02/a || s03/a | + | s00/b || s01/b | | s02/b || s03/b | + '-------''-------' '-------''-------' + +In this setup, we have 4 sampler daemons (*s00* - *s03*), 2 level-1 +aggregator (*a10*, *a11*), and 1 level-2 aggregator (*a20*). Each +sampler daemon contain set *a* and set *b*, which are prefixed by the +sampler daemon name. The level-1 aggregators are configured to be a +failover pair, aggregating sets from the sampler daemons as shown in the +picture. And the level-2 aggregator is configured to aggregate sets from +the level-1 aggregators. + +The following is a list of configuration and CLI options to achieve the +setup shown above: + +:: + + # a20.cfg + prdcr_add name=prdcr_a10 host=a10.hostname port=12345 xprt=sock \ + type=active interval=1000000 + prdcr_start name=prdcr_a10 + prdcr_add name=prdcr_a11 host=a11.hostname port=12345 xprt=sock \ + type=active interval=1000000 + prdcr_start name=prdcr_a11 + updtr_add name=upd interval=1000000 offset=0 + updtr_prdcr_add name=upd regex.* + updtr_start upd + + # a10.cfg + prdcr_add name=prdcr_s00 host=s00.hostname port=12345 xprt=sock \ + type=active interval=1000000 + prdcr_start name=prdcr_s00 + prdcr_add name=prdcr_s01 host=s01.hostname port=12345 xprt=sock \ + type=active interval=1000000 + prdcr_start name=prdcr_s01 + updtr_add name=upd interval=1000000 offset=0 + updtr_prdcr_add name=upd regex.* + updtr_start upd + failover_config host=a11.hostname port=12345 xprt=sock \ + interval=1000000 peer_name=a11 + failover_start + # a10 CLI + $ ldmsd -c a10.cfg -x sock:12345 -n a10 + # name this daemon "a10" + + # a11.cfg + prdcr_add name=prdcr_s02 host=s02.hostname port=12345 xprt=sock \ + type=active interval=1000000 + prdcr_start name=prdcr_s02 + prdcr_add name=prdcr_s03 host=s03 port=12345 xprt=sock \ + type=active interval=1000000 + prdcr_start name=prdcr_s03 + updtr_add name=upd interval=1000000 offset=0 + updtr_prdcr_add name=upd regex.* + updtr_start upd + failover_config host=a10.hostname port=12345 xprt=sock \ + interval=1000000 peer_name=a10 + failover_start + # a11 CLI + $ ldmsd -c a11 -x sock:12345 -n a11 + # name this daemon "a11" + + # sampler config are omitted (irrelevant). + +With this setup, when *a10* died, *a11* will start aggregating sets from +*s00* and *s01*. When this is done, *a20* will still get all of the sets +through *a11* depicted in the following figure. + +:: + + .-------. + | a20 | + |-------| + | s00/a | + | s00/b | + | s01/a | + | s01/b | + | s02/a | + | s02/b | + | s03/a | + | s03/b | + '-------' + ^ + | + '-----------. + | + xxxxxxxxx .-------. + x a10 x | a11 | + x-------x |-------| + x s00/a x | s00/a | + x s00/b x | s00/b | + x s01/a x | s01/a | + x s01/b x | s01/b | + xxxxxxxxx | s02/a | + | s02/b | + | s03/a | + | s03/b | + '-------' + ^ + | + .--------.-----------------.-'------. + | | | | + .-------..-------. .-------..-------. 
+ | s00 || s01 | | s02 || s03 | + |-------||-------| |-------||-------| + | s00/a || s01/a | | s02/a || s03/a | + | s00/b || s01/b | | s02/b || s03/b | + '-------''-------' '-------''-------' + +When *a10* heartbeat is back, *a11* will stop its producers/updaters +that were working in place of *a10*. The LDMS network is then recovered +back to the original state in the first figure. + +SEE ALSO +=================== + +**ldmsd**\ (8), **ldms_quickstart**\ (7), **ldmsd_controller**\ (8) diff --git a/rtd/docs/source/ldms_man/ldmsd_setgroup.rst b/rtd/docs/source/ldms_man/ldmsd_setgroup.rst new file mode 100644 index 000000000..f9bdecbaf --- /dev/null +++ b/rtd/docs/source/ldms_man/ldmsd_setgroup.rst @@ -0,0 +1,236 @@ +============== +ldmsd_setgroup +============== + +:Date: 5 Jul 2018 + +.. contents:: + :depth: 3 +.. + +NAME +=============== + +ldmsd_setgroup - explanation, configuration, and commands for ldmsd set +group + +SYNOPSIS +=================== + +setgroup_add + name=\ *GROUP_NAME* [producer=\ *PRODUCER*] [interval=\ *USEC*] + [offset=\ *USEC*] + +setgroup_mod + name=\ *GROUP_NAME* [interval=\ *USEC*] [offset=\ *USEC*] + +setgroup_del + name=\ *GROUP_NAME* + +setgroup_ins + name=\ *GROUP_NAME* instance=\ *COMMA_SEPARATED_LIST_OF_INSTANCES* + +setgroup_rm + name=\ *GROUP_NAME* instance=\ *COMMA_SEPARATED_LIST_OF_INSTANCES* + +DESCRIPTION +====================== + +An **ldmsd setgroup** (referred to as **setgroup** for short) is an +**ldms_set** with special information for LDMS daemon (**ldmsd**). The +setgroup information contains a list of other sets so that the LDMSD +**updtr** can update all the sets in the collection at once +(iteratively). This will help administrators in configuration, and help +sampler plugin developer to manage their collection of sets. For an +example usage of \`ldmsd_group\_*\` APIs, please see **grptest.c**, and +\`ldmsd_group\_*()\` declarations (with doxygen doc) in **ldmsd.h**. In +this manual page, we will focus on LDMSD commands that manage the +setgroup from the configuration side. The description for each command +and its parameters is as follows. + +**setgroup_add** adds (creates) a new setgroup. The following list +describes the command parameters: + + name=GROUP_NAME + The name of the setgroup. + + [producer=PRODUCER] + (Optional) The producer name of the setgroup. If not set, the name + of the LDMSD (the **-n** option) is used. + + [interval=USEC] + (Optional) The micro-second update interval hint. + + [offset=USEC] + (Optional) The micro-second update offset hint. + +**setgroup_mod** modifies (mutable) attributes of the setgroup. The list +of parameters is as follows: + + name=GROUP_NAME + The name of the setgroup. + + [interval=USEC] + (Optional) The micro-second update interval hint. + + [offset=USEC] + (Optional) The micro-second update offset hint. + +**setgroup_ins** inserts a list of set instances into the setgroup. + + name=GROUP_NAME + The name of the setgroup. + + instance=COMMA_SEPARATED_LIST_OF_INSTANCES + A comma-separated list of set instances. + +**setgroup_rm** removes a list of set instances from the setgroup. + + name=GROUP_NAME + The name of the setgroup. + + instance=COMMA_SEPARATED_LIST_OF_INSTANCES + A comma-separated list of set instances. + +**setgroup_del** deletes the setgroup. + + name=GROUP_NAME + The name of the setgroup. + +EXAMPLE +================== + +In this example, we will have 2 **ldmsd**'s, namely **sampler** and +**aggregator** for the sampler daemon and the aggregator daemon +respectively. 
The sampler will have \`meminfo\`, \`set_0\`, \`set_1\`,
+\`set_2\`, and \`set_3\` as its regular sets. \`thegroup\` will be the
+setgroup created in the sampler that contains \`meminfo\` and \`set_0\`.
+The aggregator will be set up to update only \`thegroup\`.
+
+::
+
+ ### sampler.conf
+ # It is OK to add the group first. Also note that our group has no
+ # update hint, so that the updater in the aggregator can control its update
+ # interval.
+ setgroup_add name=thegroup
+
+ # Insert meminfo and set_0 into the group
+ setgroup_ins name=thegroup instance=meminfo,set_0
+
+ # test_sampler will generate a bunch of sets; with this config it will create
+ # set_0, set_1, set_2, set_3
+ load name=test_sampler
+ config name=test_sampler producer=sampler \
+ action=default \
+ base=set \
+ num_sets=4 \
+ push=0
+ start name=test_sampler interval=1000000 offset=0
+ # meminfo
+ load name=meminfo
+ config name=meminfo producer=sampler \
+ instance=meminfo
+ start name=meminfo interval=1000000 offset=0
+ ### END OF sampler.conf
+
+ ### aggregator.conf
+ # Normal producer setup
+ prdcr_add name=prdcr host=localhost port=10001 xprt=sock \
+ interval=1000000 \
+ type=active
+ prdcr_start name=prdcr
+ # Setup the \`grp_updtr\` so that it only updates \`thegroup\`.
+ updtr_add name=grp_updtr interval=1000000 offset=500000
+ updtr_match_add name=grp_updtr regex=thegroup
+ updtr_prdcr_add name=grp_updtr regex=prdcr
+ updtr_start name=grp_updtr
+ ### END OF aggregator.conf
+
+The daemons can be started with the following commands:
+
+::
+
+ # For sampler, foreground start
+ $ ldmsd -F -c sampler.conf -x sock:10001
+ # For aggregator, foreground start
+ $ ldmsd -F -c aggregator.conf -x sock:10000
+
+When listing the sets on the aggregator with the **-v** option, you'll see
+that only \`meminfo\` and \`set_0\` are recent. \`thegroup\` is only
+updated when its information changes. The rest of the sets have only been
+looked up, not updated.
+
+::
+
+ $ ldms_ls -x sock -p 10000 -v | grep update
+ thegroup: consistent, last update: Thu Jul 05 16:22:08 2018 [303411us]
+ set_3: inconsistent, last update: Wed Dec 31 18:00:00 1969 [0us]
+ set_2: inconsistent, last update: Wed Dec 31 18:00:00 1969 [0us]
+ set_1: inconsistent, last update: Wed Dec 31 18:00:00 1969 [0us]
+ set_0: consistent, last update: Thu Jul 05 16:36:30 2018 [1793us]
+ meminfo: consistent, last update: Thu Jul 05 16:36:31 2018 [1946us]
+
+When listing the sets on the sampler, by contrast, we will see all of them
+being updated (except \`thegroup\`).
+
+::
+
+ thegroup: consistent, last update: Thu Jul 05 16:22:08 2018 [303411us]
+ set_3: consistent, last update: Thu Jul 05 16:39:52 2018 [1915us]
+ set_2: consistent, last update: Thu Jul 05 16:39:52 2018 [1916us]
+ set_1: consistent, last update: Thu Jul 05 16:39:53 2018 [1948us]
+ set_0: consistent, last update: Thu Jul 05 16:39:53 2018 [1948us]
+ meminfo: consistent, last update: Thu Jul 05 16:39:53 2018 [2022us]
+
+**Removing/inserting** instances from/into the group can also be done
+interactively via **ldmsd_controller**. If we do the following on the
+**sampler**:
+
+::
+
+ $ ldmsd_controller --port 10001
+ Welcome to the LDMSD control processor
+ sock:localhost:10001> setgroup_rm name=thegroup instance=set_0
+ sock:localhost:10001> setgroup_ins name=thegroup instance=set_3
+
+\`set_0\` will be removed from \`thegroup\`, and \`set_3\` will be added
+into \`thegroup\`. Listing the sets on the **aggregator** will then show
+that \`set_0\` has stopped being updated, and \`set_3\` has become recent.
+
+::
+
+ thegroup: consistent, last update: Thu Jul 05 16:42:12 2018 [378918us]
+ set_3: consistent, last update: Thu Jul 05 16:42:14 2018 [2070us]
+ set_2: inconsistent, last update: Wed Dec 31 18:00:00 1969 [0us]
+ set_1: inconsistent, last update: Wed Dec 31 18:00:00 1969 [0us]
+ set_0: consistent, last update: Thu Jul 05 16:41:25 2018 [1116us]
+ meminfo: consistent, last update: Thu Jul 05 16:42:15 2018 [1223us]
+
+The **members** of the group can be **listed** by the following:
+
+::
+
+ $ ldms_ls -x sock -p 10000 -v thegroup
+ thegroup: consistent, last update: Thu Jul 05 16:42:12 2018 [378918us]
+ APPLICATION SET INFORMATION ------
+ grp_member: set_3 : -
+ grp_member: meminfo : -
+ ldmsd_grp_gn : 8
+ METADATA --------
+ Producer Name : a:10001
+ Instance Name : thegroup
+ Schema Name : ldmsd_grp_schema
+ Size : 184
+ Metric Count : 1
+ GN : 1
+ User : root(0)
+ Group : root(0)
+ Permissions : -rwxrwxrwx
+ DATA ------------
+ Timestamp : Thu Jul 05 16:42:12 2018 [378918us]
+ Duration : [0.000017s]
+ Consistent : TRUE
+ Size : 64
+ GN : 8
+ -----------------
diff --git a/rtd/docs/source/ldms_man/ldmsd_stream_publish.rst b/rtd/docs/source/ldms_man/ldmsd_stream_publish.rst
new file mode 100644
index 000000000..71055ad6a
--- /dev/null
+++ b/rtd/docs/source/ldms_man/ldmsd_stream_publish.rst
@@ -0,0 +1,131 @@
+====================
+ldmsd_stream_publish
+====================
+
+:Date: 21 Aug 2021
+
+.. contents::
+ :depth: 3
+..
+
+NAME
+=====================
+
+ldmsd_stream_publish - man page for the LDMS ldmsd_stream_publish
+executable utility
+
+SYNOPSIS
+=========================
+
+At the command line: ldmsd_stream_publish [args]
+
+DESCRIPTION
+============================
+
+The ldmsd_stream_publish executable publishes to the ldmsd_streams
+interface of a running ldms daemon. It takes a file as input and
+publishes it either in bulk or line by line, reusing the connection
+for all the messages.
+
+COMMAND LINE SYNTAX
+====================================
+
+ldmsd_stream_publish -x -h -p -s -a -A -t -f [-l]
+ |
+
+ -x
+ |
+ | transport of the ldmsd to which to connect.
+
+ -h
+ |
+ | host of the ldmsd to which to connect.
+
+ -p
+ |
+ | port of the ldmsd to which to connect.
+
+ -a
+ |
+ | auth to connect to the ldmsd.
+
+ -A
+ |
+ | auth-opts to connect to the ldmsd.
+
+ -s
+ |
+ | Name of the stream (this will be used for subscribing).
+
+ -t
+ |
+ | Optional data-format. Either 'string' or 'json'. Default is
+ string.
+
+ -l
+ |
+ | Optional line mode. Publishes the file one line at a time as
+ separate publish calls.
+
+ -f
+ |
+ | File that is published. If not specified, input is copied from
+ STDIN.
+
+ -r N
+ |
+ | Repeat the publication of the file N times, with a delay
+ interval specified by -i. Repeating is not supported unless the
+ input is a file. If the -l option is given, the file and
+ connection are opened once and the lines are replayed to
+ individual ldmsd_stream_publish calls. If -l is not given, the
+ ldmsd_stream_publish_file call is used, resulting in multiple
+ connection openings.
+
+ -i interval_in_microseconds
+ |
+ | Change the default delay (usleep(interval_in_microseconds)) used
+ if repeat is specified.
+
+BUGS
+=====================
+
+No known bugs.
+
+NOTES
+======================
+
+This executable is in development and may change at any time.
+
+The difference in repeat behavior if -l is present allows for testing
+two scenarios: repeating many messages to a single connection and
+repeating connection attempts to a daemon that may come and go during
+publication attempts.
Environment variables LDMSD_STREAM_CONN_TIMEOUT +and LDMSD_STREAM_ACK_TIMEOUT will affect the timing of the repeat loop +when -l is not given. + +EXAMPLES +========================= + +Within ldmsd_controller or a configuration file: + +:: + + load name=hello_sampler + config name=hello_sampler producer=host1 instance=host1/hello_sampler stream=foo component_id=1 + start name=hello_sampler interval=1000000 offset=0 + +:: + + > cat testdata.10.out + { "seq": 0, "job-id" : 10364, "rank" : 1, "kokkos-perf-data" : [ {"name" : "SPARTAFOO0", "count": 0, "time": 0.0000},{"name" : "SPARTAFOO1", "count": 1, "time": 0.0001},{"name" : "SPARTAFOO2", "count": 2, "time": 0.0002},{"name" : "SPARTAFOO3", "count": 3, "time": 0.0003},{"name" : "SPARTAFOO4", "count": 4, "time": 0.0004},{"name" : "SPARTAFOO5", "count": 5, "time": 0.0005},{"name" : "SPARTAFOO6", "count": 6, "time": 0.0006},{"name" : "SPARTAFOO7", "count": 7, "time": 0.0007},{"name" : "SPARTAFOO8", "count": 8, "time": 0.0008},{"name" : "SPARTAFOO9", "count": 9, "time": 0.0009}] } + +:: + + > ldmsd_stream_publish -x sock -h localhost -p 52001 -s foo -t json -f ./testdata.10.out -a none + + + In the log file of the ldmsd: + > cat log.txt + Sat Aug 21 18:15:27 2021: CRITICAL : stream_type: JSON, msg: "{ "seq": 0, "job-id" : 10364, "rank" : 1, "kokkos-perf-data" : [ {"name" : "SPARTAFOO0", "count": 0, "time": 0.0000},{"name" : "SPARTAFOO1", "count": 1, "time": 0.0001},{"name" : "SPARTAFOO2", "count": 2, "time": 0.0002},{"name" : "SPARTAFOO3", "count": 3, "time": 0.0003},{"name" : "SPARTAFOO4", "count": 4, "time": 0.0004},{"name" : "SPARTAFOO5", "count": 5, "time": 0.0005},{"name" : "SPARTAFOO6", "count": 6, "time": 0.0006},{"name" : "SPARTAFOO7", "count": 7, "time": 0.0007},{"name" : "SPARTAFOO8", "count": 8, "time": 0.0008},{"name" : "SPARTAFOO9", "count": 9, "time": 0.0009},{"name" : "SPARTAFOO10", "count": 10, "time": 0.00010}] }", msg_len: 589, entity: 0x2aaab8004680 + + Note that the hello_streams sampler does not do a sample, instead it subscribes to the stream with a callback and prints out what it got off the stream. + +SEE ALSO +========================= + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7), +Plugin_hello_sampler(7), Plugin_stream_csv_store(7) diff --git a/rtd/docs/source/ldms_man/ldmsd_stream_subscribe.rst b/rtd/docs/source/ldms_man/ldmsd_stream_subscribe.rst new file mode 100644 index 000000000..5829c2c99 --- /dev/null +++ b/rtd/docs/source/ldms_man/ldmsd_stream_subscribe.rst @@ -0,0 +1,125 @@ +====================== +ldmsd_stream_subscribe +====================== + +:Date: 21 Aug 2021 + +.. contents:: + :depth: 3 +.. + +NAME +======================= + +ldmsd_stream_subscribe - man page for the LDMS ldmsd_stream_subscribe +utility + +SYNOPSIS +=========================== + +At the command line: ldmsd_stream_subscribe [args] + +DESCRIPTION +============================== + +The ldmsd_stream_subscribe program subscribes to a stream in place of a +full ldmsd daemon, writing received messages to a file or to stdout. + +COMMAND LINE SYNTAX +====================================== + +ldmsd_stream_subscribe -x -h -p -s -a -A -f -D -i -R -q -E + | + + -x,--xprt + | + | transport type on which to listen. + + -p,--port + | + | port on which to listen. + + -h,--host + | + | hostname or IP address of interface on which to listen. + + -a,--auth + | + | authentication to expect from publishers. + + -A,--auth_arg + | + | auth options if needed (for e.g. 
ovis auth or munge on unusual
+ port)
+
+ -s,--stream
+ |
+ | Name of the stream to subscribe.
+
+ -f,--file
+ |
+ | File where delivered messages are written. If not specified,
+ STDOUT.
+
+ -E,--events-raw
+ |
+ | Suppress delivery envelope information in message output.
+
+ -q,--quiet
+ |
+ | Suppress message output to file or stdout entirely.
+
+ -D,--daemonize
+ |
+ | Put the process in the background as a daemon.
+
+ -R,--daemon-noroot
+ |
+ | Prevent file system root (/) change-directory when starting the
+ daemon. (Does nothing if -D is not present.)
+
+ -i,--daemon-io
+ |
+ | Keep the input and output file descriptors attached to the
+ daemon instead of closing them. (Does nothing if -D is not
+ present.)
+
+BUGS
+=======================
+
+No known bugs.
+
+NOTES
+========================
+
+This program is in development and may change at any time.
+
+Using "-a none" is insecure and should only be used with care.
+
+EXAMPLES
+===========================
+
+Running in user mode as a sink to test a stream publishing program
+writing to tag 'mystream':
+
+::
+
+ ldmsd_stream_subscribe -x sock -h 127.0.0.1 -p 20411 -s mystream -a none -f messages.out -D -R
+
+Running in root mode and testing on port 511:
+
+::
+
+ ldmsd_stream_subscribe -x sock -h 127.0.0.1 -p 511 -s mystream -a munge -f /var/log/ldms-stream/messages.out -D
+
+Sending data to a listening subscriber:
+
+::
+
+ echo '{ "a": "worthless message"}' | ./ldmsd_stream_publish -x sock -h 127.0.0.1 -p 20411 -s mystream -a none -t json
+
+SEE ALSO
+===========================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_stream_publish(8),
+ldms_authentication(7)
diff --git a/rtd/docs/source/ldms_man/lsdate.rst b/rtd/docs/source/ldms_man/lsdate.rst
new file mode 100644
index 000000000..1d4f61fa3
--- /dev/null
+++ b/rtd/docs/source/ldms_man/lsdate.rst
@@ -0,0 +1,37 @@
+=========
+LSDATE
+=========
+
+:Date: June 2018
+
+.. contents::
+ :depth: 3
+..
+
+NAME
+=======
+
+lsdate - list directory contents with UTC timestamp suffix translation
+
+SYNOPSIS
+===========
+
+**lsdate** [*OPTION*]... [*FILE*]...
+
+DESCRIPTION
+==============
+
+Execute ls(1) and apply an output filter to reveal the calendar date of
+timestamp-suffixed files, such as those produced by LDMS CSV stores.
+Timestamps are assumed to be seconds since the epoch. Fractional seconds
+are not supported.
+
+SEE ALSO
+===========
+
+ls(1), Plugin_store_csv(7)
+
+NOTES
+========
+
+The output of lsdate -s and the output of lsdate -l may be surprising.
diff --git a/rtd/docs/source/ldmscon.rst b/rtd/docs/source/ldmscon.rst
new file mode 100644
index 000000000..dc06f17eb
--- /dev/null
+++ b/rtd/docs/source/ldmscon.rst
@@ -0,0 +1,70 @@
+LDMS Users Group Conference (LDMSCON)
+#####################################
+The LDMS Users Group Conference (LDMSCON) serves as a forum for users to share experiences with the LDMS product.
+
+About
+**********
+You can find general information and previous conferences on the following webpage:
+`LDMS Users Group Conference`_.
+
+.. _LDMS Users Group Conference: https://sites.google.com/view/ldmscon
+
+Please visit the webpage above to stay up to date on tutorials, presentations and discussions about LDMS and its software capabilities.
+
+------------
+
+LDMSCON2023
+************
+The following attachment contains the scripts and commands used in the LDMSCON2023 Basics powerpoint presentation.
+
+**Please DOWNLOAD THE FOLLOWING .ZIP FILE to easily follow along with the tutorial.**
+
+:download: `ldmscon2023_directory.zip `_
+
+Recordings of previous presentations, tutorials and information for LDMSCON2023 can be found here:
+`LDMSCON2023 `_
+
+* The tutorial videos and slides can be viewed under the "Tutorials" tab at the top right.
+.. image:: images/ldmscon/ldmscon2023pres.PNG
+ :width: 200
+* The presentation videos and slides can be viewed under the "Presentations" tab at the top right.
+.. image:: images/ldmscon/ldmscon2023tutorial.png
+ :width: 200
+
+.. note::
+
+ **If the directory ``ldmscon2023`` is not extracted under ``/root/``** then please keep in mind that **any reference to ``/root/``** in the powerpoint presentation, and in the following files, **will need to be changed to the absolute path of ``ldmscon2023/``**.
+
+ * ``../conf/e3/agg_store_csv.conf``
+ * ``../scripts/e1/start_ldms_simple.sh``
+ * ``../scripts/e2/start_ldms_simple_agg.sh``
+ * ``../scripts/e3/start_agg_store_csv.sh``
+ * ``../scripts/e3/store_csv.txt``
+
+.. note::
+ The files under ``../scripts/e*`` are not used in the tutorial; rather, they are the commands/steps used for each exercise. They demonstrate LDMS's ability to configure and initialize its daemons with a single bash script.
+
+.. note::
+ These scripts must be run in a directory that is readable and writable. Otherwise the log/data file generation will not work.
+
+LDMSCON2022
+************
+Recordings of previous presentations, tutorials and information for LDMSCON2022 can be found here:
+`LDMSCON2022 `_
+
+* The tutorial videos and slides can be viewed under the "Tutorials" tab at the top right.
+.. image:: images/ldmscon/ldmscon2022pres.PNG
+ :width: 200
+* The presentation videos and slides can be viewed under the "Presentations" tab at the top right.
+.. image:: images/ldmscon/ldmscon2022tutorial.PNG
+ :width: 200
+
+Archived LDMSCONs
+*****************
+All LDMSCONs prior to 2022 have been archived and can be found here:
+`Archived LDMSCONs `_
+
+
diff --git a/rtd/docs/source/sampler_man/Plugin_app_sampler.rst b/rtd/docs/source/sampler_man/Plugin_app_sampler.rst
new file mode 100644
index 000000000..79459414f
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_app_sampler.rst
@@ -0,0 +1,311 @@
+==================
+Plugin_app_sampler
+==================
+
+:Date: 30 Sep 2019
+
+.. contents::
+ :depth: 3
+..
+
+NAME
+====================
+
+ldmsd_app_sampler - LDMSD app_sampler plugin
+
+SYNOPSIS
+========================
+
+**config** **name=app_sampler** **producer=**\ *PRODUCER*
+**instance=**\ *INSTANCE* [ **schema=\ SCHEMA** ] [
+**component_id=\ COMPONENT_ID** ] [ **stream=\ STREAM_NAME** ] [
+**metrics=\ METRICS** ] [ **cfg_file=\ PATH** ]
+
+DESCRIPTION
+===========================
+
+**``app_sampler``** collects metrics from **``/proc/``** according
+to the current SLURM jobs/tasks running on the system. **``app_sampler``**
+depends on the **``slurm_notifier``** SPANK plugin to send SLURM job/task
+events over **``ldmsd_stream``** (**``stream``** option, default:
+slurm). A set is created per task when the task starts, named in the
+following format: **``PRODUCER_NAME/JOB_ID/TASK_PID``**. The set is
+deleted when the task exits.
+
+By default **``app_sampler``** samples all available metrics (see the
+**``LIST OF METRICS``** section).
Users may down-select the list of
+metrics to monitor by specifying the **``metrics``** option (a comma-separated
+string) or by writing a JSON configuration file and specifying the
+**``cfg_file``** option (see the **``EXAMPLES``** section).
+
+CONFIG OPTIONS
+==============================
+
+name
+ Must be app_sampler.
+
+producer
+ The name of the data producer (e.g. hostname).
+
+instance
+ This is required by sampler_base but is not used by app_sampler. So,
+ this can be any string but must be present.
+
+schema
+ The optional schema name (default: app_sampler).
+
+component_id
+ An integer identifying the component (default: *0*).
+
+stream
+ The name of the **``ldmsd_stream``** to listen to for SLURM job events
+ (default: slurm).
+
+metrics
+ The comma-separated list of metrics to monitor. The default is ''
+ (empty), which is equivalent to monitoring ALL metrics.
+
+cfg_file
+ The alternative config file in JSON format. The file is expected to
+ have an object that may contain the following attributes:
+
+..
+
+ ::
+
+ {
+ "stream": "STREAM_NAME",
+ "metrics": [ METRICS ]
+ }
+
+The default values are assumed for the attributes that are not
+specified. Attributes other than 'stream' and 'metrics' are ignored.
+
+If the **``cfg_file``** is given, the **``stream``** and **``metrics``**
+options are ignored.
+
+LIST OF METRICS
+===============================
+
+ ::
+
+ /* from /proc/[pid]/cmdline */
+ cmdline_len,
+ cmdline,
+
+ /* the number of open files */
+ n_open_files,
+
+ /* from /proc/[pid]/io */
+ io_read_b,
+ io_write_b,
+ io_n_read,
+ io_n_write,
+ io_read_dev_b,
+ io_write_dev_b,
+ io_write_cancelled_b,
+
+ /* /proc/[pid]/oom_score */
+ oom_score,
+
+ /* /proc/[pid]/oom_score_adj */
+ oom_score_adj,
+
+ /* path of /proc/[pid]/root */
+ root,
+
+ /* /proc/[pid]/stat */
+ stat_pid,
+ stat_comm,
+ stat_state,
+ stat_ppid,
+ stat_pgrp,
+ stat_session,
+ stat_tty_nr,
+ stat_tpgid,
+ stat_flags,
+ stat_minflt,
+ stat_cminflt,
+ stat_majflt,
+ stat_cmajflt,
+ stat_utime,
+ stat_stime,
+ stat_cutime,
+ stat_cstime,
+ stat_priority,
+ stat_nice,
+ stat_num_threads,
+ stat_itrealvalue,
+ stat_starttime,
+ stat_vsize,
+ stat_rss,
+ stat_rsslim,
+ stat_startcode,
+ stat_endcode,
+ stat_startstack,
+ stat_kstkesp,
+ stat_kstkeip,
+ stat_signal,
+ stat_blocked,
+ stat_sigignore,
+ stat_sigcatch,
+ stat_wchan,
+ stat_nswap,
+ stat_cnswap,
+ stat_exit_signal,
+ stat_processor,
+ stat_rt_priority,
+ stat_policy,
+ stat_delayacct_blkio_ticks,
+ stat_guest_time,
+ stat_cguest_time,
+ stat_start_data,
+ stat_end_data,
+ stat_start_brk,
+ stat_arg_start,
+ stat_arg_end,
+ stat_env_start,
+ stat_env_end,
+ stat_exit_code,
+
+ /* from /proc/[pid]/status */
+ status_name,
+ status_umask,
+ status_state,
+ status_tgid,
+ status_ngid,
+ status_pid,
+ status_ppid,
+ status_tracerpid,
+ status_uid,
+ status_real_user,
+ status_eff_user,
+ status_sav_user,
+ status_fs_user,
+ status_gid,
+ status_real_group,
+ status_eff_group,
+ status_sav_group,
+ status_fs_group,
+ status_fdsize,
+ status_groups,
+ status_nstgid,
+ status_nspid,
+ status_nspgid,
+ status_nssid,
+ status_vmpeak,
+ status_vmsize,
+ status_vmlck,
+ status_vmpin,
+ status_vmhwm,
+ status_vmrss,
+ status_rssanon,
+ status_rssfile,
+ status_rssshmem,
+ status_vmdata,
+ status_vmstk,
+ status_vmexe,
+ status_vmlib,
+ status_vmpte,
+ status_vmpmd,
+ status_vmswap,
+ status_hugetlbpages,
+ status_coredumping,
+ status_threads,
+ status_sig_queued,
+ status_sig_limit,
+ status_sigpnd,
+ status_shdpnd,
+ status_sigblk,
+ status_sigign,
+ status_sigcgt,
+ 
status_capinh, + status_capprm, + status_capeff, + status_capbnd, + status_capamb, + status_nonewprivs, + status_seccomp, + status_speculation_store_bypass, + status_cpus_allowed, + status_cpus_allowed_list, + status_mems_allowed, + status_mems_allowed_list, + status_voluntary_ctxt_switches, + status_nonvoluntary_ctxt_switches, + + /* /proc/[pid]/syscall */ + syscall, + + /* /proc/[pid]/timerslack_ns */ + timerslack_ns, + + /* /proc/[pid]/wchan */ + wchan, + +BUGS +==================== + +No known bugs. + +EXAMPLES +======================== + +Example 1 +---------- + +Get everyting: + + :: + + config name=app_sampler + +Example 2 +---------- + +Down-select and with non-default stream name: + + :: + + config name=app_sampler metrics=stat_pid,stat_utime stream=mystream + +Example 3 +---------- + +Down-select using config file, using default stream: + + :: + + config name=app_sampler cfg_file=cfg.json + +.. + + :: + + # cfg.json + { + "metrics" : [ + "stat_pid", + "stat_utime" + ] + } + +NOTES +==================== + +Some of the optionally collected data might be security sensitive. + +The status_uid and status_gid values can alternatively be collected as +"status_real_user", "status_eff_user", "status_sav_user", +"status_fs_user", "status_real_group", "status_eff_group", +"status_sav_group", "status_fs_group". These string values are most +efficiently collected if both the string value and the numeric values +are collected. + +SEE ALSO +======================= + +**ldmsd**\ (8), **ldms_quickstart**\ (7), **ldmsd_controller**\ (8), +**ldms_sampler_base**\ (7), **proc(5),** **sysconf(3),** **environ(3).** diff --git a/rtd/docs/source/sampler_man/Plugin_aries_linkstatus.rst b/rtd/docs/source/sampler_man/Plugin_aries_linkstatus.rst new file mode 100644 index 000000000..268ab7da8 --- /dev/null +++ b/rtd/docs/source/sampler_man/Plugin_aries_linkstatus.rst @@ -0,0 +1,147 @@ +======================= +Plugin_aries_linkstatus +======================= + +:Date: 4 Jan 2018 + +.. contents:: + :depth: 3 +.. + +NAME +======================== + +Plugin_aries_linkstatus - man page for the linkstatus plugin for Cray +Aries systems + +SYNOPSIS +============================ + +| Within ldmsd_controller or in a configuration file +| config name=cray_aries_linkstatus [ = ] + +DESCRIPTION +=============================== + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. aries_linkstatus reads the send and recv status +information from where it is exposed via gpcdr. + +Note that the cray_system_sampler variants have the capability to gather +linkstatus information provided by gpcdr using the configuration and +flag for HSN. For XE/XK systems, linkstatus metrics are reasonably +gathered as part of the cray_gemini_r sampler's gathering of the link +aggregated network counter values. However, for XC (Aries) systems, we +recommend gathering the network counter metrics via the aries_nic_mmr +and aries_rtr_mmr samplers (which use the ioctls) and the link status +metrics via this sampler (which reads from the filesystem location where +gpcdr exposes these values. In order to reduce the overhead, then, we +recommend that this sampler collect at lower frequencies than the +network counter samplers. + +The aries_linkstatus sampler is built and used independently of the +cray_system_sampler variants and of the aries_mmr samplers. 
+ +To build the aries_linkstatus sampler, build with the following flag: +**--enable_aries_linkstatus** + +The output format is as follows: There is an array metric of length 8 +hex values for each tile row. Therefore, there are 5 metrics for each of +send and receive, associated with tiles 00X-01Y. The send and receive +metrics associated with r1, for example, correspond to the 8 values for +tiles 010 - 017. + +CONFIGURATION ATTRIBUTE SYNTAX +================================================== + +The aries_linkstatus plugin uses the sampler_base base class. This man +page covers only the configuration attributes, or those with default +values, specific to the this plugin; see ldms_sampler_base.man for the +attributes of the base class. + +**config** + | name= file_send= + file_recv= [schema=] + | configuration line + + name= + | + | aries_linkstatus + + file_send= + | + | Location of the file with the sendlinkstatus metrics, as + specified in the gpcdr configuration file. In the Cray-provided + default gpcdr configuration, this will be + /sys/devices/virtual/gni/gpcdr0/metricsets/linksendstatus/metrics. + + file_recv= + | + | Location of the file with the recvlinkstatus metrics, as + specified in the gpcdr configuration file. In the Cray-provided + default gpcdr configuration, this will be + /sys/devices/virtual/gni/gpcdr0/metricsets/linkrecvstatus/metrics. + + schema= + | + | Optional schema name. It is intended that the same sampler on + different nodes with different metrics have a different schema. + This will default to aries_linkstatus, if unspecified. + +NOTES +========================= + +- The file_send and file_recv can be the same file, if gpcdr is + configured that way. However, the sampler will do an separate pass + over the file for each type of metric. + +- The linkstatus metrics are not anticipated to change frequently. In + order to reduce overhead since the metrics are read from the + filesystem location where gpcdr exposes these values, it is + recommended that this sampler collect at lower frequencies than the + network counter samplers. Reasonable intervals are on order of + minutes. + +- This sampler is for Cray Aries systems only due to the differing + format of the names of the linkstatus metrics for Aries vs Gemini. It + could be extended to handle both. + +BUGS +======================== + +No known bugs. 
+ +EXAMPLES +============================ + +1) aries_linkstatus: Within ldmsd_controller or in a configuration file: + +:: + + load name=aries_linkstatus + config name=aries_linkstatus producer=64 instance=nid00064/aries_linkstatus file_send=/sys/devices/virtual/gni/gpcdr0/metricsets/linksendstatus/metrics file_recv=/sys/devices/virtual/gni/gpcdr0/metricsets/linkrecvstatus/metrics + start name=aries_linkstatus interval=10000000 + +:: + + #ldms_ls -h nid00064 -x ugni -p 411 -l nid00064/aries_linkstatus + +localhost1/aries_linkstatus: consistent, last update: Tue Sep 26 +11:35:51 2017 [811278us] M u64 component_id 1 D u64 job_id 0 D u8[] +sendlinkstatus_r0 0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00 D u8[] +sendlinkstatus_r1 0x03,0x03,0x00,0x00,0x00,0x00,0x00,0x00 D u8[] +sendlinkstatus_r2 0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03 D u8[] +sendlinkstatus_r3 0x00,0x00,0x00,0x03,0x03,0x03,0x03,0x03 D u8[] +sendlinkstatus_r4 0x03,0x03,0x00,0x03,0x03,0x03,0x03,0x03 D u8[] +recvlinkstatus_r0 0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00 D u8[] +recvlinkstatus_r1 0x03,0x03,0x00,0x00,0x00,0x00,0x00,0x00 D u8[] +recvlinkstatus_r2 0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03 D u8[] +recvlinkstatus_r3 0x00,0x00,0x00,0x03,0x03,0x03,0x03,0x03 D u8[] +recvlinkstatus_r4 0x03,0x03,0x00,0x03,0x03,0x03,0x03,0x03 + +SEE ALSO +============================ + +ldmsd(7), ldms_sampler_base(7), Plugin_cray_system_sampler_variants(7), +Plugin_aries_mmr(7), ldms_quickstart(7), ldmsd_controller(8) diff --git a/rtd/docs/source/sampler_man/Plugin_aries_mmr.rst b/rtd/docs/source/sampler_man/Plugin_aries_mmr.rst new file mode 100644 index 000000000..a16bd7d58 --- /dev/null +++ b/rtd/docs/source/sampler_man/Plugin_aries_mmr.rst @@ -0,0 +1,143 @@ +================ +Plugin_aries_mmr +================ + +:Date: 05 Jan 2020 + +.. contents:: + :depth: 3 +.. + +NAME +================= + +Plugin_aries_mmr - man page for the aries_mmr sampler and variants. + +SYNOPSIS +===================== + +| Within ldmsd_controller or in a configuration file +| config name=aries_mmr [ = ] +| config name=aries_nic_mmr [ = ] +| config name=aries_rtr_mmr [ = ] + +DESCRIPTION +======================== + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. The aries_XXX_mmr sampler variants. provides aries +network counter information. The particular counters to be read are +specified by configuration files. No functional combinations of the +counters are supported (.i.e., does not sum or scale values). + +The aries_XXX_mmr samplers depend on Cray's libgpcd, built with aries +options. This library has been released by Cray in CLE6 and later. You +cannot build this sampler if you do not have the libraries and headers. +If you have the code to build the library, be sure to build with +**CFLAGS=-fPIC** + +The difference between the variants is that aries_nic_mmr will skip any +counters in the inputfile that do NOT begin with AR_NIC\_; aries_rtr_mmr +does the opposite; and aries_mmr does NO name filtering. + +Different types of metrics are added to separate gpcd_contexts. The +order of the metrics in the output is the contexts in a particular +order, with the metrics in each context as they are specified in the +file. + +For the config file, all counter names must be fully spelled out (i.e., +does not resolve the shorthand given in the documentation for the +counters). 
+ +To build any of the aries_mmr samplers, build with the following flags: +**--enable-aries_mmr** +**--with-aries-libgpcd=,** + +CONFIGURATION ATTRIBUTE SYNTAX +=========================================== + +The aries_mmr plugin uses the sampler_base base class. This man page +covers only the configuration attributes, or those with default values, +specific to the this plugin; see ldms_sampler_base.man for the +attributes of the base class. + +**config** + | name= file= [aries_rtr_id= + schema=] + | configuration line + + name= + | + | This MUST be aries_mmr, aries_nic_mmr, or aries_rtr_mmr. + + schema= + | + | Optional schema name. It is intended that the same sampler on + different nodes with different metrics have a different schema. + This will default to cray_aries_r or cray_gemini_r as + appropriate, if unspecified. + + aries_rtr_id= + | + | Optional aries router identifier. Defaults to 0 length string. + + file= + | + | Configuration file of aries performance counter names that will + be added in exactly as they are specified. At least one file + must be specified. + +NOTES +================== + +- This is entirely independent of the cray_aries_r_sampler. + +- At the moment, no functions of the data (either in the sampler or in + a store) are supported. + +- Counters whose names do not resolve are left out. + +- If you start this sampler on a node for which the counters cannot be + obtained (e.g., an external login node), the set may still get + created, however the sample function will fail and the plugin will be + stopped. + +- A non-sampler, standalone version of this code is in the Source in + util/aries/mmr_reader. It is not built via the build. + +- These samplers may change at any time. + +BUGS +================= + +No known bugs. + +EXAMPLES +===================== + +| > cat metrics.txt +| #RAW METRICS +| AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_FLITS + +| +| AR_NIC_RSPMON_NPT_EVENT_CNTR_NL_FLITS +| # this is a test +| AR_RTR_1_2_INQ_PRF_INCOMING_FLIT_VC0 + +| load name=aries_mmr +| config name=aries_mmr producer=localhost2 + instance=localhost2/aries_mmr schema=aries_mmr + file=/home/XXX/metrics.txt +| start name=aries_mmr interval=1000000 + +> ldms_ls localhost2/aries_mmr: consistent, last update: Wed Oct 28 +08:48:36 2015 [153343us] u64 0 AR_RTR_1_2_INQ_PRF_INCOMING_FLIT_VC0 u64 +5968204876 AR_NIC_RSPMON_NPT_EVENT_CNTR_NL_FLITS u64 4182142522 +AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_FLITS + +SEE ALSO +===================== + +ldmsd(8), ldms_sampler_base(7), Plugin_cray_sampler_variants(7), +Plugin_aries_linkstatus(7), ldms_quickstart(7), ldmsd_controller(8) diff --git a/rtd/docs/source/sampler_man/Plugin_aries_mmr_configurable.rst b/rtd/docs/source/sampler_man/Plugin_aries_mmr_configurable.rst new file mode 100644 index 000000000..8a3c53112 --- /dev/null +++ b/rtd/docs/source/sampler_man/Plugin_aries_mmr_configurable.rst @@ -0,0 +1,292 @@ +============================= +Plugin_aries_mmr_configurable +============================= + +:Date: 12 Apr 2020 + +.. contents:: + :depth: 3 +.. + +NAME +============================== + +Plugin_aries_mmr_configurable - man page for the aries_mmr_configurable +sampler. + +SYNOPSIS +================================== + +| Within ldmsd_controller or in a configuration file +| config name=aries_mmr_configurable [ = ] + +DESCRIPTION +===================================== + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. 
The aries_mmr_configurable sampler provides aries +network counter information. It is intended to be used for reading and +optionally resetting the configuable counters, however there is nothing +that currently restricts this. + +The particular counters to be read and set are specified by +configuration files. No functional combinations of the counters are +supported (i.e., does not sum or scale values). The available counter +names can be discovered by: gpcd_print_valid_tile_mmrs(); +gpcd_print_valid_nic_mmrs(); gpcd_print_valid_tile_filtering_mmrs(); +gpcd_print_valid_tile_static_mmrs(); + +A utility providing this service is built as check_mmr_configurable into +bin. The counters are described in Cray's Aries Hardware Counters +Document S-0045. Counters described in that document with ':' extensions +cannot be called by the ':' name in this sampler; rather the counter has +to be read by the base name as hex and the fields separated out by mask, +which is beyond the capability of this sampler. + +The aries_XXX_mmr samplers depend on Cray's libgpcd, built with aries +options. This library has been released by Cray in CLE6 and later. You +cannot build this sampler if you do not have the libraries and headers. +If you have the code to build the library, be sure to build the library +with **CFLAGS=-fPIC** + +The set and read metrics are added to separate gpcd_contexts. The order +of the metrics in the output is the contexts in a particular order, with +the metrics in each context as they are specified in the file. The +counters for read and set can only be specified once and cannot be +changed. The counters to be set can be reset to their configured values +at any time by issuing the action=reset command to configure. + +For the config file, all counter names must be fully spelled out (i.e., +does not resolve the shorthand given in the documentation for the +counters). + +To build the aries_mmr_configurable sampler, build with the following +flags: **--enable-aries_mmr** +**--with-aries-libgpcd=,** + +**!!!WARNING!!!** Cray does not recommend use of the configurable +counters outside of CrayPAT. Use this Plugin at your own risk. +**!!!WARNING!!!** + +CONFIGURATION COMMANDS ORDER +====================================================== + +Configuration commands are intended to be issued in the following order: + +- load + +- config action=initialize + +- config action=finalize + +- start + +The following config commands can be issued anytime after the start in +any order + +- config action=reset + +- config action=ls + +CONFIGURATION ATTRIBUTE SYNTAX +======================================================== + +The aries_mmr_configurable plugin uses the sampler_base base class. This +man page covers only the configuration attributes, or those with default +values, specific to the this plugin; see ldms_sampler_base.man for the +attributes of the base class. + +**config** + | name= action= [ = ...] + | configuration line + + name= + | + | This MUST be aries_mmr_configurable + + action= + | + | Options are initialize, finalize, reset, and ls: + + **initialize** + | [schema= setfile= rtrid=] readfile= + | initialize the plugin. sampler_base configuration arguments + should be specified at this point. + + setfile= + | + | Optional configuration file with the counter value assignment + options. + | Format: "name,type,default_value" one entry per line. + | Type is 'H' for Hex or anything else to default to uint64_t. + | Value should be written out in standard decimal or hex + (leading 0x) format. 
+ | Blanklines and comments (specfied by leading #) are allowed. + | The sampler uses gpcd_lookup_mmr_by_name, so only the names + that are in the 'valid' sets specified by the gpcd library + are allowed. As of this writing those can be obtained by: + gpcd_print_valid_tile_mmrs(); gpcd_print_valid_nic_mmrs(); + gpcd_print_valid_tile_filtering_mmrs(); + gpcd_print_valid_tile_static_mmrs(); + + These are printed out in the utility check_mmr_configurable. + + readfile= + | + | Configuration file with the names of the counters to read. + | Format "name,type" one entry per line. + | Type is 'H' for Hex or anything else to default to uint64_t. + Hex values are written out as a char array. + | Blanklines and comments (specfied by leading #) are allowed. + | The sampler uses gpcd_lookup_mmr_by_name, so only the names + that are in the 'valid' sets specified by the gpcd library + are allowed. As of this writing those can be obtained by: + gpcd_print_valid_tile_mmrs(); gpcd_print_valid_nic_mmrs(); + gpcd_print_valid_tile_filtering_mmrs(); + gpcd_print_valid_tile_static_mmrs(); + + These are printed out in the utility check_mmr_configurable. + + rtrid= + | + | Optional unique rtr string identifier (e.g., c0-0c0s0a0). + Defaults to 0 length string. + + schema= + | + | Optional schema name. Defaults to 'aries_mmr_configurable'. + + **finalize** + | + | Creates the mmr_contexts, sets the set counters to the + configured values, and creates the set. Takes no arguments. If + finalize fails, all state is cleared and the plugin can be + configured again. + + **ls** + | + | Prints out the set counter names and their configured values and + also the read counter names. Takes no arguments. + + **reset** + | + | Resets the set counters to their configured values. Takes no + arguments. + +NOTES +=============================== + +- See WARNINGS above. + +- This is entirely independent of the cray_aries_r_sampler. + +- At the moment, no functions of the data (either in the sampler or in + a store) are supported. + +- Counters whose names do not resolve are left out. + +- If you start this sampler on a node for which the counters cannot be + obtained (e.g., an external login node), the set may still get + created, however the sample function will fail and the plugin will be + stopped. + +- While the names are checked to be in the valid set (see note above), + there is nothing that checks that the value that you choose to write + to a counter is valid. + +- If writing the counters is not enabled, this plugin must be run as + root in order to call the gpcd command that enables writing the + counters. + +- This sampler may change at any time. + +BUGS +============================== + +- There is an unavoidable race condition if someone out of band disable + permissions of writing the counters in between the check in this + sampler and the actual write. + +- Because the sampler needs to write this will toggle on the write + ability for anyone. 
+ +EXAMPLES +================================== + +| > more setconf.txt +| AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_FLITS,U,0 +| AR_NIC_ORB_CFG_NET_RSP_HIST_OVF,H,0xFF +| AR_NIC_ORB_CFG_NET_RSP_HIST_1,H,0x000A000500010000 + +| > more readconf.txt +| AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_FLITS,U +| AR_NIC_ORB_CFG_NET_RSP_HIST_OVF,H +| AR_NIC_ORB_CFG_NET_RSP_HIST_1,H +| AR_NIC_ORB_PRF_NET_RSP_HIST_BIN01,H +| AR_NIC_ORB_PRF_NET_RSP_HIST_BIN23,H +| AR_NIC_ORB_PRF_NET_RSP_HIST_BIN45,H +| AR_NIC_ORB_PRF_NET_RSP_HIST_BIN67,H + +| load name=aries_mmr_configurable +| config name=aries_mmr_configurable producer=localhost1 + instance=localhost1/aries_mmr schema=aries_mmr_configurable + setfile=XXX/setconf.txt readfile=XXX/Build/readconf.txt component_id=1 + action=initialize aries_rtr_id=c0-0c0a0 +| config name=aries_mmr_configurable action=finalize +| config name=aries_mmr_configurable action=ls +| start name=aries_mmr_configurable interval=5000000 + +| >ldms_ls +| localhost1/aries_mmr: consistent, last update: Sun Apr 12 19:04:00 + 2020 -0600 [290661us] +| M u64 component_id 1 +| D u64 job_id 0 +| D u64 app_id 0 +| M char[] aries_rtr_id "c0-0c0a0" +| D u64 AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_FLITS 30756 +| D char[] AR_NIC_ORB_CFG_NET_RSP_HIST_OVF "0x0" +| D char[] AR_NIC_ORB_CFG_NET_RSP_HIST_1 "0xa000500010000" +| D char[] AR_NIC_ORB_PRF_NET_RSP_HIST_BIN01 "0xcb400000d6b" +| D char[] AR_NIC_ORB_PRF_NET_RSP_HIST_BIN23 "0x0" +| D char[] AR_NIC_ORB_PRF_NET_RSP_HIST_BIN45 "0x0" +| D char[] AR_NIC_ORB_PRF_NET_RSP_HIST_BIN67 "0x0" + +| Also in the logs from the action=ls: +| Sun Apr 12 19:03:55 2020: INFO : Name default R/S +| Sun Apr 12 19:03:55 2020: INFO : + ------------------------------------------------ -------------------- + ----- +| Sun Apr 12 19:03:55 2020: INFO : + AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_FLITS N/A R +| Sun Apr 12 19:03:55 2020: INFO : AR_NIC_ORB_CFG_NET_RSP_HIST_OVF N/A R +| Sun Apr 12 19:03:55 2020: INFO : AR_NIC_ORB_CFG_NET_RSP_HIST_1 N/A R +| Sun Apr 12 19:03:55 2020: INFO : AR_NIC_ORB_PRF_NET_RSP_HIST_BIN01 N/A + R +| Sun Apr 12 19:03:55 2020: INFO : AR_NIC_ORB_PRF_NET_RSP_HIST_BIN23 N/A + R +| Sun Apr 12 19:03:55 2020: INFO : AR_NIC_ORB_PRF_NET_RSP_HIST_BIN45 N/A + R +| Sun Apr 12 19:03:55 2020: INFO : AR_NIC_ORB_PRF_NET_RSP_HIST_BIN67 N/A + R +| Sun Apr 12 19:03:55 2020: INFO : + AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_FLITS 0 S +| Sun Apr 12 19:03:55 2020: INFO : AR_NIC_ORB_CFG_NET_RSP_HIST_OVF 0xff + S +| Sun Apr 12 19:03:55 2020: INFO : AR_NIC_ORB_CFG_NET_RSP_HIST_1 + 0xa000500010000 S + +| At any time action=ls or action=reset can be called via + ldmsd_controller: +| > more aries_mmr_configurable_controller_reset.sh #!/bin/bash +| echo "config name=aries_mmr_configurable action=reset" +| exit +| > ldmsd_controller --host localhost --port=${port1} -a munge --script + "XXX/aries_mmr_configurable_controller_reset.sh" + +SEE ALSO +================================== + +ldmsd(8), ldms_sampler_base(7), Plugin_cray_sampler_variants(7), +Plugin_aries_linkstatus(7), ldms_quickstart(7), Plugin_aries_mmr(7), +Plugin_aries_rtr_mmr)7), Plugin_aries_nic_mmr(7), ldmsd_controller(8) diff --git a/rtd/docs/source/sampler_man/Plugin_blob_stream_writer.rst b/rtd/docs/source/sampler_man/Plugin_blob_stream_writer.rst new file mode 100644 index 000000000..46f8c7475 --- /dev/null +++ b/rtd/docs/source/sampler_man/Plugin_blob_stream_writer.rst @@ -0,0 +1,133 @@ +========================= +Plugin_blob_stream_writer +========================= + +:Date: 15 Jun 2021 + +.. contents:: + :depth: 3 +.. 
+
+NAME
+==========================
+
+Plugin_blob_stream_writer - man page for the LDMS blob_stream_writer
+plugin
+
+SYNOPSIS
+==============================
+
+| Within ldmsd_controller or a configuration file:
+| config name=blob_stream_writer [ = ]
+
+DESCRIPTION
+=================================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The blob_stream_writer plugin writes out raw stream
+messages and offsets of the messages in separate files. Messages are
+written as received, without any appended terminator. Multiple streams
+may be specified.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+====================================================
+
+**config**
+ | name=blob_stream_writer path= container=
+ stream= debug=1
+ | configuration line
+
+ name=
+ |
+ | This MUST be blob_stream_writer.
+
+ path=
+ |
+ | path to the directory of the output files
+
+ container=
+ |
+ | directory of the output file
+
+ stream=
+ |
+ | stream to which to subscribe. This argument may be repeated.
+ Each stream will be written in a separate file pair.
+
+ debug=1
+ |
+ | Enable logging of messages stored to the log file.
+
+ timing=1
+ |
+ | Enable writing timestamps to a separate file.
+
+OUTPUT FORMAT
+===================================
+
+There is no requirement that any message be in the same format as any
+other.
+
+The writer writes all messages received to a file pair:
+$path/$container/$stream.OFFSET.$create_time
+$path/$container/$stream.DAT.$create_time where OFFSET is the byte
+offsets into the corresponding .DAT of the messages seen on the stream.
+
+Each byte offset is written as a little-endian 64 bit number. Data read
+from .OFFSET should be converted to host order with le64toh.
+
+Both DAT and OFFSET files begin with an 8 byte magic number: blobdat\\0
+and bloboff\\0, respectively.
+
+Optionally (if timing=1 is given) the additional file
+$path/$container/$stream.TIMING.$create_time is created containing
+binary timestamps corresponding to the messages. The TIMING file begins
+with an 8 byte magic number: blobtim\\0. Each time is the delivery time
+to the plugin performing the blob storage. Each timestamp is written to
+the .TIMING file as a binary pair (tv_sec, tv_usec) with each value
+stored as a little-endian 64 bit value which should be read and then
+converted with le64toh.
+
+NOTES
+===========================
+
+This writer is in development and may be changed at any time.
+
+The plugin cannot support stream=.\* as there is no corresponding regex
+subscription policy currently available in the C stream API.
+
+The config operation may be called at any time or repeated. The start and
+stop operations will start and stop storage of all streams.
+
+The plugin appears in C code as a sampler plugin, since the storage
+policy and store plugin interfaces are set-oriented and no sets are
+involved here.
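+
+As an illustration of the output format described above, the following
+is a minimal Python sketch (not part of the plugin; the file name is
+taken from the EXAMPLES below) that walks an .OFFSET file using only
+the documented magic number and little-endian 64 bit encoding:
+
+::
+
+ import struct
+
+ with open('slurm.OFFSET.1624033344', 'rb') as f:
+     assert f.read(8) == b'bloboff\0'  # 8 byte magic number
+     while True:
+         raw = f.read(8)
+         if len(raw) < 8:
+             break
+         # each entry is a little-endian 64-bit byte offset into the .DAT file
+         print(struct.unpack('<Q', raw)[0])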
+
+EXAMPLES
+==============================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+ load name=blob_stream_writer
+ config name=blob_stream_writer path=/writer/streams container=${CLUSTER} stream=foo stream=slurm stream=kokkos
+ start name=blob_stream_writer
+
+Examining offsets in a shell:
+
+::
+
+ od -A d -t u8 -j 8 -w8 slurm.OFFSET.1624033344 |sed -e 's/[0-9,A-F,a-f]* *//'
+
+Examining timestamps in a shell:
+
+::
+
+ od -A d -j 8 -t u8
+
+SEE ALSO
+==============================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), le64toh(3), fseek(3),
+od(1)
diff --git a/rtd/docs/source/sampler_man/Plugin_clock.rst b/rtd/docs/source/sampler_man/Plugin_clock.rst
new file mode 100644
index 000000000..f5af75541
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_clock.rst
@@ -0,0 +1,70 @@
+============
+Plugin_clock
+============
+
+:Date: 18 Feb 2018
+
+.. contents::
+ :depth: 3
+..
+
+NAME
+=============
+
+Plugin_clock - man page for the LDMS clock plugin
+
+SYNOPSIS
+=================
+
+| Within ldmsd_controller or a configuration file:
+| config name=clock [ = ]
+
+DESCRIPTION
+====================
+
+The clock plugin provides a counter of samples taken since it started.
+This is of pedagogical interest and useful for detecting situations
+where a sample is missed either in being taken or in transmission.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+=======================================
+
+The clock plugin uses the sampler_base base class. This man page covers
+only the configuration attributes, or those with default values,
+specific to this plugin; see ldms_sampler_base.man for the
+attributes of the base class.
+
+**config**
+ | name= [schema=]
+ | configuration line
+
+ name=
+ |
+ | This MUST be clock.
+
+ schema=
+ |
+ | Optional schema name. It is intended that the same sampler on
+ different nodes with different metrics have a different schema.
+ If not specified, will default to \`clock\`.
+
+BUGS
+=============
+
+No known bugs.
+
+EXAMPLES
+=================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+ load name=clock
+ config name=clock producer=vm1_1 instance=vm1_1/clock
+ start name=clock interval=1000000 offset=0
+
+SEE ALSO
+=================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)
diff --git a/rtd/docs/source/sampler_man/Plugin_coretemp.rst b/rtd/docs/source/sampler_man/Plugin_coretemp.rst
new file mode 100644
index 000000000..2065e627e
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_coretemp.rst
@@ -0,0 +1,55 @@
+===============
+Plugin_coretemp
+===============
+
+:Date: 3 May 2022
+
+.. contents::
+ :depth: 3
+..
+
+NAME
+================
+
+Plugin_coretemp - An LDMS sampler plugin that monitors CPU temperature
+data
+
+SYNOPSIS
+====================
+
+| Within ldmsd_controller or a configuration file:
+| load name=coretemp config name=coretemp producer=
+ instance= component_id=
+
+DESCRIPTION
+=======================
+
+The coretemp sampler collects information from the Linux coretemp module
+through files located in /sys/devices/platform. Files in this directory
+are walked recursively and regular expressions are used to select
+entries produced by the Linux coretemp module.
+
+See the Linux modprobe(8) command for information on how to load Linux
+modules.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+==========================================
+
+See man Plugin_base.
+
+EXAMPLES
+====================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+ load name=coretemp
+ config name=coretemp producer=vm1_1 instance=vm1_1/coretemp
+ start name=coretemp interval=1000000 offset=0
+
+SEE ALSO
+====================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7),
+Plugin_filesingle(7)
diff --git a/rtd/docs/source/sampler_man/Plugin_daos_sampler.rst b/rtd/docs/source/sampler_man/Plugin_daos_sampler.rst
new file mode 100644
index 000000000..b8d742adb
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_daos_sampler.rst
@@ -0,0 +1,117 @@
+===================
+Plugin_daos_sampler
+===================
+
+:Date: 28 Apr 2022
+
+.. contents::
+ :depth: 3
+..
+
+NAME
+====================
+
+Plugin_daos_sampler - man page for the LDMS DAOS sampler plugin
+
+SYNOPSIS
+========================
+
+| Within ldmsd_controller or a configuration file:
+| load name=daos_sampler
+| config name=daos_sampler producer=${HOSTNAME}
+| start name=daos_sampler interval=1000000
+
+DESCRIPTION
+===========================
+
+The daos_sampler plugin collects DAOS telemetry from local DAOS I/O
+Engine instances.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+==============================================
+
+The daos_sampler plugin uses the sampler_base base class. This man page
+only covers the configuration attributes specific to this plugin; see
+ldms_sampler_base.man for the attributes of the base class.
+
+name=
+ |
+ | This MUST be daos_sampler.
+
+producer=$HOSTNAME
+ |
+ | The $HOSTNAME variable provides a good unique producer ID.
+
+engine_count=2
+ |
+ | The default is 2; don't change it unless the number of per-server
+ engines is different.
+
+target_count=8
+ |
+ | The default is 8; don't change it unless the number of targets per
+ engine is different.
+
+**SAMPLE FORMAT**
+
+The DAOS telemetry is exposed as a set of trees, with the system name as
+the root:
+
+::
+
+ $system/$rank/$target - Per-engine target metrics not associated with a pool
+ $system/$rank/$pool - Per-engine top-level pool metrics
+ $system/$rank/$pool/$target - Per-engine target metrics associated with a pool
+
+Under each tree is a set of metrics in either counter or gauge format.
+Counters are monotonically-increasing uint64 values; gauges are
+instantaneous-read uint64 values that can vary up or down. Certain gauge
+metrics may have associated statistics in min/max/count/mean/stddev
+format.
+
+**EXAMPLE SAMPLER USAGE**
+
+Start ldmsd as usual, for example:
+
+::
+
+ $ ldmsd -m1MB -x sock:10444 -F -c /path/to/sampler.conf
+
+NOTE: The default memory size (512KB) may be too small for the number of
+metrics collected. Larger sizes may be specified for a large number of
+pools.
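+
+For reference, a minimal sampler.conf matching the SYNOPSIS above might
+look like the following sketch (the path is a placeholder;
+engine_count and target_count are shown with their documented defaults):
+
+::
+
+ # hypothetical contents of /path/to/sampler.conf
+ load name=daos_sampler
+ config name=daos_sampler producer=${HOSTNAME} engine_count=2 target_count=8
+ start name=daos_sampler interval=1000000
+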
+ +Once ldmsd is running, it is possible to check that the DAOS telemetry +appears in the output of ldms_ls, for example: + +:: + + $ ldms_ls -h localhost -x sock -p 10444 -l + daos_server/0/0: consistent, last update: Wed Aug 25 18:40:25 2021 +0000 [653335us] + M char[] system "daos_server" + M u32 rank 0 + M u32 target 0 + D u64 io/latency/update/256B 0 + D u64 io/latency/update/256B/min 0 + D u64 io/latency/update/256B/max 0 + D u64 io/latency/update/256B/samples 0 + D d64 io/latency/update/256B/mean 0.000000 + D d64 io/latency/update/256B/stddev 0.000000 + D u64 io/latency/update/32KB 611 + D u64 io/latency/update/32KB/min 611 + D u64 io/latency/update/32KB/max 611 + D u64 io/latency/update/32KB/samples 1 + D d64 io/latency/update/32KB/mean 611.000000 + D d64 io/latency/update/32KB/stddev 0.000000 + D u64 io/latency/update/64KB 0 + D u64 io/latency/update/64KB/min 0 + D u64 io/latency/update/64KB/max 0 + D u64 io/latency/update/64KB/samples 0 + D d64 io/latency/update/64KB/mean 0.000000 + D d64 io/latency/update/64KB/stddev 0.000000 + D u64 io/latency/update/128KB 1018 + D u64 io/latency/update/128KB/min 567 + D u64 io/latency/update/128KB/max 1214 + D u64 io/latency/update/128KB/samples 8 + D d64 io/latency/update/128KB/mean 828.000000 + D d64 io/latency/update/128KB/stddev 238.011404 diff --git a/rtd/docs/source/sampler_man/Plugin_dcgm_sampler.rst b/rtd/docs/source/sampler_man/Plugin_dcgm_sampler.rst new file mode 100644 index 000000000..5e2942fb9 --- /dev/null +++ b/rtd/docs/source/sampler_man/Plugin_dcgm_sampler.rst @@ -0,0 +1,83 @@ +=================== +Plugin_dcgm_sampler +=================== + +:Date: 1 May 2019 + +.. contents:: + :depth: 3 +.. + +NAME +==================== + +Plugin_dcgm_sampler - man page for the LDMS dcgm_sampler plugin + +SYNOPSIS +======================== + +| Within ldmsd_controller or a configuration file: +| config name=dcgm_sampler [ = ] + +DESCRIPTION +=========================== + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. The dcgm_sampler plugin provides a metric set for +each DCGM-compatible Nvidia GPU on the system. The schema is named +"dcgm" by default. + +CONFIGURATION ATTRIBUTE SYNTAX +============================================== + +**config** + | name= interval= [fields=] + [schema=] [job_set=] + | configuration line + + name= + | + | This MUST be dcgm_sampler. + + interval= + | + | The sampling interval. This MUST be set to the same value that + is set on the "start" line, otherwise behavior is undetermined. + + fields= + | + | is a comma-separated list of integers representing DCGM + field numebers that the plugin should watch. By default the + plugin will watch fields 150,155. + + schema= + | + | The schema name defaults to "dcgm", but it can be renamed at the + user's choice. + + job_set= + | + | The name of the metric set that contains the job id information + (default=job_id) + +BUGS +==================== + +No known bugs. 
+ +EXAMPLES +======================== + +Within ldmsd_controller or a configuration file: + +:: + + load name=dcgm_sampler + config name=dcgm_sampler interval=1000000 fields=150,155,1001,1002,1003 schema=dcgmfav5 + start name=dcgm_sampler interval=1000000 + +SEE ALSO +======================== + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7) diff --git a/rtd/docs/source/sampler_man/Plugin_dstat.rst b/rtd/docs/source/sampler_man/Plugin_dstat.rst new file mode 100644 index 000000000..c09096bcf --- /dev/null +++ b/rtd/docs/source/sampler_man/Plugin_dstat.rst @@ -0,0 +1,152 @@ +============ +Plugin_dstat +============ + +:Date: 4 Nov 2020 + +.. contents:: + :depth: 3 +.. + +NAME +============= + +Plugin_dstat - man page for the LDMS dstat plugin + +SYNOPSIS +================= + +| Within ldmsd_controller +| config name=dstat [ = ] + +DESCRIPTION +==================== + +The dstat plugin provides ldmsd process information from +/proc/self/[io,stat,statm,fd]. + +CONFIGURATION ATTRIBUTE SYNTAX +======================================= + +**config** + | name= component_id= [io=] [stat=] + [statm=] [mmalloc=] [fd=] [fdtypes=] + set= + | configuration line + + name= + | + | This MUST be dstat. + + producer= + | + | The producer string value. + + instance= + | + | The name of the metric set. + + schema= + | + | Optional schema name. It is required by most storage backends + that the same sampler on different nodes with different metric + subsets needs to have a unique schema name. Use auto-schema=1 + instead of schema to automatically meet the backend requirement. + + auto-schema= + | + | If true, change the schema name to dstat\_$X, where $X will be a + unique hex value derived from the data selection options. If + both schema and auto-schema are given, for + backward-compatibility auto-schema is ignored for the dstat + plugin. + + component_id= + | + | The component id numerical value. + + io= + | + | Include the metrics from /proc/self/io. + + stat= + | + | Include the metrics from /proc/self/stat. + + tick= + | + | Include the sc_clk_tck from sysconf(3) as a metric. + + statm= + | + | Include the metrics from /proc/self/statm. + + mmalloc= + | + | Include the mmap memory usage metric from LDMS mmalloc. + + fd= + | + | Include the number of open file descriptors found in + /proc/self/fd. + + fdtypes= + | + | Include the number and types of open file descriptors found in + /proc/self/fd. This option may have high overhead on aggregators + with many open connections. + +DATA +============= + +This reports metrics from /proc/self/[io,stat,statm] by default. If +specific subsets are named (io=true), then unnamed sets are suppressed. +Units on the /proc metric values are documented in the man pages. The +unit of the mmalloc metric is bytes. + +EXAMPLES +================= + +Within ldmsd_controller or a configuration file: + +:: + + load name=dstat + config name=dstat producer=vm1_1 component_id=1 instance=vm1_1/dstat + start name=dstat interval=1000000 + +NOTES +============== + +See proc(5) for the definitions of all the metrics except sc_clk_tck and +fd data. Metrics which are invariant (other than pids and sc_clk_tck) +are not included. Where naming is potentially ambiguous and a more +specific name is used in /proc/self/status for the same metrics, the +name from /proc/self/status is used. + +Requesting mmalloc or fd or fdtypes (any of which may be high overhead) +requires explicitly requesting it and all others which are wanted. 
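+
+For example, a configuration such as the following sketch (producer and
+instance values are illustrative, mirroring the EXAMPLES above) collects
+only the io and fd data, since naming any subset suppresses the unnamed
+ones:
+
+::
+
+ load name=dstat
+ config name=dstat producer=vm1_1 component_id=1 instance=vm1_1/dstat io=true fd=true
+ start name=dstat interval=1000000
+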
The numbers listed in /proc/self/fd/ are symbolic links. The "types" of
descriptors reported are based on the names pointed to by the links, as
follows:

::

   fd_count      total number of open file descriptors.
   fd_max        highest file number.
   fd_socket     count of link targets starting with "socket:"
   fd_dev        count of link targets starting with "/dev:"
   fd_anon_inode count of link targets starting with "anon_inode:"
   fd_pipe       count of link targets starting with "pipe:"
   fd_path       count of link targets starting with . or / but not /dev.

On most HPC Linux systems sc_clk_tck is 100 Hz. Less common values are
250, 300, and 1000.

This is the LDMSD answer to the ancient question "Quis custodiet ipsos
custodes?"

SEE ALSO
=================

proc(5), ldmsd(8), sysconf(3)

diff --git a/rtd/docs/source/sampler_man/Plugin_edac.rst b/rtd/docs/source/sampler_man/Plugin_edac.rst
new file mode 100644
index 000000000..e82811fe6
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_edac.rst
@@ -0,0 +1,108 @@
===========
Plugin_edac
===========

:Date: 18 Feb 2018

.. contents::
   :depth: 3
..

NAME
============

Plugin_edac - man page for the LDMS edac plugin

SYNOPSIS
================

| Within ldmsd_controller
| config name=edac [ <attr>=<value> ]

DESCRIPTION
===================

The edac plugin provides memory error information from
/sys/devices/system/edac for correctable and uncorrectable errors.

CONFIGURATION ATTRIBUTE SYNTAX
======================================

The edac plugin uses the sampler_base base class. This man page covers
only the configuration attributes, or those with default values,
specific to this plugin; see ldms_sampler_base.man for the
attributes of the base class.

**config**
   | name=<plugin_name> max_mc=<max_mc> max_csrow=<max_csrow>
     [schema=<schema_name>]
   | configuration line

   name=<plugin_name>
      |
      | This MUST be edac.

   schema=<schema_name>
      |
      | Optional schema name. It is intended that the same sampler on
        different nodes with different metrics have a different schema.
        If not specified, will default to edac.

   max_mc=<max_mc>
      |
      | The number of mc's in /sys/devices/system/edac/mc. Typically
        this number is 2.

   max_csrow=<max_csrow>
      |
      | The number of csrows in a single mc. For example, the value
        should be 4 when the largest csrow entry is
        /sys/devices/system/edac/mc/mc0/csrow3. Typically this number is
        8, but it can vary depending on the system.

DATA
============

This reports counts for both correctable and uncorrectable errors per mc
and per csrow. It also reports the seconds since reset per mc.

EXAMPLES
================

Within ldmsd_controller or a configuration file:

::

   load name=edac
   config name=edac producer=vm1_1 component_id=1 instance=vm1_1/edac max_mc=2 max_csrow=4
   start name=edac interval=1000000

NOTES
=============

An upper limit on metric set size is enforced. Configuring to collect
too many registers will generate an error detailing the compiled size
limit. This limit is only adjustable in the source code.

For more detailed background information, see
www.kernel.org/doc/Documentation/edac.txt and
www.kernel.org/doc/Documentation/ABI/testing/sysfs-devices-edac.

AUTHORS
===============

Kathleen Shoga (Lawrence Livermore National
Laboratory). Ported to LDMS v3 by Benjamin Allan.
Ported to LDMS v4 by Ann Gentile.

ACKNOWLEDGMENTS
=======================

This work was created under the auspices of the U.S. Department of
Energy by Lawrence Livermore National Laboratory under Contract
DE-AC52-07NA27344.
Release Number: LLNL-SM-687054. + +SEE ALSO +================ + +edac(3), edac-util(8), edac-ctl(8), ldms(7), ldms_sampler_base(7) diff --git a/rtd/docs/source/sampler_man/Plugin_filesingle.rst b/rtd/docs/source/sampler_man/Plugin_filesingle.rst new file mode 100644 index 000000000..73cd8f547 --- /dev/null +++ b/rtd/docs/source/sampler_man/Plugin_filesingle.rst @@ -0,0 +1,129 @@ +================= +Plugin_filesingle +================= + +:Date: 15 Dec 2018 + +.. contents:: + :depth: 3 +.. + +NAME +================== + +Plugin_filesingle - man page for the LDMS filesingle plugin + +SYNOPSIS +====================== + +| Within ldmsd_controller or in a configuration file +| config name=filesingle conf= [timing] + +DESCRIPTION +========================= + +The filesingle plugin provides metrics pulled from files containing a +single numeric value or character. This supports flexible definition of, +among others, sensor hardware, file system, and cpu metrics. + +CONFIGURATION ATTRIBUTE SYNTAX +============================================ + +See ldms_sampler_base(7) for the common sampler options. + +**config** + | conf= [timing] + + conf= + | + | File lines contain the source, type, and default value for each + metric. See CONF FILE SYNTAX below. + + timing + | + | If keyword 'timing' is included in the options, extra metrics + measuring the time to collect every defined metric will be + included. This allows for the discovery of slow sensors. Each + timing metric will have the name of the timed metric with + ".time" appended. Do not use "timing="; it is ignored. + +COLLECTION +======================== + +Each metric is collected from a separate file. If this process fails for +any reason at all, the default value is collected instead. The timing +metrics (type S64) report the number of microseconds measured bracketing +the open/read/close cycle of the metric's value file. The timing of a +failed collection is -1. Each file is open, read, and closed for each +data sample collected. + +CONF FILE SYNTAX +============================== + +Each line of the conf file must be empty, contain a comment or contain: + + + +The metric and file names must not contain spaces. The metric type is +one of: S8, S16, S32, S64, U8, U16, U32, U64, F32, D64, CHAR. + +Lines starting with # are comment lines. Line continuations are not +allowed. + +The script ./ldms-sensors-config(1) generates an example metrics config +file from the data reported by sensors(1). Metric names, types, and +defaults generated can be tuned to user preferences. + +EXAMPLES +====================== + +Within ldmsd_controller or a configuration file: + +:: + + load name=filesingle + config name=filesingle conf=/etc/sysconfig/ldms.d/plugins-conf/filesingle.conf + start name=filesingle interval=10000000 offset=0 + +For the contents of filesingle.conf (on a specific machine): + +:: + + power1 /sys/class/hwmon/hwmon0/device/power1_average S64 -1 + coretemp.Physical_id_0 /sys/class/hwmon/hwmon1/temp1_input S64 -1 + coretemp.Core_0 /sys/class/hwmon/hwmon1/temp2_input S64 -1 + core0.cur_freq /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq S64 -1 + +The power reading, two temperatures, and cpu frequency are collected. + +NOTES +=================== + +The values collected are the raw values from the sources; converting to +humane units is left to data post-processors. 
In the specific example
given, the raw power reading has units of microwatts, the temperatures
have units of millidegrees Celsius, and the cpu frequency is reported in
kiloHertz. To determine the appropriate unit conversions for your
system, compare the output of sensors(1) or lscpu(1) to the value found
in the raw data files.

To determine the file locations of metrics on your system consult the
documentation for the device drivers of interest or the output of
ldms-sensors-config(1) or

"strace -e trace=open "

Some metric files may only be readable by users with administrative
privileges. Some of these may be available without privilege by
extracting them from larger files in /proc, e.g. "cpu MHz" in
/proc/cpuinfo.

Some sensors may not update themselves (at the kernel level) faster than
a certain frequency, even though it is possible to more frequently read
their data files.

SEE ALSO
======================

ldms-sensors-config(1), sensors(1), lscpu(1), ldms_sampler_base(7),
proc(5), ldmsd(8), ldmsd_controller(8)

diff --git a/rtd/docs/source/sampler_man/Plugin_fptrans.rst b/rtd/docs/source/sampler_man/Plugin_fptrans.rst
new file mode 100644
index 000000000..84802c7ef
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_fptrans.rst
@@ -0,0 +1,76 @@
==============
Plugin_fptrans
==============

:Date: 18 Feb 2018

.. contents::
   :depth: 3
..

NAME
===============

Plugin_fptrans - man page for the LDMS fptrans plugin

SYNOPSIS
===================

| Within ldmsd_controller or a configuration file:
| config name=fptrans [ <attr>=<value> ]

DESCRIPTION
======================

The fptrans plugin provides metrics that have well known values which
can be used to test transmission and storage fidelity of single and
double precision scalars and floating point arrays.

CONFIGURATION ATTRIBUTE SYNTAX
=========================================

The fptrans plugin uses the sampler_base base class. This man page
covers only the configuration attributes, or those with default values,
specific to this plugin; see ldms_sampler_base.man for the
attributes of the base class.

**config**
   | name=<plugin_name> [schema=<schema_name>]
   | configuration line

   name=<plugin_name>
      |
      | This MUST be fptrans.

   schema=<schema_name>
      |
      | Optional schema name. It is intended that the same sampler on
        different nodes with different metrics have a different schema.
        If not specified, it will default to \`fptrans\`.

NOTES
================

The well known values used are 0, 1, and pi as determined by C macro
M_PI.

BUGS
===============

No known bugs.

EXAMPLES
===================

Within ldmsd_controller or a configuration file:

::

   load name=fptrans
   config name=fptrans producer=vm1_1 instance=vm1_1/fptrans
   start name=fptrans interval=1000000

SEE ALSO
===================

ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)

diff --git a/rtd/docs/source/sampler_man/Plugin_geopm_sampler.rst b/rtd/docs/source/sampler_man/Plugin_geopm_sampler.rst
new file mode 100644
index 000000000..f9fec8474
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_geopm_sampler.rst
@@ -0,0 +1,153 @@
====================
Plugin_geopm_sampler
====================

:Date: 06 May 2022

.. contents::
   :depth: 3
..

NAME
=====================

Plugin_geopm - man page for the LDMS geopm plugin

SYNOPSIS
=========================

| Within ldmsd_controller or a configuration file:
| config name=ldms_geopm_sampler geopm_request_path=<path>

DESCRIPTION
============================

With LDMS (Lightweight Distributed Metric Service), plugins for the
ldmsd (ldms daemon) are configured via ldmsd_controller or a
configuration file. The geopm plugin provides access to the geopm(7)
PlatformIO interface by configuring the request file with signal
requests.

CONFIGURATION ATTRIBUTE SYNTAX
===============================================

The ldms_geopm_sampler plugin uses the sampler_base base class. This man
page covers only the configuration attributes, or those with default
values, specific to this plugin; see ldms_sampler_base.man for the
attributes of the base class.

The GEOPM LDMS sampler can be configured with the same config parameters
as other LDMS samplers (e.g., \``name`\`, \``producer`\`,
\``component_id`\`). In addition to these parameters, the sampler must
be configured with the option \``geopm_request_path=<path>`\`.

**config**
   | name=<plugin_name> geopm_request_path=<path> [schema=<schema_name>]
   | configuration line

   name=<plugin_name>
      |
      | This MUST be ldms_geopm_sampler.

   geopm_request_path=<path>
      |
      | This parameter points to the absolute path of the ASCII file
        containing the list of signals that the user would like to have
        monitored by the sampler.

      The format of this file is a three column white space delimited
      file. Each line must contain a GEOPM PlatformIO request of the
      form:

      **<SIGNAL_NAME> <DOMAIN> <DOMAIN_INDEX>**

      The signal name must be a signal supported by GEOPM on the
      system. To see a full list of supported signals run the
      geopmread(1) command without any options. The domain must match
      one of the GEOPM domains. Run the geopmread(1) command with the -d
      option to see a full list of supported domains and the number of
      instances of each on the system. The domain index provided must be
      greater than or equal to zero and less than the number of
      available domains.

   schema=<schema_name>
      |
      | Optional schema name. It is intended that the same sampler on
        different nodes with different metrics have a different schema.
        If not specified, will default to \`ldms_geopm_sampler\`.

EXAMPLES
=========================

**CONFIGURING LDMSD WITH THE SAMPLER**

Within ldmsd_controller or a configuration file:

::

   load name=ldms_geopm_sampler
   config name=ldms_geopm_sampler producer=${HOSTNAME} geopm_request_path=/etc/ldms/geopm_sampler_request.config
   start name=ldms_geopm_sampler interval=1000000

Here's an example of a file containing the list of signals:

::

   $> cat geopm_sampler_signal.config
   CPU_FREQUENCY_MAX board 0
   CPU_FREQUENCY_MIN board 0
   CPU_FREQUENCY_STEP board 0
   CPU_FREQUENCY_STICKER board 0
   TIME board 0
   ENERGY_PACKAGE board 0
   INSTRUCTIONS_RETIRED board 0
   POWER_DRAM board 0
   POWER_PACKAGE board 0
   POWER_PACKAGE_LIMIT board 0
   POWER_PACKAGE_MAX board 0
   POWER_PACKAGE_MIN board 0
   POWER_PACKAGE_TDP board 0
   TEMPERATURE_CORE board 0
   TEMPERATURE_PACKAGE board 0
   TIMESTAMP_COUNTER board 0

Note the inclusion of the *geopm_request_path* parameter passed to the
*config* instruction. Also, note the name of the sampler
*ldms_geopm_sampler* passed to the *name* parameter for the *load* and
*start* instructions.

**RUNNING LDMSD WITH THE SAMPLER**

In order to run the GEOPM LDMS sampler, follow the same steps as you
would for any other LDMS sampler.
Start the \``ldmsd`\` daemon on the
target node to be monitored. Example below:

::

   ldmsd -x sock:10444 -F -c <config file> -l ${TEST_PATH}/temp/demo_ldmsd_log

For observing the progress of the sampler, you may choose to add the
option \``-v DEBUG`\` above. While the \``ldmsd`\` daemon is running,
the user may choose to query for a single instantaneous sample set
comprising recently monitored signals. This can be achieved by using the
existing commandline tool \``ldms_ls`\` available as part of the
installation of the LDMS framework. An example is shown below:

::

   $> ldms_ls -h localhost -x sock -p 10444 -l -v
   Schema             Instance                   Flags Msize Dsize Hsize UID  GID  Perm       Update            Duration Info
   ------------------ -------------------------- ----- ----- ----- ----- ---- ---- ---------- ----------------- -------- ------------------------------
   ldms_geopm_sampler <host>/ldms_geopm_sampler  CL    1352  240   0     1024 100  -r--r----- 1656431193.051578 0.000323 "updt_hint_us"="1000000:50000"
   ------------------ -------------------------- ----- ----- ----- ----- ---- ---- ---------- ----------------- -------- ------------------------------
   Total Sets: 1, Meta Data (kB): 1.35, Data (kB) 0.24, Memory (kB): 1.59

   ========================================================================================

   <host>/ldms_geopm_sampler: consistent, last update: Tue Jun 28 08:46:33 2022 -0700 [51578us]
   M u64 component_id 1
   D u64 job_id 0
   D u64 app_id 0
   D d64 CPU_FREQUENCY_MAX_board_0 3700000000.000000
   D d64 CPU_FREQUENCY_MIN_board_0 1000000000.000000
   D d64 CPU_FREQUENCY_STEP_board_0 100000000.000000
   D d64 CPU_FREQUENCY_STICKER_board_0 2100000000.000000
   D d64 TIME_board_0 6.899751
   D d64 ENERGY_PACKAGE_board_0 334936.207092
   D d64 INSTRUCTIONS_RETIRED_board_0 131016700.000000
   D d64 POWER_DRAM_board_0 0.900889
   D d64 POWER_PACKAGE_board_0 25.469352
   D d64 POWER_PACKAGE_LIMIT_board_0 140.000000
   D d64 POWER_PACKAGE_MAX_board_0 594.000000
   D d64 POWER_PACKAGE_MIN_board_0 140.000000
   D d64 POWER_PACKAGE_TDP_board_0 280.000000
   D d64 TEMPERATURE_CORE_board_0 26.454545
   D d64 TEMPERATURE_PACKAGE_board_0 28.000000
   D d64 TIMESTAMP_COUNTER_board_0 10913748924506.000000

SEE ALSO
=========================

ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7),
geopm(7), geopm_pio(7), geopmread(1), geopmwrite(1)

diff --git a/rtd/docs/source/sampler_man/Plugin_hello_sampler.rst b/rtd/docs/source/sampler_man/Plugin_hello_sampler.rst
new file mode 100644
index 000000000..24ff1ae8c
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_hello_sampler.rst
@@ -0,0 +1,89 @@
====================
Plugin_hello_sampler
====================

:Date: 21 Aug 2021

.. contents::
   :depth: 3
..

NAME
=====================

Plugin_hello_sampler - man page for the LDMS hello_sampler plugin

SYNOPSIS
=========================

| Within ldmsd_controller or a configuration file:
| config name=hello_sampler [ <attr>=<value> ]

DESCRIPTION
============================

With LDMS (Lightweight Distributed Metric Service), plugins for the
ldmsd (ldms daemon) are configured via ldmsd_controller or a
configuration file. The hello_sampler plugin does not actually sample,
but rather subscribes to an ldmsd_stream and writes the stream data to
the ldmsd logfile.

CONFIGURATION ATTRIBUTE SYNTAX
===============================================

The hello_sampler plugin uses the sampler_base base class.
This man page +covers only the configuration attributes, or those with default values, +specific to the this plugin; see ldms_sampler_base.man for the +attributes of the base class. + +**config** + | name= stream= + | configuration line + + name= + | + | This MUST be hello_sampler. + + stream= + | + | Name of the stream to which to subscribe. + +BUGS +===================== + +No known bugs. + +EXAMPLES +========================= + +Within ldmsd_controller or a configuration file: + +:: + + load name=hello_sampler + config name=hello_sampler producer=host1 instance=host1/hello_sampler stream=foo component_id=1 + start name=hello_sampler interval=1000000 offset=0 + +:: + + > ./hello_publisher -x sock -h localhost -p 16000 -a munge -s foo -m "foo" -t str + The data was successfully published. + The server responded with 0 + + > ./hello_publisher -x sock -h localhost -p 16000 -a munge -s foo -m "bar" -t str + The data was successfully published. + The server responded with 0 + + + In the log file of the ldmsd: + > cat log.txt + Mon May 04 19:44:05 2020: CRITICAL : stream_type: STRING, msg: "foo", msg_len: 4, entity: (nil) + Mon May 04 19:44:24 2020: CRITICAL : stream_type: STRING, msg: "bar", msg_len: 4, entity: (nil) + + Note that the hello_streams sampler does not do a sample, instead it subscribes to the stream with a callback and prints out what it got off the stream. + +SEE ALSO +========================= + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7), +ldmsd_stream_publish(7), Plugin_stream_csv_store(7) diff --git a/rtd/docs/source/sampler_man/Plugin_ibmad_records_sampler.rst b/rtd/docs/source/sampler_man/Plugin_ibmad_records_sampler.rst new file mode 100644 index 000000000..959128b2b --- /dev/null +++ b/rtd/docs/source/sampler_man/Plugin_ibmad_records_sampler.rst @@ -0,0 +1,139 @@ +============================ +Plugin_ibmad_records_sampler +============================ + +:Date: 1 May 2019 + +.. contents:: + :depth: 3 +.. + +NAME +============================= + +Plugin_ibmad_records_sampler - man page for the LDMS +ibmad_records_sampler plugin + +SYNOPSIS +================================= + +| Within ldmsd_controller or a configuration file: +| config name=ibmad_records_sampler [ = ] + +DESCRIPTION +==================================== + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. The ibmad_records_sampler plugin provides a single +metric set that contains a list of records. Each record contains all of +the metrics for a single infiniband port. + +The schema is named "ibmad" by default. + +NOTE: This plugin will not currently work with virtual IB devices. + +CONFIGURATION ATTRIBUTE SYNTAX +======================================================= + +**config** + | name= [schema=] [job_set=] + | configuration line + + name= + | + | This MUST be ibmad_records_sampler. + + schema= + | + | The schema name defaults to "ibmad", but it can be renamed at + the user's choice. + + rate=0 + | + | Stop the default inclusion of rate values in the set. + + job_set= + | + | The name of the metric set that contains the job id information + (default=job_id) + + include=PORTLIST + | + | Ignore any devices and ports discovered that are not matched by + PORTLIST. See PORTLIST below. Cannot be combined with the + exclude option. + + exclude=PORTLIST + | + | Collect all devices and ports discovered and active that are not + matched by PORTLIST. See PORTLIST below. 
Cannot be combined with + the include option. + + refresh_interval_sec= + | + | (Optional) The sampler caches the list of infiniband devices, + and that cache is refreshed at the beginning of a sample cycle + if the refresh interval time has been exceeded. + refresh_interval_sec sets the minimum number of seconds between + refreshes of the device cache. The default refresh interval is + 600 seconds. + +PORTLIST +================================= + +Providing a port list specification will stop the automated discovery +process at every sample time from requerying devices and ports that are +not of interest, eliminating nuisance log messages from the MAD +libraries. Such messages are frequently seen on systems using +SocketDirect hardware. + +The port list is a comma-separated list of CA name and optionally +number. E.g. "mlx4_0.1,mlx4_1". A device name specified without a port +number (.N) matches all ports on that device. The maximum port number +supported for a single device is 63. Including a device or port which +does not exist or is not active in the port list has no effect on the +metric sets reported. + +BUGS +============================= + +No known bugs. + +NOTES +============================== + +The rates reported are computed from the last sample taken and the +present sample; however the last sample may not have been stored +downstream and the sample interval size may vary due to kernel wakeup +variations. Rate values are set to -1 for samples where the rate +computation is invalid. + +EXAMPLES +================================= + +Within ldmsd_controller or a configuration file: + +:: + + load name=ibmad_records_sampler + config name=ibmad_records_sampler + start name=ibmad_records_sampler interval=1000000 + +:: + + load name=ibmad_records_sampler + config name=ibmad_records_sampler include=hfi1_0.1 rate=0 + start name=ibmad_records_sampler interval=1000000 + +:: + + load name=ibmad_records_sampler + config name=ibmad_records_sampler exclude=mlx5_0.2,mlx5_0.3,mlx5_0.4, + start name=ibmad_records_sampler interval=1000000 + +SEE ALSO +================================= + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8) diff --git a/rtd/docs/source/sampler_man/Plugin_ibmad_sampler.rst b/rtd/docs/source/sampler_man/Plugin_ibmad_sampler.rst new file mode 100644 index 000000000..d4ac045c3 --- /dev/null +++ b/rtd/docs/source/sampler_man/Plugin_ibmad_sampler.rst @@ -0,0 +1,128 @@ +==================== +Plugin_ibmad_sampler +==================== + +:Date: 1 May 2019 + +.. contents:: + :depth: 3 +.. + +NAME +===================== + +Plugin_ibmad_sampler - man page for the LDMS ibmad_sampler plugin + +SYNOPSIS +========================= + +| Within ldmsd_controller or a configuration file: +| config name=ibmad_sampler [ = ] + +DESCRIPTION +============================ + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. The ibmad_sampler plugin provides a metric set for +each infiniband port discovered on the node. + +The schema is named "ibmad_sampler" by default. + +NOTE: This plugin will not currently work with virtual IB devices. + +CONFIGURATION ATTRIBUTE SYNTAX +=============================================== + +**config** + | name= [schema=] [job_set=] + | configuration line + + name= + | + | This MUST be ibmad_sampler. + + schema= + | + | The schema name defaults to "ibmad_sampler", but it can be + renamed at the user's choice. 
+ + rate=0 + | + | Stop the default inclusion of rate values in the set. + + job_set= + | + | The name of the metric set that contains the job id information + (default=job_id) + + include=PORTLIST + | + | Ignore any devices and ports discovered that are not matched by + PORTLIST. See PORTLIST below. Cannot be combined with the + exclude option. + + exclude=PORTLIST + | + | Collect all devices and ports discovered and active that are not + matched by PORTLIST. See PORTLIST below. Cannot be combined with + the include option. + +PORTLIST +========================= + +Providing a port list specification will stop the automated discovery +process at every sample time from requerying devices and ports that are +not of interest, eliminating nuisance log messages from the MAD +libraries. Such messages are frequently seen on systems using +SocketDirect hardware. + +The port list is a comma-separated list of CA name and optionally +number. E.g. "mlx4_0.1,mlx4_1". A device name specified without a port +number (.N) matches all ports on that device. The maximum port number +supported for a single device is 63. Including a device or port which +does not exist or is not active in the port list has no effect on the +metric sets reported. + +BUGS +===================== + +No known bugs. + +NOTES +====================== + +The rates reported are computed from the last sample taken and the +present sample; however the last sample may not have been stored +downstream and the sample interval size may vary due to kernel wakeup +variations. Rate values are set to -1 for samples where the rate +computation is invalid. + +EXAMPLES +========================= + +Within ldmsd_controller or a configuration file: + +:: + + load name=ibmad_sampler + config name=ibmad_sampler + start name=ibmad_sampler interval=1000000 + +:: + + load name=ibmad_sampler + config name=ibmad_sampler include=hfi1_0.1 rate=0 + start name=ibmad_sampler interval=1000000 + +:: + + load name=ibmad_sampler + config name=ibmad_sampler exclude=mlx5_0.2,mlx5_0.3,mlx5_0.4, + start name=ibmad_sampler interval=1000000 + +SEE ALSO +========================= + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8) diff --git a/rtd/docs/source/sampler_man/Plugin_ibnet.rst b/rtd/docs/source/sampler_man/Plugin_ibnet.rst new file mode 100644 index 000000000..0bfe0c6c8 --- /dev/null +++ b/rtd/docs/source/sampler_man/Plugin_ibnet.rst @@ -0,0 +1,186 @@ +============ +Plugin_ibnet +============ + +:Date: 19 May 2020 + +.. contents:: + :depth: 3 +.. + +NAME +============= + +Plugin_ibnet - man page for the LDMS ibnet plugin + +SYNOPSIS +================= + +| Within ldmsd_controller or a configuration file: +| config name=ibnet [ = ] + +DESCRIPTION +==================== + +The ibnet plugin provides port info from InfiniBand equipment supporting +extended hardware counters. Each port is handled in a separate data set. +Overall timing of the data collection process is handled in another +optional data set. Plugins for the ldmsd (ldms daemon) are configured +via ldmsd_controller or a configuration file. + +CONFIGURATION ATTRIBUTE SYNTAX +======================================= + +**config** + | name= port-name= source-list= + [port-number=] [metric-conf=] + [node-name-map=] [timing=] [millis=] + [producer=] [instance=] [component_id=] + [schema=] [uid=] [gid=] [perm=] [debug] + | configuration line + + name= + | + | This MUST be ibnet. + + producer=. + | + | The producer string value for the timing set. Default is the + result of gethostname(). 
+ + instance= + | + | The name of the timing metric set. Default is + $producer/ibnet_timing. + + source-list= + | + | Lidfile is the name of a file of LID/port specifications. See + PORT FILE for format details. + + port-name= [port-number=] + | + | Hca is the name of the local IB interface to access the network. + Num is the number of the port on the interface used to access + the network. The default is 1. + + schema= + | + | Optional schema base name. The default is ibnet. The name base + is suffixed to create uniquely defined schema names based on the + plugin options specified. + + component_id= + | + | Optional component identifier for the timing set. Defaults to + zero. + + metric-conf= + | + | The file listing the metric groups to collect. See METRIC GROUPS + below. + + ca_port= + | + | The port number to use, which must be active. + + millis= + | + | The number of milliseconds of the timeout on the MAD calls. + Default 0, which will use the mad library timeout of 1 second. + + timing= + | + | Disable timing (T=0), enable aggregate timing (T=1), or enable + individual port timing(T=2) or enable port offset timing(T=3). + The metric set will contain sampling process timing metrics if T + > 0. + + node-name-map= + | + | The file name nnmap, as used by ibnetdiscover and opensm, of a + mapping from IB GUIDs to short names of IB hardware items + (switch, node, etc) suitable for use in populating names of + sets. + +PORT FILE +================== + +The lid/port file format is + +:: + + lid, hexguid, nports, plist + * where hexguid is 0x...., + * nports is int, + * plist is ints nports long or * if range is 1-nports, + * if not using a name map, names will be GUID_hex. + +The portrange will be an integer expression in the style 1,5,7-9,13, +without repeats, whitespace, reversed ranges, or overlapping ranges. LID +is an integer in the range 0-65535. The same LID may be on multiple +lines so long as the ports listed for it are not repeated. + +The lid file can be generated with ldms-gen-lidfile.sh. + +METRIC GROUPS +====================== + +The metric groups file contains a list of items, one per line, naming +groups of metrics to collect. The groups are named corresponding to +groups in the infiniband-diags perfquery utility options. The +correspondence is not exact. To disable a listed metric group, delete +its name from the file or comment it out by prepending a # to the group, +e.g. '#xmtsl'. '#' followed by whitespace is not allowed. Carriage +returns are optional. + +INTERNAL METRICS +========================= + +port_query_time + | + | Time in seconds spend in the single port MAD call. + +port_query_offset + | + | Time in microseconds from start of all MAD calls in the current + update to the end of the mad call for the specific port. + +ib_query_time + | + | Time in seconds making all MAD calls in the update. + +ib_data_process_time + | + | Time in seconds decoding all MAD data in the update + +BUGS +============= + +The perfquery extended_speeds option is not supported. + +EXAMPLES +================= + +Within ldmsd_controller or a configuration file: + +:: + + load name=ibnet + config name=ibnet producer=compute1 instance=compute1/ibnet component_id=1 port-name=mlx5_0 source-list=/path/lidfile + start name=ibnet interval=1000000 + +NOTES +============== + +The exact schema name that will be generated can be determined using the +ldms_ibnet_schema_name utility. The subsets available from the fabric +depend on the hardware, firmware, and in some cases the subnet manager +versions. 
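
As a minimal sketch of the PORT FILE format described above (the LID,
GUID, and port values are purely illustrative; generate a real file
with ldms-gen-lidfile.sh):

::

   # lid, hexguid, nports, plist
   17, 0x0002c90300a8f2f0, 36, *
   18, 0x0002c90300a8f2f4, 2, 1,2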
+ +SEE ALSO +================= + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), +ldms_ibnet_schema_name(1), ldms-ibnet-sampler-gen(1). diff --git a/rtd/docs/source/sampler_man/Plugin_ipmireader.rst b/rtd/docs/source/sampler_man/Plugin_ipmireader.rst new file mode 100644 index 000000000..a7c4cd271 --- /dev/null +++ b/rtd/docs/source/sampler_man/Plugin_ipmireader.rst @@ -0,0 +1,134 @@ +================= +Plugin_ipmireader +================= + +:Date: 18 Feb 2019 + +.. contents:: + :depth: 3 +.. + +NAME +================== + +Plugin_ipmireader - man page for the LDMS ipmireader plugin + +SYNOPSIS +====================== + +| Within ldmsd_controller or a configuration file: +| config name=ipmireader [ = ] + +DESCRIPTION +========================= + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. The ipmireader plugin provides data from the result +of the ipmitool sdr command. All data is reported out as floats. + +**This sampler is currently in pre-release development in V4.2.** + +CONFIGURATION ATTRIBUTE SYNTAX +============================================ + +The ipmireader plugin uses the sampler_base base class. This man page +covers only the configuration attributes, or those with default values, +specific to the this plugin; see ldms_sampler_base.man for the +attributes of the base class. + +**config** + | name= [schema=] [ = ... ] + | configuration line + + name= + | + | This MUST be ipmireader. + + schema= + | + | Optional schema name. It is intended that the same sampler on + different nodes with different metrics have a different schema. + If not specified, will default to \`ipmireader\`. + + address=
      |
      | address of the host to contact. H flag in the ipmitool command.

   username=<username>
      |
      | username for the query. U flag in the ipmitool command. Defaults
        to 'admin'.

   password=<password>
      |
      | password for the query. P flag in the ipmitool command. Defaults
        to 'password'.

   sdrcache=<path>
      |
      | output for the sdr cache file, to improve performance. Optional.

   retry=<seconds>
      |
      | interval to retry creating set if initially fails (host down).
        Default 600 sec.

BUGS
==================

No known bugs.

NOTES
===================

- This sampler is currently in pre-release development in V4.2.

- Parameters in the ipmitool call are: -N1 (timeout for LAN interface)
  -R1 (number of retries for LAN interface). These are in order to
  reduce the time waiting for a non-responsive node.

- The ipmitool command appears to have less overhead than ipmi-sensors
  and so is preferred over the ipmisensors sampler for single node
  calls.

- If the dump cache command fails, this is not reported. If the file
  does not exist after a short sleep, there is a log message. Without
  the sdr file, the sampler will continue. On one system, using the
  cached sdr information reduces the call response time by about 0.5
  seconds. This manifests itself in the timestamp of the call.

- There is a one-time occurrence of a sleep of 2 seconds (empirically
  chosen) after the dump cache command, to enable the file to be
  written by the time of the next data call. If it takes longer, but is
  in place for later sample calls, it will be used then.

- There is currently no call to redump the file.

- There is no way to check that a dumped file is still accurate for
  your system.

- Currently all the data is reported as type float.

- In case of a significant error, or if the file cannot be opened, all
  metrics are set to the FAIL value, which currently is -8888. In case
  of a metric error, for example a missing fan causing the reported
  value to be non-numeric, the metric is set to the ERR value, which
  currently is -9999.

EXAMPLES
======================

Within ldmsd_controller or a configuration file:

::

   load name=ipmireader
   config name=ipmireader producer=vm1_1 instance=vm1_1/ipmireader address=cn1-ipmi
   start name=ipmireader interval=1000000

SEE ALSO
======================

ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7),
Plugin_ipmisensors(7)

diff --git a/rtd/docs/source/sampler_man/Plugin_ipmisensors.rst b/rtd/docs/source/sampler_man/Plugin_ipmisensors.rst
new file mode 100644
index 000000000..58d7c2a35
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_ipmisensors.rst
@@ -0,0 +1,119 @@
==================
Plugin_ipmisensors
==================

:Date: 21 Mar 2019

.. contents::
   :depth: 3
..

NAME
===================

Plugin_ipmisensors - man page for the LDMS ipmisensors plugin

SYNOPSIS
=======================

| Within ldmsd_controller or a configuration file:
| config name=ipmisensors [ <attr>=<value> ]

DESCRIPTION
==========================

With LDMS (Lightweight Distributed Metric Service), plugins for the
ldmsd (ldms daemon) are configured via ldmsd_controller or a
configuration file. The ipmisensors plugin provides data from the result
of the ipmi-sensors command. Specific parameters for the command are
described below. All data is reported out as floats.
+ +**This sampler is currently in pre-release development in V4.2.** + +CONFIGURATION ATTRIBUTE SYNTAX +============================================= + +The ipmisensors plugin uses the sampler_base base class. This man page +covers only the configuration attributes, or those with default values, +specific to the this plugin; see ldms_sampler_base.man for the +attributes of the base class. + +**config** + | name= [schema=] [ = ... ] + | configuration line + + name= + | + | This MUST be ipmisensors. + + schema= + | + | Optional schema name. It is intended that the same sampler on + different nodes with different metrics have a different schema. + If not specified, will default to \`ipmisensors\`. + + address=
      |
      | address of the host to contact. h flag in the ipmi-sensors
        command.

   username=<username>
      |
      | username for the query. u flag in the ipmi-sensors command.
        Defaults to 'admin'.

   password=<password>
      |
      | password for the query. p flag in the ipmi-sensors command.
        Defaults to 'password'.

BUGS
===================

No known bugs.

NOTES
====================

- This sampler is currently in pre-release development in V4.2.

- The ipmi-sensors call appears to have more overhead than the ipmitool
  command for single node queries, and so the ipmireader sampler is
  preferred.

- Specific args to the command are: --comma-separated-output
  --no-header-output --session-timeout=500 --retransmission-timeout=250
  --quiet-cache --no-sensor-type. Of note are the timeouts. These will
  limit how long the call will wait (and thus the duration of the
  sample) if a host is not responding.

- The ipmi-sensors call can be invoked with a fan out. This would cause
  significant parsing in the return, so it is not used here. Also the
  return of the fan out call will wait on the return of all the
  individual calls. Thus a non-responsive node can cause a long delay,
  affecting all values, without a timeout.

- Currently all the data is reported as type float.

- In case of a significant error, or if the file cannot be opened, all
  metrics are set to the FAIL value, which currently is -8888. In case
  of a metric error, for example a missing fan causing the reported
  value to be non-numeric, the metric is set to the ERR value, which
  currently is -9999.

EXAMPLES
=======================

Within ldmsd_controller or a configuration file:

::

   load name=ipmisensors
   config name=ipmisensors producer=vm1_1 instance=vm1_1/ipmisensors address=cn1-ipmi
   start name=ipmisensors interval=1000000

SEE ALSO
=======================

ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7),
Plugin_ipmireader(7)

diff --git a/rtd/docs/source/sampler_man/Plugin_kgnilnd.rst b/rtd/docs/source/sampler_man/Plugin_kgnilnd.rst
new file mode 100644
index 000000000..a238a8a00
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_kgnilnd.rst
@@ -0,0 +1,72 @@
==============
Plugin_kgnilnd
==============

:Date: 10 Feb 2018

.. contents::
   :depth: 3
..

NAME
===============

Plugin_kgnilnd - man page for the LDMS kgnilnd plugin

SYNOPSIS
===================

| Within ldmsd_controller or in a configuration file
| config name=kgnilnd [ <attr>=<value> ]

DESCRIPTION
======================

With LDMS (Lightweight Distributed Metric Service), plugins for the
ldmsd (ldms daemon) are configured via ldmsd_controller or a
configuration file. The kgnilnd plugin provides Cray specific info from
/proc/kgnilnd.

CONFIGURATION ATTRIBUTE SYNTAX
=========================================

The kgnilnd plugin uses the sampler_base base class. This man page
covers only the configuration attributes, or those with default values,
specific to this plugin; see ldms_sampler_base.man for the
attributes of the base class.

**config**
   | name=<plugin_name> [schema=<schema_name>]
   | configuration line

   name=<plugin_name>
      |
      | This MUST be kgnilnd.

   schema=<schema_name>
      |
      | Optional schema name. It is intended that the same sampler on
        different nodes with different metrics have a different schema.
        If not specified, will default to \`kgnilnd\`.

BUGS
===============

No known bugs.
+ +EXAMPLES +=================== + +Within ldmsd_controller or in a configuration file + +:: + + load name=kgnilnd + config name=kgnilnd producer=vm1_1 instance=vm1_1/kgnilnd + start name=kgnilnd interval=1000000 + +SEE ALSO +=================== + +ldmsd(8), Plugin_cray_system_sampler_variants(7), ldms_quickstart(7), +ldmsd_controller(8), ldms_sampler_base(7) diff --git a/rtd/docs/source/sampler_man/Plugin_linux_proc_sampler.rst b/rtd/docs/source/sampler_man/Plugin_linux_proc_sampler.rst new file mode 100644 index 000000000..9e2cbd280 --- /dev/null +++ b/rtd/docs/source/sampler_man/Plugin_linux_proc_sampler.rst @@ -0,0 +1,426 @@ +========================= +Plugin_linux_proc_sampler +========================= + +:Date: 15 Jul 2021 + +.. contents:: + :depth: 3 +.. + +NAME +========================== + +Plugin_linux_proc_sampler - man page for the LDMS linux_proc_sampler +plugin + +SYNOPSIS +============================== + +| Within ldmsd_controller or a configuration file: +| config name=linux_proc_sampler [common attributes] [stream=STREAM] + [metrics=METRICS] [cfg_file=FILE] [instance_prefix=PREFIX] + [exe_suffix=1] [argv_sep=] [argv_msg=1] [argv_fmt=<1,2>] + [env_msg=1] [env_exclude=EFILE] [fd_msg=1] [fd_exclude=EFILE] + +DESCRIPTION +================================= + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. The linux_proc_sampler plugin provides data from +/proc/, creating a different set for each process identified in the +named stream. The stream can come from the ldms-netlink-notifier daemon +or the spank plugin slurm_notifier. The per-process data from +/proc/self/environ and /proc/self/cmdline can optionally be published to +streams. + +CONFIGURATION ATTRIBUTE SYNTAX +==================================================== + +The linux_proc_sampler plugin uses the sampler_base base class. This man +page covers only the configuration attributes, or those with default +values, specific to the this plugin; see ldms_sampler_base.man for the +attributes of the base class. + +**config** + | name= [other options] + | configuration line + + name= + | + | This MUST be linux_proc_sampler. + + instance_prefix=PREFIX + | + | Prepend PREFIX to the set instance names. Typically a cluster + name when needed to disambiguate producer names that appear in + multiple clusters. (default: no prefix). + + exe_suffix=1 + | + | If present, set instance names are appended with the full path + of the executable. This is most likely useful for debugging + configuration of the notifier up-stream using ldms_ls. (default: + no such suffix) + + sc_clk_tck=1 + | + | If present, include sc_clk_tck in the metric set. sc_clk_tck is + the ticks per second from sysconf(\_SC_CLK_TCK). (default: not + included). + + stream=STREAM + | + | The name of the \`ldmsd_stream\` to listen for SLURM job events. + (default: slurm). + + argv_sep= + | + | Replace nul within the cmdline string with char. Special + specifiers \\b,\\n,\\t,\\v,\\r,\\f are also supported. + + syscalls=FILE + | + | File mapping syscall integers to symbolic names. Not needed + unless syscall_name is included in the metrics. See FILES for + details. + + metrics + | + | The comma-separated list of metrics to monitor. The default is + (empty), which is equivalent to monitor ALL metrics. + + cfg_file=CFILE + | + | The alternative configuration file in JSON format. 
The file is + expected to have an object that contains the following + attributes: { "stream": "STREAM_NAME", "syscalls" : "/file", + "metrics": [ comma-separated-quoted-strings ] }. If the + \`cfg_file\` is given, all other sampler-specific options given + on the key=value line are ignored. + + argv_msg=1 + | + | Publish the argv items to a stream named _argv, where if + the schema is not specified, the default schema is + linux_proc_sampler. (Default: argv_msg=0; no publication of + argv). E.g. a downstream daemon will need to subscribe to + linux_proc_sampler_argv to receive the published messages and + store them. + + argv_fmt=<1,2> + | + | Publish the argv items formatted as (1) a json list of strings + ['argv0', 'argv1'] or (2) a json list of key/value tuples, e.g. + [ {"k":0, "v":"argv[0]"}, {"k":1, "v":"argv[1]"}]. + + env_msg=1 + | + | Publish the environment items to a stream named _env, + where if the schema is not specified, the default SCHEMA is + linux_proc_sampler. (Default: env_msg=0; no publication of the + environment). Environment data is published as a list in the + style of argv_fmt=2. E.g. a downstream daemon will need to + subscribe to linux_proc_sampler_env to receive the published + messages and store them. + + env_exclude=ELIST + | + | Exclude the environment items named with regular expressions in + ELIST. On the configuration key=value line, ELIST must be a file + name of a file containing a list of regular expressions one per + line. An environment variable that matches any of the listed + regular expressions will be excluded. When used in the cfg_file, + the env_exclude value may be either the string name of the + regular expression file or a JSON array of expression strings as + shown in EXAMPLES. + + fd_exclude=ELIST + | + | Exclude the files named with regular expressions in ELIST. On + the configuration key=value line, ELIST must be a file name of a + file containing a list of regular expressions one per line. A + file that matches any of the listed regular expressions will be + excluded. When used in the cfg_file, the fd_exclude value may be + either the string name of the regular expression file or a JSON + array of expression strings as shown in EXAMPLES. + + fd_msg=N + | + | Publish new /proc/pid/fd scan data to the _files stream + every N-th sample, where if the schema is not specified, the + default SCHEMA is linux_proc_sampler. (Default: fd_msg=0; no + publication of the file details). A downstream daemon will need + to subscribe to linux_proc_sampler_files to receive the + published messages and store them. Files that are not opened + long enough to be caught in a scan of fds will be missed. Files + will be reported as 'opened' the first time seen and as 'closed' + when they are no longer seen. A file both no longer seen and no + longer existing will be reported as 'deleted'. Only regular + files (not sockets, etc) are reported, and additionally files + matching the fd_expressions are ignored. Use a larger N to + reduce the scan overhead at the cost of missing short-access + files. If a close-reopen of the same file occurs between scans, + no corresponding events are generated. + + published_pid_dir= + | + | Name of the directory where netlink-notifier or other notifier + pids of interest may be found. This directory is scanned at + sampler startup only, so that pids which were the subject of + events published before the sampler started can be tracked. If + not specified, the default directory is + /var/run/ldms-netlink-tracked. 
Absence of this directory is not + a sampler configuration error, as ldmsd may start before the + notifier process. When starting, the sampler will clean up any + stale pid references found in this directory. Any pid not + appearing in this directory is not being tracked. + +INPUT STREAM FORMAT +========================================= + +The named ldmsd stream should deliver messages with a JSON format which +includes the following. Messages which do not contain event, data, +job_id, and some form of PID will be ignored. Extra fields will be +ignored. + +:: + + { "event" = "$e", + "data" : { + "job_id" : INT, + "task_pid" : INT, + "os_pid" : INT, + "parent_pid" : INT, + "is_thread" : INT, + "exe" : STRING, + "start" : STRING, + "start_tick" : STRING + } + } + +where $e is one of task_init_priv or task_exit. The data fields other +than job_id are all optional, but at least one of os_pid and task_pid +must contain the PID of a process to be monitored. If present and > 0, +task_pid should be the value taken from SLURM_TASK_PID or an equivalent +value from another resource management environment. The value of start, +if provided, should be approximately the epoch time ("%lu.%06lu") when +the PID to be monitored started. + +OUTPUT STREAM FORMAT +========================================== + +The json formatted output for argv and environment values includes a +common header: + +:: + + { + "producerName":"localhost1", + "component_id":1, + "pid":8991, + "job_id":0, + "timestamp":"1663086686.947600", + "task_rank":-1, + "parent":1, + "is_thread":0, + "exe":"/usr/sbin/ldmsd", + "data":[LIST] + +where LIST is formatted as described for argv_fmt option. + +EXAMPLES +============================== + +Within ldmsd_controller or a configuration file: + +:: + + load name=linux_proc_sampler + config name=linux_proc_sampler producer=vm1_1 instance=vm1_1/linux_proc_sampler metrics=stat_comm,stat_pid,stat_cutime + start name=linux_proc_sampler interval=1000000 + +An example metrics configuration file is: + +:: + + { + "stream": "slurm", + "instance_prefix" : "cluster2", + "syscalls": "/etc/sysconfig/ldms.d/plugins-conf/syscalls.map", + "env_msg": 1, + "argv_msg": 1, + "fd_msg" : 1, + "fd_exclude": [ + "/dev/", + "/run/", + "/var/", + "/etc/", + "/sys/", + "/tmp/", + "/proc/", + "/ram/tmp/", + "/usr/lib" + ], + "env_exclude": [ + "COLORTERM", + "DBU.*", + "DESKTOP_SESSION", + "DISPLAY", + "GDM.*", + "GNO.*", + "XDG.*", + "LS_COLORS", + "SESSION_MANAGER", + "SSH.*", + "XAU.*" + ], + "metrics": [ + "stat_pid", + "stat_state", + "stat_rss", + "stat_utime", + "stat_stime", + "stat_cutime", + "stat_cstime", + "stat_num_threads", + "stat_comm", + "n_open_files", + "io_read_b", + "io_write_b", + "status_vmdata", + "status_rssfile", + "status_vmswap", + "status_hugetlbpages", + "status_voluntary_ctxt_switches", + "status_nonvoluntary_ctxt_switches", + "syscall_name" + ] + } + +Generating syscalls.map: + +:: + + # ldms-gen-syscalls-map > /etc/sysconfig/ldms.d/plugins-conf/syscalls.map + +Obtaining the currently supported optional metrics list: + +:: + + ldms-plugins.sh linux_proc_sampler + +FILES +=========================== + +Data is obtained from (depending on configuration) the following files +in /proc/[PID]/: + +:: + + cmdline + exe + statm + stat + status + fd + io + oom_score + oom_score_adj + root + syscall + timerslack_ns + wchan + +The system call integer:name mapping varies with kernel and is therefore +read from an input file of the format: + +:: + + # comments + 0 read + ... 
+ +where all lines are pairs. This file can be created from the +output of ldms-gen-syscall-map. System call names must be less than 64 +characters. Unmapped system calls will be given names of the form +SYS\_. + +The env_msg option can have its output filtered by json or a text file, +e.g.: + +:: + + # env var name regular expressions (all OR-d together) + COLORTERM + DBU.* + DESKTOP_SESSION + DISPLAY + GDM.* + GNO.* + XDG.* + LS_COLORS + SESSION_MANAGER + SSH.* + XAU.* + +The fd_msg option can have its output filtered by json or a text file, +e.g.: + +:: + + /dev/ + /run/ + /var/ + /etc/ + /sys/ + /tmp/ + /proc/ + /ram/tmp/ + /usr/lib64/ + /usr/lib/ + +The files defined with published_pid_dir appear in (for example) + +:: + + /var/run/ldms-netlink-tracked/[0-9]* + +and each contains the JSON message sent by the publisher. Publishers, +not ldmsd, populate this directory to allow asynchronous startup. + +NOTES +=========================== + +The value strings given to the options sc_clk_tck and exe_suffix are +ignored; the presence of the option is sufficient to enable the +respective features. + +Some of the optionally collected data might be security sensitive. + +The publication of environment and cmdline (argv) stream data is done +once at the start of metric collection for the process. The message will +not be reemitted unless the sampler is restarted. Also, changes to the +environment and argv lists made within a running process are NOT +reflected in the /proc data maintained by the linux kernel. The +environment and cmdline values may contain non-JSON characters; these +will be escaped in the published strings. + +The publication of file information via fd_msg information may be +effectively made one-shot-per-process by setting fd_msg=2147483647. This +will cause late-loaded plugin library dependencies to be missed, +however. + +The status_uid and status_gid values can alternatively be collected as +"status_real_user", "status_eff_user", "status_sav_user", +"status_fs_user", "status_real_group", "status_eff_group", +"status_sav_group", "status_fs_group". These string values are most +efficiently collected if both the string value and the numeric values +are collected. + +SEE ALSO +============================== + +syscalls(2), ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), +ldms_sampler_base(7), proc(5), sysconf(3), environ(3). diff --git a/rtd/docs/source/sampler_man/Plugin_lnet_stats.rst b/rtd/docs/source/sampler_man/Plugin_lnet_stats.rst new file mode 100644 index 000000000..949907086 --- /dev/null +++ b/rtd/docs/source/sampler_man/Plugin_lnet_stats.rst @@ -0,0 +1,90 @@ +================= +Plugin_lnet_stats +================= + +:Date: 18 Feb 2018 + +.. contents:: + :depth: 3 +.. + +NAME +================== + +Plugin_lnet_stats - man page for the LDMS lnet_stats plugin + +SYNOPSIS +====================== + +| Within ldmsctl +| ldmsctl> config name=lnet_stats [ = ] + +DESCRIPTION +========================= + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. The lnet_stats plugin provides memory info from +/proc/sys/lnet/stats or equivalent. + +CONFIGURATION ATTRIBUTE SYNTAX +============================================ + +The lnet_stats plugin uses the sampler_base base class. This man page +covers only the configuration attributes, or those with default values, +specific to the this plugin; see ldms_sampler_base.man for the +attributes of the base class. 
+ +**config** + | name= [schema= file=] + | ldmsctl configuration line. + + name= + | + | This MUST be lnet_stats. + + schema= + | + | Optional schema name. It is intended that the same sampler on + different nodes with different metrics have a different schema. + If not specified, will default to \`lnet_stats\`. + + file= + | + | Optional full path name of stats file to use. If not supplied, + the default search path described in NOTES is searched. + Typically, this option is only used in test environments which + may not have a real Lustre installation or in order to test + transient disappearance of the file. + +NOTES +=================== + +The default search path followed for LNET stats is: +/sys/kernel/debug/lnet/stats:/proc/sys/lnet/stats. Which file will +exist, if either, depends on the Lustre version and how many volumes are +currently mounted. Be aware that /sys/kernel/debug normally is only +readable by privileged users. + +The stats file disappears when all mounts are unmounted or not yet +mounted. While it is missing, the data set is not updated. + +This assumes the file search path as described above, instead of looking +it up from the Lustre runtime libraries. This avoids compile time +dependence on Lustre which may be upgraded independently of LDMS. This +is not considered a bug. + +EXAMPLES +====================== + +:: + + Within ldmsd_controller or a configuration file: + load name=lnet_stats + config name=lnet_stats producer=vm1_1 instance=vm1_1/lnet_stats component_id=10 + start name=lnet_stats interval=1000000 + +SEE ALSO +====================== + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7) diff --git a/rtd/docs/source/sampler_man/Plugin_loadavg.rst b/rtd/docs/source/sampler_man/Plugin_loadavg.rst new file mode 100644 index 000000000..d51797b02 --- /dev/null +++ b/rtd/docs/source/sampler_man/Plugin_loadavg.rst @@ -0,0 +1,90 @@ +========================= +Plugin_loadavg +========================= + +:Date: 7 Apr 2020 + +.. contents:: + :depth: 3 +.. + +NAME +========================== + +Plugin_loadavg - Plugin_loadavg page for the LDMS loadavg plugin + +SYNOPSIS +============================== + +| Within ldmsd_controller +| config name=loadavg [ = ] + +DESCRIPTION +================================= + +The loadavg plugin provides OS information from /proc/loadavg + +CONFIGURATION ATTRIBUTE SYNTAX +==================================================== + +This plugin uses the sampler_base base class. This man page covers only +the configuration attributes, or those with default values, specific to +the this plugin; see ldms_sampler_base.man for the attributes of the +base class. + +**config** + name= [schema=] [metrics=] [force_integer] + + name= + | + | This MUST be loadavg. + + force_integer + | + | If present, this flag forces load metrics to be stored as + integers of 100*value provided in the proc file. + + schema= + | + | Optional schema name. If schema is not specified, it will be + computed. The default name is loadavg if the metrics option is + not supplied. The default name when metrics is specified is + loadavgXXXXXX, where each X corresponds to whether or not that + metric is included. When force_integer is configured, the + loadavg prefix becomes loadavgi. + + metrics= + | + | comma separated list of metrics to include. If not given, all + are included. The complete list is load1min, load5min, + load15min, runnable, scheduling_entities, newest_pid. 
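
For instance, a minimal sketch (producer and instance names are
illustrative) restricting collection to the three load averages, which
per the rule above yields a computed schema name of the form
loadavgXXXXXX:

::

   load name=loadavg
   config name=loadavg producer=vm1_1 component_id=1 instance=vm1_1/loadavg metrics=load1min,load5min,load15min
   start name=loadavg interval=1000000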
DATA
==========================

This reports metrics from /proc/loadavg, which has the format: load1min
load5min load15min runnable/scheduling_entities newest_pid.

When force_integer is configured, the load numbers are multiplied by 100
and cast to unsigned integers as they are collected, rather than being
collected as real numbers.

EXAMPLES
==============================

Within ldmsd_controller or a configuration file:

::

   load name=loadavg
   config name=loadavg producer=vm1_1 component_id=1 instance=vm1_1/loadavg
   start name=loadavg interval=1000000

NOTES
===========================

See proc(5) for the definitions of the metrics.

SEE ALSO
==============================

proc(5), ldmsd(8), ldms_sampler_base(7), ldmsd_controller(8)

diff --git a/rtd/docs/source/sampler_man/Plugin_lustre_client.rst b/rtd/docs/source/sampler_man/Plugin_lustre_client.rst
new file mode 100644
index 000000000..341c4ea55
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_lustre_client.rst
@@ -0,0 +1,92 @@
====================
Plugin_lustre_client
====================

:Date: 1 May 2019

.. contents::
   :depth: 3
..

NAME
=====================

Plugin_lustre_client - man page for the LDMS lustre_client plugin

SYNOPSIS
=========================

| Within ldmsd_controller or a configuration file:
| config name=lustre_client [ <attr>=<value> ]

DESCRIPTION
============================

With LDMS (Lightweight Distributed Metric Service), plugins for the
ldmsd (ldms daemon) are configured via ldmsd_controller or a
configuration file. The lustre_client plugin provides a metric set for
each of the Lustre client mounts found on a node. The schema is named
"lustre_client". The data for the metric sets is generally found in
/proc/fs/lustre/llite/\*/stats.

This plugin currently employs zero configuration. The producer name is
set to the hostname by default, and the metric set instance names are
derived from the llite instance name. Any user-supplied configuration
values not documented here will be ignored.

This plugin should work with at least Lustre versions 2.8, 2.10, and
2.12.

CONFIGURATION ATTRIBUTE SYNTAX
===============================================

**config**
   | name=<plugin_name> [job_set=<set name>] [producer=<name>]
     [component_id=<u64>]
   | configuration line

   name=
      |
      | This MUST be lustre_client.

   job_set=
      |
      | The name of the metric set that contains the job id information
        (default=job_id)

   producer=
      |
      | The default used for producer (if not provided) is the result of
        gethostname(). The set instance names will be
        $producer/$llite_name.

   component_id=
      |
      | Optional (defaults to 0) number of the host where the sampler is
        running. All sets on a host will have the same value.

   perm=
      |
      | Set the access permissions for the metric sets. (default 440).

NOTES
======================

Improperly spelled option names are not trapped as configuration errors,
so double-check attribute names against the list above. A configuration
sketch using the optional attributes follows.
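As a sketch, a configuration exercising the documented optional
attributes might look like this (all values here are illustrative
placeholders):

::

   load name=lustre_client
   config name=lustre_client producer=nid00010 component_id=10 job_set=job_info perm=440
   start name=lustre_client interval=1000000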
EXAMPLES
=========================

Within ldmsd_controller or a configuration file:

::

   load name=lustre_client
   config name=lustre_client
   start name=lustre_client interval=1000000

SEE ALSO
=========================

ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7),
gethostname(2)

diff --git a/rtd/docs/source/sampler_man/Plugin_lustre_mdc.rst b/rtd/docs/source/sampler_man/Plugin_lustre_mdc.rst
new file mode 100644
index 000000000..7a783a443
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_lustre_mdc.rst
@@ -0,0 +1,160 @@
=================
Plugin_lustre_mdc
=================

:Date: 1 May 2019

.. contents::
   :depth: 3
..

NAME
==================

Plugin_lustre_mdc - man page for the LDMS lustre_mdc plugin

SYNOPSIS
======================

| Within ldmsd_controller or a configuration file:
| config name=lustre_mdc

DESCRIPTION
=========================

With LDMS (Lightweight Distributed Metric Service), plugins for the
ldmsd (ldms daemon) are configured via ldmsd_controller or a
configuration file.

The lustre_mdc plugin provides schema lustre_mdc for daemons with read
access to the Lustre files in /proc/fs/lustre/mdc/\*/md_stats and
/sys/kernel/debug/lustre/mdc/\*/stats. The metric sets will have
instance names combining the producer name and the mdc name.

This plugin will work with Lustre version 2.12 and others which share
these file locations and formats.

CONFIGURATION ATTRIBUTE SYNTAX
============================================

**config**
   | name=<plugin_name> [producer=<name>] [component_id=<u64>]
   | configuration line

   name=
      |
      | This MUST be lustre_mdc.

   producer=
      |
      | The default used for producer (if not provided) is the result of
        gethostname(). The set instance names will be
        <producer>/<mdc_name>.

   component_id=
      |
      | Optional (defaults to 0) number of the host where the sampler is
        running. All sets on a host will have the same value.

   job_set=
      |
      | Optional (defaults to "job_info"). Typically should be set to
        <producer>/jobid or <producer>/job_info depending on the choice
        of job sampling plugin.

   mdc_timing=0
      |
      | Optionally exclude timing data from
        /sys/kernel/debug/lustre/mdc/\*/stats. If given, the sampler may
        be run by unprivileged users. If /sys/kernel/debug/ cannot be
        opened by the user, it is a configuration error unless
        mdc_timing=0 is given.

   auto_reset=0
      |
      | Turn off the default behavior of resetting the counters when an
        overflow condition is detected. Reset is implemented by writing
        0 to the corresponding /proc or /sys file.

SCHEMA
====================

The default schema name is lustre_mdc_ops_timing with all the data
described in DATA REPORTED below included. If mdc_timing=0 is given,
only the operation counts from md_stats are reported and the default
schema name changes to lustre_mdc_ops.

DATA REPORTED
===========================

| fs_name: The Lustre file system name, e.g. xscratch.
| mdc: The mdc target that goes with the metrics, e.g. xscratch-MDT0000.
| last_reset: The time of the last reset performed by this sampler for
  any of its metric sets.

Operation counts from /proc/fs/lustre/mdc/\*/md_stats.
See also kernel source lustre/lustre/obdclass/lprocfs_status.c and
lustre/lustre/include/obd_class.h: mps_stats[]: "close", "create",
"enqueue", "getattr", "intent_lock", "link", "rename", "setattr",
"fsync", "read_page", "unlink", "setxattr", "getxattr",
"intent_getattr_async", "revalidate_lock".

Client operation timing statistics (all but .count are in microseconds)
for the following list of fields in
/sys/kernel/debug/lustre/mdc/\*/stats: "req_waittime", "mds_getattr",
"mds_getattr_lock", "mds_close", "mds_readpage", "mds_connect",
"mds_get_root", "mds_statfs", "ldlm_cancel", "obd_ping", "seq_query",
"fld_query"

and statistics:

| "\__count": the number of events observed
| "\__min": the minimum event duration observed
| "\__max": the maximum duration observed
| "\__sum": the sum of all durations observed
| "\__sumsqs": the sum of squares of all durations observed

NOTES
===================

The counters and file locations supported by this plugin are those
present in Lustre 2.12. The fields labeled [reqs] are omitted. Data
names not listed here are simply ignored.

The minimum sample interval recommended for this sampler is 5-10
seconds, as the data volume may be substantial and resolving shorter
bursts of metadata activity is generally unnecessary.

The average and sample standard deviation can be computed from sum and
sumsqs, but once these counters roll over to negative values on a high
up-time client, they may be less useful. The counters can be manually
reset with bash (note the redirection; each stats file is cleared by
writing 0 to it):

::

   for i in /proc/fs/lustre/mdc/*/md_stats /sys/kernel/debug/lustre/mdc/*/stats; do
       echo 0 > $i;
   done

The lustre utility equivalent of this plugin is to inspect the output of
``lctl get_param -R mdc.*.stats`` and ``lctl get_param -R mdc.*.md_stats``.

Specifying instance=xxx as an option will be ignored.

BUGS
==================

No known bugs.

EXAMPLES
======================

Within ldmsd_controller or a configuration file:

::

   load name=lustre_mdc
   config name=lustre_mdc
   start name=lustre_mdc interval=1000000

SEE ALSO
======================

ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7),
lctl(8).

diff --git a/rtd/docs/source/sampler_man/Plugin_lustre_mdt.rst b/rtd/docs/source/sampler_man/Plugin_lustre_mdt.rst
new file mode 100644
index 000000000..6c1355d51
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_lustre_mdt.rst
@@ -0,0 +1,91 @@
=================
Plugin_lustre_mdt
=================

:Date: 1 May 2019

.. contents::
   :depth: 3
..

NAME
==================

Plugin_lustre_mdt - man page for the LDMS lustre_mdt plugin

SYNOPSIS
======================

| Within ldmsd_controller or a configuration file:
| config name=lustre_mdt

DESCRIPTION
=========================

With LDMS (Lightweight Distributed Metric Service), plugins for the
ldmsd (ldms daemon) are configured via ldmsd_controller or a
configuration file.

The lustre_mdt plugin provides metric sets for two different schemas:
lustre_mdt and lustre_mdt_job_stats.

The metric sets using schema lustre_mdt will have a producer name set to
the hostname, and the instance name set to the mdt name. The data for
these metric sets comes from a combination of the data in
/proc/fs/lustre/mdt/\*/stats and a few other single-value files in
/proc/fs/lustre/mdt/\*/.
The metric sets using schema lustre_mdt_job_stats will have a producer
name set to the hostname, and the instance name will be set to a
combination of the mdt name and the job_id string. The data for these
metric sets comes from /proc/fs/lustre/mdt/\*/job_stats.

This plugin currently employs zero configuration. Any user-supplied
configuration values will be ignored. Future versions may add
configuration options.

This plugin should work with at least Lustre versions 2.8, 2.10, and
2.12.

CONFIGURATION ATTRIBUTE SYNTAX
============================================

**config**
   | name=<plugin_name> [producer=<name>] [component_id=<u64>]
   | configuration line

   name=
      |
      | This MUST be lustre_mdt.

   producer=
      |
      | The default used for producer (if not provided) is the result of
        gethostname(). The set instance names will be
        $producer/$mdt_name.

   component_id=
      |
      | Optional (defaults to 0) number of the host where the sampler is
        running. All sets on a host will have the same value.

BUGS
==================

No known bugs.

EXAMPLES
======================

Within ldmsd_controller or a configuration file:

::

   load name=lustre_mdt
   config name=lustre_mdt
   start name=lustre_mdt interval=1000000

SEE ALSO
======================

ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)

diff --git a/rtd/docs/source/sampler_man/Plugin_lustre_ost.rst b/rtd/docs/source/sampler_man/Plugin_lustre_ost.rst
new file mode 100644
index 000000000..3cd09b493
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_lustre_ost.rst
@@ -0,0 +1,91 @@
=================
Plugin_lustre_ost
=================

:Date: 1 May 2019

.. contents::
   :depth: 3
..

NAME
==================

Plugin_lustre_ost - man page for the LDMS lustre_ost plugin

SYNOPSIS
======================

| Within ldmsd_controller or a configuration file:
| config name=lustre_ost

DESCRIPTION
=========================

With LDMS (Lightweight Distributed Metric Service), plugins for the
ldmsd (ldms daemon) are configured via ldmsd_controller or a
configuration file.

The lustre_ost plugin provides metric sets for two different schemas:
lustre_ost and lustre_ost_job_stats.

The metric sets using schema lustre_ost will have a producer name set to
the hostname, and the instance name set to the ost name. The data for
these metric sets comes from a combination of the data in
/proc/fs/lustre/ost/\*/stats and a few other single-value files in
/proc/fs/lustre/ost/\*/.

The metric sets using schema lustre_ost_job_stats will have a producer
name set to the hostname, and the instance name will be set to a
combination of the ost name and the job_id string. The data for these
metric sets comes from /proc/fs/lustre/ost/\*/job_stats.

This plugin currently employs zero configuration. Any user-supplied
configuration values will be ignored. Future versions may add
configuration options.

This plugin should work with at least Lustre versions 2.8, 2.10, and
2.12.

CONFIGURATION ATTRIBUTE SYNTAX
============================================

**config**
   | name=<plugin_name> [producer=<name>] [component_id=<u64>]
   | configuration line

   name=
      |
      | This MUST be lustre_ost.

   producer=
      |
      | The default used for producer (if not provided) is the result of
        gethostname(). The set instance names will be
        $producer/$ost_name.

   component_id=
      |
      | Optional (defaults to 0) number of the host where the sampler is
        running. All sets on a host will have the same value.
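As an illustration, the optional attributes can also be supplied
explicitly (a sketch; the producer and component_id values are
placeholders):

::

   load name=lustre_ost
   config name=lustre_ost producer=nid00010 component_id=10
   start name=lustre_ost interval=1000000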
BUGS
==================

No known bugs.

EXAMPLES
======================

Within ldmsd_controller or a configuration file:

::

   load name=lustre_ost
   config name=lustre_ost
   start name=lustre_ost interval=1000000

SEE ALSO
======================

ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)

diff --git a/rtd/docs/source/sampler_man/Plugin_meminfo.rst b/rtd/docs/source/sampler_man/Plugin_meminfo.rst
new file mode 100644
index 000000000..1cf06fd6d
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_meminfo.rst
@@ -0,0 +1,71 @@
==============
Plugin_meminfo
==============

:Date: 04 Feb 2018

.. contents::
   :depth: 3
..

NAME
===============

Plugin_meminfo - man page for the LDMS meminfo plugin

SYNOPSIS
===================

| Within ldmsd_controller or a configuration file:
| config name=meminfo [ <attr>=<value> ]

DESCRIPTION
======================

With LDMS (Lightweight Distributed Metric Service), plugins for the
ldmsd (ldms daemon) are configured via ldmsd_controller or a
configuration file. The meminfo plugin provides memory info from
/proc/meminfo.

CONFIGURATION ATTRIBUTE SYNTAX
=========================================

The meminfo plugin uses the sampler_base base class. This man page
covers only the configuration attributes, or those with default values,
specific to this plugin; see ldms_sampler_base.man for the attributes of
the base class.

**config**
   | name=<plugin_name> [schema=<schema>]
   | configuration line

   name=
      |
      | This MUST be meminfo.

   schema=
      |
      | Optional schema name. It is intended that the same sampler on
        different nodes with different metrics have a different schema.
        If not specified, will default to \`meminfo\`.

BUGS
===============

No known bugs.

EXAMPLES
===================

Within ldmsd_controller or a configuration file:

::

   load name=meminfo
   config name=meminfo producer=vm1_1 instance=vm1_1/meminfo
   start name=meminfo interval=1000000

SEE ALSO
===================

ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)

diff --git a/rtd/docs/source/sampler_man/Plugin_msr_interlagos.rst b/rtd/docs/source/sampler_man/Plugin_msr_interlagos.rst
new file mode 100644
index 000000000..d596b0171
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_msr_interlagos.rst
@@ -0,0 +1,425 @@
=====================
Plugin_msr_interlagos
=====================

:Date: 04 Jan 2018

.. contents::
   :depth: 3
..

NAME
======================

Plugin_msr_interlagos - man page for the LDMS msr_interlagos plugin

SYNOPSIS
==========================

| Within an ldmsd_controller script or a configuration script:
| load name=msr_interlagos
| config name=msr_interlagos action=<action> [ <attr>=<value> ]
| config name=msr_interlagos action=add [ <attr>=<value> ]

DESCRIPTION
=============================

With LDMS (Lightweight Distributed Metric Service), plugins for the
ldmsd (ldms daemon) are configured via ldmsd_controller or by use of a
configuration file provided as an argument to the "-c" flag when
starting ldmsd. In the case of the configuration file the commands are
the same as those used via the ldmsd_controller interface. The
msr_interlagos plugin provides MSR counter information for the AMD
Family 15h Models 00h-0Fh Processors (Interlagos) only.

This is a developmental version of the sampler. It may change at any
time.

The sampler will allow you to select from an identified set of counters.
These are only correctly defined for the AMD Interlagos processor.
The counter addresses, what events are potentially being counted, the
event names, the counter types (core, uncore), etc. are defined in a
configuration file. An example of this file can be found at:
/util/configs/msr/interlagos/bw_msr_configs.

The actual counters desired can be defined or modified at run time using
the defined event names, subject to the constraint that each counter can
only count a single event name at a time. If a second name mapping into
an already selected counter is selected, the selection will fail. The
event names must be identified via the action=add directive for each
desired event name. When all desired event names have been added, the
directive action=finalize is used to instantiate the event name to
counter mappings.

The metric names are reported as generic names in the output set since
their actual identities may be changed dynamically. For any given
counter the first value (e.g., CtrN_wctl) is the uint64 representation
of the counter configuration used in the counter setup. The subsequent
values (e.g., CtrN_n or CtrN_c) are the values read from the counters (1
per numa node or num core values, with optional additional zero values
if maxcore is specified; see more below).

To build the msr_interlagos sampler, build with the following flag:
**--enable-msr_interlagos**

The ldmsd_controller interface includes functions for manipulating the
sampling state and counter identities as described below.

EXTERNAL MODIFICATION OF COUNTERS AND EXTERNAL INTERACTIONS
=============================================================================

Note that a user, with appropriate privilege, can change the identity of
the event being collected via an external methodology such as wrmsr.
Because of this, the msr_interlagos plugin first rechecks the event
identity of each counter before sampling; however, this is not atomic,
so there is a slight possibility of a race condition where the user may
change the counter between the check and the read. If the check fails,
zero values are reported for all metrics for that particular counter,
including the control register(s), and the metric name is a zero length
string. This continues until the identity is reset, either by external
methods or by issuing the action=rewrite directive.

If a user job changes the counters, it is intended that interaction with
the Resource Manager can invoke the rewrite command for the counters
once the user job has exited. A script is supplied that can be called
from epilog to perform this event rewrite. The script blocks on the
rewrite in order to avoid a race condition with the next job setting the
counters before the rewrite is completed. There is a maximum time limit
on the blocking call in the script. The script return code indicates
success or failure. Note that options that require the LDMS daemon to
check for a flag set by the scheduler are subject to race conditions.

COUNTER CONFIGURATION FILE
============================================

**!!!WARNING!!!** This plugin only works for Interlagos. Using this
sampler on other architectures, or misconfiguration of the configuration
file, may result in unforeseen results with possible damage to the
system, as the control register addresses will not map into the same
functionality. **!!!WARNING!!!**

Fields in the MSR sampler configuration file are: Name, Write_addr,
Event, Umask, Read_addr, os_user, core_ena, core_sel, special_flag,
ctr_type.
Please use or modify the example configuration file provided in
/util/configs/msr/interlagos/bw_msr_configs.

Valid options for special_flag are MSR_DEFAULT and UNCORE_PER_NUMA.
MSR_DEFAULT indicates that the associated register will collect the same
event across all entities (core or numa domain). UNCORE_PER_NUMA is only
valid for uncore counters for which the unit mask can be used to specify
for which target numa domain events are being counted. A unit mask of
"0x0" indicates events will be counted for only the numa domain in which
the counter resides. A unit mask of "0xF" indicates events will be
counted for only numa domains in which the counter does not reside. This
enables understanding cache affinity and the level of IO crossing numa
boundaries. Valid options for ctr_type are CTR_NUMCORE and CTR_UNCORE.
These distinguish core and uncore counters.

Lines starting with a # mark are comments and are skipped.

::

   ##### Core counters ##########
   TLB_DM, 0xc0010200, 0x046, 0x07, 0xc0010201, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   TOT_CYC, 0xc0010202, 0x076, 0x00, 0xc0010203, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   L2_DCM, 0xc0010202, 0x043, 0x00, 0xc0010203, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   L1_DCM, 0xc0010204, 0x041, 0x01, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   L1_DCA, 0xc0010204, 0x040, 0x00, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   #LS_DISP, 0xc0010204, 0x029, 0x01, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   #LS_DISP, 0xc0010204, 0x029, 0x02, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   #LS_DISP, 0xc0010204, 0x029, 0x04, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   LS_DISP, 0xc0010204, 0x029, 0x07, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   RETIRED_FLOPS, 0xc0010206, 0x003, 0xFF, 0xc0010207, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   DP_OPS, 0xc0010206, 0x003, 0xF0, 0xc0010207, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   VEC_INS, 0xc0010208, 0x0CB, 0x04, 0xc0010209, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   TOT_INS, 0xc001020A, 0x0C0, 0x00, 0xc001020B, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   ##### Uncore counters ##########
   L3_CACHE_MISSES, 0xc0010240, 0x4E1, 0xF7, 0xc0010241, 0x0, 0x1, 0x0, MSR_DEFAULT, CTR_UNCORE
   RW_DRAM_EXT, 0xc0010242, 0x1E0, 0xF, 0xc0010243, 0x0, 0x1, 0x0, UNCORE_PER_NUMA, CTR_UNCORE
   IO_DRAM_INT, 0xc0010242, 0x1E1, 0x0, 0xc0010243, 0x0, 0x1, 0x0, UNCORE_PER_NUMA, CTR_UNCORE
   DCT_PREFETCH, 0xc0010242, 0x1F0, 0x64, 0xc0010243, 0x0, 0x1, 0x0, MSR_DEFAULT, CTR_UNCORE
   DCT_RD_TOT, 0xc0010244, 0x1F0, 0x62, 0xc0010245, 0x0, 0x1, 0x0, MSR_DEFAULT, CTR_UNCORE
   RW_DRAM_INT, 0xc0010246, 0x1E0, 0x0, 0xc0010247, 0x0, 0x1, 0x0, UNCORE_PER_NUMA, CTR_UNCORE
   IO_DRAM_EXT, 0xc0010246, 0x1E1, 0xF, 0xc0010247, 0x0, 0x1, 0x0, UNCORE_PER_NUMA, CTR_UNCORE
   DCT_WRT, 0xc0010246, 0x1F0, 0x19, 0xc0010247, 0x0, 0x1, 0x0, MSR_DEFAULT, CTR_UNCORE
   #
   # Note that for the following, CTR_NUMCORE pairs are:
   # [0] Control: 0xc0010200 Data: 0xc0010201
   # [1] Control: 0xc0010202 Data: 0xc0010203
   # [2] Control: 0xc0010204 Data: 0xc0010205
   # [3] Control: 0xc0010206 Data: 0xc0010207
   # [4] Control: 0xc0010208 Data: 0xc0010209
   # [5] Control: 0xc001020A Data: 0xc001020B
   #
   # And CTR_UNCORE pairs are:
   # [0] Control: 0xc0010240 Data: 0xc0010241
   # [1] Control: 0xc0010242 Data: 0xc0010243
   # [2] Control: 0xc0010244 Data: 0xc0010245
   # [3] Control: 0xc0010246 Data: 0xc0010247
   #
   # The first column below indicates the counters available for a particular
   # feature.
   # For example [2:0] indicates that the core counters (CTR_NUMCORE)
   # 0, 1, and 2, as indicated above, are available to count TLB_DM.
   #
   # NOTE: For the UNCORE_PER_NUMA case, use 0x0 to exclude external numa access
   # and 0xF to exclude local numa access and only count external access.
   ##### Core counters ##########
   #[2:0] TLB_DM, 0xc0010200, 0x046, 0x07, 0xc0010201, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   #[2:0] TOT_CYC, 0xc0010202, 0x076, 0x00, 0xc0010203, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   #[2:0] L2_DCM, 0xc0010202, 0x043, 0x00, 0xc0010203, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   #[5:0] L1_DCM, 0xc0010204, 0x041, 0x01, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   #[5:0] L1_DCA, 0xc0010204, 0x040, 0x00, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   #[5:0] LS_DISP, 0xc0010204, 0x029, 0x01, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   #[5:0] LS_DISP, 0xc0010204, 0x029, 0x02, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   #[5:0] LS_DISP, 0xc0010204, 0x029, 0x04, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   #[5:0] LS_DISP, 0xc0010204, 0x029, 0x07, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   #[3] RETIRED_FLOPS, 0xc0010206, 0x003, 0xFF, 0xc0010207, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   #[3] DP_OPS, 0xc0010206, 0x003, 0xF0, 0xc0010207, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   #[5:0] VEC_INS, 0xc0010208, 0x0CB, 0x04, 0xc0010209, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   #[5:0] TOT_INS, 0xc001020A, 0x0C0, 0x00, 0xc001020B, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
   ##### Uncore counters ##########
   #[3:0] L3_CACHE_MISSES, 0xc0010240, 0x4E1, 0xF7, 0xc0010241, 0x0, 0x1, 0x0, MSR_DEFAULT, CTR_UNCORE
   #[3:0] RW_DRAM_EXT, 0xc0010242, 0x1E0, 0xF, 0xc0010243, 0x0, 0x1, 0x0, UNCORE_PER_NUMA, CTR_UNCORE
   #[3:0] IO_DRAM_INT, 0xc0010242, 0x1E1, 0x0, 0xc0010243, 0x0, 0x1, 0x0, UNCORE_PER_NUMA, CTR_UNCORE
   #[3:0] DCT_PREFETCH, 0xc0010242, 0x1F0, 0x64, 0xc0010243, 0x0, 0x1, 0x0, MSR_DEFAULT, CTR_UNCORE
   #[3:0] DCT_RD_TOT, 0xc0010244, 0x1F0, 0x62, 0xc0010245, 0x0, 0x1, 0x0, MSR_DEFAULT, CTR_UNCORE
   #[3:0] RW_DRAM_INT, 0xc0010246, 0x1E0, 0x0, 0xc0010247, 0x0, 0x1, 0x0, UNCORE_PER_NUMA, CTR_UNCORE
   #[3:0] IO_DRAM_EXT, 0xc0010246, 0x1E1, 0xF, 0xc0010247, 0x0, 0x1, 0x0, UNCORE_PER_NUMA, CTR_UNCORE
   #[3:0] DCT_WRT, 0xc0010246, 0x1F0, 0x19, 0xc0010247, 0x0, 0x1, 0x0, MSR_DEFAULT, CTR_UNCORE

OUTPUT FORMAT
===============================

Example output format from the "ldms_ls" command is shown below. Since
the counters can be added in any order and be changed dynamically, the
names are generic (e.g., Ctr0_n) with CtrN_name being the string version
of the name and CtrN_wctl being the write control register (event code
and unit mask for the msr variable assigned to that counter).

This is followed by a vector of the values. If there is only 1 value in
the vector, then the name is CtrN. If there is a value per numa domain,
then the name is CtrN_n. If there is a value per core, then the name is
CtrN_c.

If the write control register is the same for all values in the vector,
it is only written once and called CtrN_wctl. If the write control
register is different for the values in the vector, as it would be for
the per numa domain values, then the write control register variable is
a vector of length > 1 and is named CtrN_wctl_n.
Zeros in the
CtrN_wctl_n indicate that the "maxcore" value specified in the
configuration of the sampler was greater than the actual number of
cores; the corresponding wctl and data values will then be 0.

Example output is below:

::

   nid00010/msr_interlagos: consistent, last update: Sun Oct 30 16:34:16 2016 [4398us]
   M u64 component_id 10
   D u64 job_id 0
   D char[] Ctr0_name "L3_CACHE_MISSES"
   D u64[] Ctr0_wctl 85903603681
   D u64[] Ctr0_n 8761095,660101,0,0
   D char[] Ctr1_name "DCT_RD_TOT"
   D u64[] Ctr1_wctl 73018663664
   D u64[] Ctr1_n 16748451,1103973,0,0
   D char[] Ctr2_name "RW_DRAM_EXT"
   D u64[] Ctr2_wctl_n 73018642144,73018641888,0,0
   D u64[] Ctr2_n 4901448,7120727,0,0
   D char[] Ctr3_name "RW_DRAM_INT"
   D u64[] Ctr3_wctl_n 73018638816,73018639072,0,0
   D u64[] Ctr3_n 74099900,3773483,0,0
   D char[] Ctr4_name "TOT_CYC"
   D u64[] Ctr4_wctl 4391030
   D u64[] Ctr4_c 775759456,2595008788,234822206,155962379,51951208,53210798,82771568,52716295,85501768,50656894,175839012,619930959,179902397,110558187,334344071,353769784,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
   D char[] Ctr5_name "TOT_INS"
   D u64[] Ctr5_wctl 4391104
   D u64[] Ctr5_c 211085929,410194651,45686350,11096207,4489395,4565853,13261794,3626609,15062986,3753527,3802413,194511990,55444449,7321398,39989531,36190191,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
   D char[] Ctr6_name "L1_DCM"
   D u64[] Ctr6_wctl 4391233
   D u64[] Ctr6_c 5101215,22654419,1078523,247674,101807,99840,403194,75661,403958,81801,106359,2316889,663984,186842,944343,921712,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
   D char[] Ctr7_name "RETIRED_FLOPS"
   D u64[] Ctr7_wctl 4456195
   D u64[] Ctr7_c 122,197,408,57,3,0,2,0,0,0,2,131,272,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
   D char[] Ctr8_name "VEC_INS"
   D u64[] Ctr8_wctl 4392139
   D u64[] Ctr8_c 13185,32428971,9960,8153,65,0,6517,0,2863,0,280,497910,88393,624,59806,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
   D char[] Ctr9_name "TLB_DM"
   D u64[] Ctr9_wctl 4392774
   D u64[] Ctr9_c 1312,131553,1080,698,154,2,546,3,266,59,125,678,901,196,6254,155,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0

LDMSD_CONTROLLER CONFIGURATION COMMANDS ORDER
===============================================================

Configuration commands are intended to be issued in the following order:

- load

- config action=initialize

- config action=add (one or more)

- config action=finalize (one or more)

- start

The following config commands can be issued anytime after the start, in
any order:

- config action=halt

- config action=continue

- config action=reassign

- config action=rewrite

LDMSD_CONTROLLER CONFIGURATION ATTRIBUTE SYNTAX
=================================================================

The msr_interlagos plugin uses the sampler_base base class. This man
page covers only the configuration attributes, or those with default
values, specific to this plugin; see ldms_sampler_base.man for the
attributes of the base class.

**config**
   | name=<plugin_name> action=<action> [ <attr>=<value> ... ]
   | configuration line

   name=
      |
      | This MUST be msr_interlagos.

   action=
      |
      | Options are initialize, add, finalize, halt, continue, reassign,
        rewrite, and ls:

   **initialize**
      | corespernuma=<cores per numa node> conffile=<path>
        [maxcore=<max> schema=<schema>]
      | initialize the plugin. sampler_base configuration arguments
        should be specified at this point.

      corespernuma=
         |
         | Cores per numa node. Used to determine which and how many
           cores are used in setting counters that report per numa node.
      maxcore=
         |
         | Maximum number of cores that will be reported for all core
           counters; also used in counters that report per numa node.
           Must be >= the actual number of cores. Any additional values
           will be reported with 0 values. Optional. Defaults to using
           the actual number of cores.

      schema=
         |
         | Schema name. Optional. Defaults to msr_interlagos.

   **add**
      | metricname=<name>
      | Adds a counter metric to the set. The metric set will be built
        in the order the metrics are added.

      metricname=
         |
         | The name of the counter, e.g., L3_CACHE_MISSES. Options are
           listed in a separate section of this man page.

   **finalize**
      |
      | Creates the set after all the adds. No metrics may be added
        after this point.

   **halt**
      | metricname=<name>
      | Halts collection for this counter. Zero values will be returned
        for all metrics for this counter.

      metricname=
         |
         | The name of the counter, e.g., L3_CACHE_MISSES.
           metricname=all halts all.

   **continue**
      | metricname=<name>
      | Continues collection for this counter after a halt.

      metricname=
         |
         | The name of the counter, e.g., L3_CACHE_MISSES.
           metricname=all continues all.

   **rewrite**
      | metricname=<name>
      | Rewrites the counter variable. Used in case the counter variable
        has been changed for this address externally to LDMS.

      metricname=
         |
         | The name of the counter, e.g., L3_CACHE_MISSES.
           metricname=all rewrites all counters.

   **reassign**
      | oldmetricname=<name> newmetricname=<name>
      | Replaces a metric in the metric set with a new one. It must be
        the same size (e.g., numcores vs single value) as the previous
        counter.

      oldmetricname=
         |
         | The name of the counter to be replaced, e.g., TOT_CYC.

      newmetricname=
         |
         | The name of the counter that the previous variable will be
           replaced with, e.g., TOT_INS.

   **ls**
      |
      | Writes info about the intended counters to the log file.

BUGS
======================

The sampler is not robust to errors in the configuration file (i.e.,
there is no error checking with respect to registers being written to or
the contents being written). An error could result in unexpected
operation, including damage to the host.

NOTES
=======================

- This is a developmental version of the sampler. It may change at any
  time.

- The format of the configuration file and the fields has changed since
  the v2 release.

- This plugin only works for Interlagos. Using this sampler on other
  architectures may result in badness as the addresses will not be
  correct.

EXAMPLES
==========================

Within ldmsd_controller or a configuration file:

| config name=msr_interlagos action=initialize producer=nid00010
  instance=nid00010 component_id=10 corespernuma=8
  conffile=/XXX/msr_conf.txt
| config name=msr_interlagos action=add metricname=L3_CACHE_MISSES
| config name=msr_interlagos action=add metricname=TOT_CYC
| config name=msr_interlagos action=finalize
| config name=msr_interlagos action=reassign oldmetricname=TOT_CYC
  newmetricname=TOT_INS
| config name=msr_interlagos action=halt metricname=TOT_CYC

SEE ALSO
==========================

ldmsd(8), ldms_quickstart(7), ldms_sampler_base(7),
Plugin_store_function_csv(7), ldmsd_controller(8)

diff --git a/rtd/docs/source/sampler_man/Plugin_opa2.rst b/rtd/docs/source/sampler_man/Plugin_opa2.rst
new file mode 100644
index 000000000..f8139f351
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_opa2.rst
@@ -0,0 +1,90 @@
===========
Plugin_opa2
===========

:Date: 5 Feb 2018

.. contents::
   :depth: 3
..
NAME
============

Plugin_opa2 - man page for the LDMS opa2 OmniPath network plugin

SYNOPSIS
================

| Within ldmsd_controller or a configuration file:
| load name=opa2
| config name=opa2 [ <attr>=<value> ]

DESCRIPTION
===================

The opa2 plugin provides local port counters from OmniPath hardware. A
separate data set is created for each port. All sets use the same
schema.

CONFIGURATION ATTRIBUTE SYNTAX
======================================

**config**
   | name=<plugin_name> producer=<name> instance=<name>
     [schema=<schema>] [component_id=<u64>] [ports=<list>]
   | configuration line

   name=
      |
      | This MUST be opa2.

   producer=
      |
      | The producer string value.

   instance=
      |
      | The set_name supplied is ignored, and the name
        $producer/$CA/$port is used.

   schema=
      |
      | Optional schema name. Default opa2. The same schema is used for
        all sets.

   component_id=
      |
      | Optional component identifier. Defaults to zero.

   ports=
      |
      | Port list is a comma-separated list of ca_name.portnum or a
        '\*'. The default is '\*', which collects a set for every host
        fabric interface port.

BUGS
============

None known.

EXAMPLES
================

Within ldmsd_controller or a configuration file:

::

   load name=opa2
   config name=opa2 producer=compute1 instance=compute1/opa2 component_id=1
   start name=opa2 interval=1000000

NOTES
=============

This sampler will be expanded in the future to capture additional
metrics.

SEE ALSO
================

ldmsd(8), ldms_quickstart(7), ldmsd_controller(8)

diff --git a/rtd/docs/source/sampler_man/Plugin_papi_sampler.rst b/rtd/docs/source/sampler_man/Plugin_papi_sampler.rst
new file mode 100644
index 000000000..40ed5ec72
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_papi_sampler.rst
@@ -0,0 +1,129 @@
===================
Plugin_papi_sampler
===================

:Date: 30 Sep 2019

.. contents::
   :depth: 3
..

NAME
====================

Plugin_papi_sampler - man page for the LDMSD papi_sampler plugin

SYNOPSIS
========================

Within ldmsd_controller or a configuration file: **config**
**name=papi_sampler** **producer=**\ *PRODUCER*
**instance=**\ *INSTANCE* [ **component_id=**\ *COMP_ID* ] [
**stream=**\ *STREAM* ] [ **job_expiry=**\ *EXPIRY_SEC* ]

DESCRIPTION
===========================

**papi_sampler** monitors PAPI events of processes of Slurm jobs.

The job script must define the **SUBSCRIBER_DATA** environment variable
as a JSON object that has at least a **"papi_sampler"** attribute as
follows:

   ::

      SUBSCRIBER_DATA='{"papi_sampler":{"file":"/PATH/TO/PAPI.JSON"}}'

where the **"file"** attribute inside **"papi_sampler"** points to a
JSON-formatted text file containing a user-defined schema name and the
PAPI events of interest, e.g.

   ::

      {
        "schema": "my_papi",
        "events": [
          "PAPI_TOT_INS",
          "PAPI_L1_DCM"
        ]
      }

**papi_sampler** relies on the **slurm_notifier** SPANK plugin to notify
it about the starting/stopping of jobs on the node over ldmsd_stream.
Please consult **Plugin_slurm_notifier(7)** for more information on how
to deploy and configure it. The value of SUBSCRIBER_DATA from the job
script is carried over to **papi_sampler** when the job starts, and an
LDMS set will be created according to the PAPI JSON file pointed to by
the SUBSCRIBER_DATA. In the multi-tenant case (multiple jobs running on
a node), each job has its own set. The set is deleted *job_expiry*
seconds after the job exits.
CONFIG OPTIONS
==============================

**name=papi_sampler**
   This MUST be papi_sampler (the name of the plugin).

**producer=**\ *PRODUCER*
   The name of the data producer (e.g. hostname).

**instance=**\ *INSTANCE*
   This is mandatory because **papi_sampler** extends **sampler_base**
   and the option is required by the **sampler_base** config. However,
   the value is ignored and can be anything. The actual name of the
   **papi_sampler** instance is *PRODUCER*/*SCHEMA*/*JOB_ID*.

**component_id=**\ *COMPONENT_ID*
   An integer identifying the component (default: *0*).

**stream=**\ *STREAM*
   The name of the stream that the **slurm_notifier** SPANK plugin uses
   to notify about job events. This attribute is optional with the
   default being *slurm*.

**job_expiry=**\ *EXPIRY_SEC*
   The number of seconds to retain the set after the job has exited. The
   default value is *60*.

BUGS
====================

No known bugs.

EXAMPLES
========================

Plugin configuration example:

   ::

      load name=papi_sampler
      config name=papi_sampler producer=node0 instance=NA component_id=2 job_expiry=10
      start name=papi_sampler interval=1000000 offset=0

Job script example:

   ::

      #!/bin/bash
      export SUBSCRIBER_DATA='{"papi_sampler":{"file":"/tmp/papi.json"}}'
      srun bash -c 'for X in {1..60}; do echo $X; sleep 1; done'

PAPI JSON example:

   ::

      {
        "schema": "my_papi",
        "events": [
          "PAPI_TOT_INS",
          "PAPI_L1_DCM"
        ]
      }

SEE ALSO
========================

**Plugin_slurm_notifier**\ (7), **Plugin_syspapi_sampler**\ (7),
**ldmsd**\ (8), **ldms_quickstart**\ (7), **ldmsd_controller**\ (8),
**ldms_sampler_base**\ (7).

diff --git a/rtd/docs/source/sampler_man/Plugin_perfevent.rst b/rtd/docs/source/sampler_man/Plugin_perfevent.rst
new file mode 100644
index 000000000..8670122c9
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_perfevent.rst
@@ -0,0 +1,355 @@
================
Plugin_perfevent
================

:Date: 18 Feb 2018

.. contents::
   :depth: 3
..

NAME
=================

Plugin_perfevent - man page for the LDMS perfevent sampler plugin.

SYNOPSIS
=====================

| Within ldmsctl
| ldmsctl> config name=perfevent [ <attr>=<value> ]

DESCRIPTION
========================

With LDMS (Lightweight Distributed Metric Service), plugins for the
ldmsd (ldms daemon) are configured via ldmsd_controller or a
configuration file. The perfevent sampler plugin runs on the nodes and
provides data about the occurrence of micro-architectural events, using
the Linux perf_event subsystem to access hardware performance counters.

ENVIRONMENT
========================

You will need to build LDMS with --enable-perfevent. The perf_event
subsystem has been available since Linux 2.6.31.

CONFIGURATION ATTRIBUTE SYNTAX
===========================================

The perfevent plugin uses the sampler_base base class. This man page
covers only the configuration attributes, or those with default values,
specific to this plugin. See ldms_sampler_base.man for the attributes of
the base class; those attributes are specified as part of the 'init'
action arguments.

**config**
   | name=<plugin_name> action=<action> [schema=<schema>]
   | configuration line

   name=
      |
      | This MUST be perfevent.

   schema=
      |
      | Optional schema name. It is intended that the same sampler on
        different nodes with different metrics have a different schema.

   action=init
      |
      | Perform initialization.

   action=del metricname=
      |
      | Deletes the specified event.
   action=ls
      |
      | List the currently configured events.

   action=add metricname= pid= cpu= type= id=
      |
      | Adds a metric to the list of configured events.
      | metricname
      | The metric name for the event
      | pid
      | The PID for the process being monitored. The counter will follow
        the process to whichever CPU/core is in use. Note that 'pid' and
        'cpu' are mutually exclusive.
      | cpu
      | Count this event on the specified CPU. This will accumulate
        events across all PIDs that land on the specified CPU/core. Note
        that 'pid' and 'cpu' are mutually exclusive.
      | type
      | The event type.
      | id
      | The event id.

   The pid and cpu arguments allow specifying which process and CPU to monitor:
      |
      | pid == 0 and cpu == -1
      | This measures the calling process/thread on any CPU.
      | pid == 0 and cpu >= 0
      | This measures the calling process/thread only when running on
        the specified CPU.
      | pid > 0 and cpu == -1
      | This measures the specified process/thread on any CPU.
      | pid > 0 and cpu >= 0
      | This measures the specified process/thread only when running on
        the specified CPU.
      | pid == -1 and cpu >= 0
      | This measures all processes/threads on the specified CPU. This
        requires CAP_SYS_ADMIN capability or a
        /proc/sys/kernel/perf_event_paranoid value of less than 1.
      | pid == -1 and cpu == -1
      | This setting is invalid and will return an error.

   For more information visit: http://man7.org/linux/man-pages/man2/perf_event_open.2.html

   **type**
      |
      | This field specifies the overall event type. It has one of the
        following values:
      | PERF_TYPE_HARDWARE
      | This indicates one of the "generalized" hardware events provided
        by the kernel. See the id field definition for more details.
      | PERF_TYPE_SOFTWARE
      | This indicates one of the software-defined events provided by
        the kernel (even if no hardware support is available).
      | PERF_TYPE_TRACEPOINT
      | This indicates a tracepoint provided by the kernel tracepoint
        infrastructure.
      | PERF_TYPE_HW_CACHE
      | This indicates a hardware cache event. This has a special
        encoding, described in the id field definition.
      | PERF_TYPE_RAW
      | This indicates a "raw" implementation-specific event in the id
        field.
      | PERF_TYPE_BREAKPOINT (since Linux 2.6.33)
      | This indicates a hardware breakpoint as provided by the CPU.
        Breakpoints can be read/write accesses to an address as well as
        execution of an instruction address.

   **id**
      |
      | This specifies which event you want, in conjunction with the
        type field.
      | There are various ways to set the id field that are dependent on
        the value of the previously described type field.
      | What follows are various possible settings for id, separated out
        by type.
      | If type is PERF_TYPE_HARDWARE, we are measuring one of the
        generalized hardware CPU events. Not all of these are available
        on all platforms. Set id to one of the following:
      | PERF_COUNT_HW_CPU_CYCLES
      | Total cycles. Be wary of what happens during CPU frequency
        scaling.
      | PERF_COUNT_HW_INSTRUCTIONS
      | Retired instructions. Be careful, these can be affected by
        various issues, most notably hardware interrupt counts.
      | PERF_COUNT_HW_CACHE_REFERENCES
      | Cache accesses. Usually this indicates Last Level Cache accesses
        but this may vary depending on your CPU. This may include
        prefetches and coherency messages; again this depends on the
        design of your CPU.
      | PERF_COUNT_HW_CACHE_MISSES
      | Cache misses.
        Usually this indicates Last Level Cache misses; this is intended
        to be used in conjunction with the PERF_COUNT_HW_CACHE_REFERENCES
        event to calculate cache miss rates.
      | PERF_COUNT_HW_BRANCH_INSTRUCTIONS
      | Retired branch instructions. Prior to Linux 2.6.35, this used
        the wrong event on AMD processors.
      | PERF_COUNT_HW_BRANCH_MISSES
      | Mispredicted branch instructions.
      | PERF_COUNT_HW_BUS_CYCLES
      | Bus cycles, which can be different from total cycles.
      | PERF_COUNT_HW_STALLED_CYCLES_FRONTEND (since Linux 3.0)
      | Stalled cycles during issue.
      | PERF_COUNT_HW_STALLED_CYCLES_BACKEND (since Linux 3.0)
      | Stalled cycles during retirement.

      | PERF_COUNT_HW_REF_CPU_CYCLES (since Linux 3.3)
      | Total cycles; not affected by CPU frequency scaling.
      | If type is PERF_TYPE_SOFTWARE, we are measuring software events
        provided by the kernel. Set id to one of the following:
      | PERF_COUNT_SW_CPU_CLOCK
      | This reports the CPU clock, a high-resolution per-CPU timer.
      | PERF_COUNT_SW_TASK_CLOCK
      | This reports a clock count specific to the task that is running.
      | PERF_COUNT_SW_PAGE_FAULTS
      | This reports the number of page faults.
      | PERF_COUNT_SW_CONTEXT_SWITCHES
      | This counts context switches. Until Linux 2.6.34, these were all
        reported as user-space events; after that they are reported as
        happening in the kernel.
      | PERF_COUNT_SW_CPU_MIGRATIONS
      | This reports the number of times the process has migrated to a
        new CPU.
      | PERF_COUNT_SW_PAGE_FAULTS_MIN
      | This counts the number of minor page faults. These did not
        require disk I/O to handle.
      | PERF_COUNT_SW_PAGE_FAULTS_MAJ
      | This counts the number of major page faults. These required disk
        I/O to handle.
      | PERF_COUNT_SW_ALIGNMENT_FAULTS (since Linux 2.6.33)
      | This counts the number of alignment faults. These happen when
        unaligned memory accesses happen; the kernel can handle these
        but it reduces performance. This happens only on some
        architectures (never on x86).
      | PERF_COUNT_SW_EMULATION_FAULTS (since Linux 2.6.33)
      | This counts the number of emulation faults. The kernel sometimes
        traps on unimplemented instructions and emulates them for user
        space. This can negatively impact performance.
      | PERF_COUNT_SW_DUMMY (since Linux 3.12)
      | This is a placeholder event that counts nothing. Informational
        sample record types such as mmap or comm must be associated with
        an active event. This dummy event allows gathering such records
        without requiring a counting event.
      | If type is PERF_TYPE_TRACEPOINT, then we are measuring kernel
        tracepoints. The value to use in id can be obtained from under
        debugfs tracing/events/\*/\*/id if ftrace is enabled in the
        kernel.
      | If type is PERF_TYPE_HW_CACHE, then we are measuring a hardware
        CPU cache event.
        To calculate the appropriate id value use the
        following equation:
      | (perf_hw_cache_id) \| (perf_hw_cache_op_id << 8) \|
        (perf_hw_cache_op_result_id << 16)
      | where perf_hw_cache_id is one of:
      | PERF_COUNT_HW_CACHE_L1D
      | for measuring Level 1 Data Cache
      | PERF_COUNT_HW_CACHE_L1I
      | for measuring Level 1 Instruction Cache
      | PERF_COUNT_HW_CACHE_LL
      | for measuring Last-Level Cache
      | PERF_COUNT_HW_CACHE_DTLB
      | for measuring the Data TLB
      | PERF_COUNT_HW_CACHE_ITLB
      | for measuring the Instruction TLB
      | PERF_COUNT_HW_CACHE_BPU
      | for measuring the branch prediction unit
      | PERF_COUNT_HW_CACHE_NODE (since Linux 3.1)
      | for measuring local memory accesses
      | and perf_hw_cache_op_id is one of:
      | PERF_COUNT_HW_CACHE_OP_READ
      | for read accesses
      | PERF_COUNT_HW_CACHE_OP_WRITE
      | for write accesses
      | PERF_COUNT_HW_CACHE_OP_PREFETCH
      | for prefetch accesses
      | and perf_hw_cache_op_result_id is one of:
      | PERF_COUNT_HW_CACHE_RESULT_ACCESS
      | to measure accesses
      | PERF_COUNT_HW_CACHE_RESULT_MISS
      | to measure misses
      | If type is PERF_TYPE_RAW, then a custom "raw" id value is
        needed. Most CPUs support events that are not covered by the
        "generalized" events. These are implementation defined; see your
        CPU manual (for example the Intel Volume 3B documentation or the
        AMD BIOS and Kernel Developer Guide). The libpfm4 library can be
        used to translate from the name in the architectural manuals to
        the raw hex value perf_event_open() expects in this field.

NOTES
==================

The official way of knowing if perf_event_open() support is enabled is
checking for the existence of the file
/proc/sys/kernel/perf_event_paranoid.

The enum values for type and id are specified in the kernel. Here are
the values in version 3.9 (retrieved from
http://lxr.cpsc.ucalgary.ca/lxr/linux+v3.9/include/uapi/linux/perf_event.h#L28):

::

   enum perf_type_id {
           PERF_TYPE_HARDWARE   = 0,
           PERF_TYPE_SOFTWARE   = 1,
           PERF_TYPE_TRACEPOINT = 2,
           PERF_TYPE_HW_CACHE   = 3,
           PERF_TYPE_RAW        = 4,
           PERF_TYPE_BREAKPOINT = 5,

           PERF_TYPE_MAX,  /* non-ABI */
   };

   enum perf_hw_id {
           /*
            * Common hardware events, generalized by the kernel:
            */
           PERF_COUNT_HW_CPU_CYCLES              = 0,
           PERF_COUNT_HW_INSTRUCTIONS            = 1,
           PERF_COUNT_HW_CACHE_REFERENCES        = 2,
           PERF_COUNT_HW_CACHE_MISSES            = 3,
           PERF_COUNT_HW_BRANCH_INSTRUCTIONS     = 4,
           PERF_COUNT_HW_BRANCH_MISSES           = 5,
           PERF_COUNT_HW_BUS_CYCLES              = 6,
           PERF_COUNT_HW_STALLED_CYCLES_FRONTEND = 7,
           PERF_COUNT_HW_STALLED_CYCLES_BACKEND  = 8,
           PERF_COUNT_HW_REF_CPU_CYCLES          = 9,

           PERF_COUNT_HW_MAX,  /* non-ABI */
   };

   /*
    * Generalized hardware cache events:
    *
    *       { L1-D, L1-I, LLC, ITLB, DTLB, BPU, NODE } x
    *       { read, write, prefetch } x
    *       { accesses, misses }
    */
   enum perf_hw_cache_id {
           PERF_COUNT_HW_CACHE_L1D  = 0,
           PERF_COUNT_HW_CACHE_L1I  = 1,
           PERF_COUNT_HW_CACHE_LL   = 2,
           PERF_COUNT_HW_CACHE_DTLB = 3,
           PERF_COUNT_HW_CACHE_ITLB = 4,
           PERF_COUNT_HW_CACHE_BPU  = 5,
           PERF_COUNT_HW_CACHE_NODE = 6,

           PERF_COUNT_HW_CACHE_MAX,  /* non-ABI */
   };

   enum perf_hw_cache_op_id {
           PERF_COUNT_HW_CACHE_OP_READ     = 0,
           PERF_COUNT_HW_CACHE_OP_WRITE    = 1,
           PERF_COUNT_HW_CACHE_OP_PREFETCH = 2,

           PERF_COUNT_HW_CACHE_OP_MAX,  /* non-ABI */
   };

   enum perf_hw_cache_op_result_id {
           PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0,
           PERF_COUNT_HW_CACHE_RESULT_MISS   = 1,

           PERF_COUNT_HW_CACHE_RESULT_MAX,  /* non-ABI */
   };

   /*
    * Special "software" events provided by the kernel, even if the
    * hardware does not support performance events. These events measure
    * various physical and sw events of the kernel (and allow the
    * profiling of them as well):
    */
   enum perf_sw_ids {
           PERF_COUNT_SW_CPU_CLOCK        = 0,
           PERF_COUNT_SW_TASK_CLOCK       = 1,
           PERF_COUNT_SW_PAGE_FAULTS      = 2,
           PERF_COUNT_SW_CONTEXT_SWITCHES = 3,
           PERF_COUNT_SW_CPU_MIGRATIONS   = 4,
           PERF_COUNT_SW_PAGE_FAULTS_MIN  = 5,
           PERF_COUNT_SW_PAGE_FAULTS_MAJ  = 6,
           PERF_COUNT_SW_ALIGNMENT_FAULTS = 7,
           PERF_COUNT_SW_EMULATION_FAULTS = 8,

           PERF_COUNT_SW_MAX,  /* non-ABI */
   };

BUGS
=================

No known bugs.

EXAMPLES
=====================

The following is a short example that measures 4 events.
   |
   | Total CPU cycles
   | Total CPU instructions
   | Total branch instructions
   | Mispredicted branch instructions

| If we set the value of PID=1234 and CPU_NUM is -1, this measures the
  process with pid=1234 on any CPU. If the CPU_NUM is 1, this measures
  the process with pid=1234 only on CPU 1.
| If we set the value of PID=-1 and CPU_NUM is 1, this measures all
  processes/threads on the CPU number 1. This requires CAP_SYS_ADMIN
  capability or a /proc/sys/kernel/perf_event_paranoid value of less
  than 1.

$ldmsctl -S $LDMSD_SOCKPATH

| ldmsctl> load name=perfevent
| ldmsctl> config name=perfevent action=add
  metricname="PERF_COUNT_HW_CPU_CYCLES" pid=$PID cpu=$CPU_NUM type=0
  id=0
| ldmsctl> config name=perfevent action=add
  metricname="PERF_COUNT_HW_INSTRUCTIONS" pid=$PID cpu=$CPU_NUM type=0
  id=1
| ldmsctl> config name=perfevent action=add
  metricname="PERF_COUNT_HW_BRANCH_INSTRUCTIONS" pid=$PID cpu=$CPU_NUM
  type=0 id=4
| ldmsctl> config name=perfevent action=add
  metricname="PERF_COUNT_HW_BRANCH_MISSES" pid=$PID cpu=$CPU_NUM type=0
  id=5
| ldmsctl> config name=perfevent action=init instance=$INSTANCE_NAME
  producer=$PRODUCER_NAME
| ldmsctl> start name=perfevent interval=$INTERVAL_VALUE
| ldmsctl> quit

SEE ALSO
=====================

perf_event_open(2), ldmsd(8), ldms_quickstart(7), ldms_sampler_base(7)

diff --git a/rtd/docs/source/sampler_man/Plugin_procdiskstats.rst b/rtd/docs/source/sampler_man/Plugin_procdiskstats.rst
new file mode 100644
index 000000000..97185aad6
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_procdiskstats.rst
@@ -0,0 +1,81 @@
====================
Plugin_procdiskstats
====================

:Date: 18 Feb 2018

.. contents::
   :depth: 3
..

NAME
=====================

Plugin_procdiskstats - man page for the LDMS procdiskstats plugin

SYNOPSIS
=========================

| Within ldmsd_controller or a configuration file:
| config name=procdiskstats [ <attr>=<value> ]

DESCRIPTION
============================

With LDMS (Lightweight Distributed Metric Service), plugins for the
ldmsd (ldms daemon) are configured via ldmsd_controller or a
configuration file. The procdiskstats plugin provides disk info.

WARNING: This sampler is unsupported.

CONFIGURATION ATTRIBUTE SYNTAX
===============================================

The procdiskstats plugin uses the sampler_base base class. This man page
covers only the configuration attributes, or those with default values,
specific to this plugin; see ldms_sampler_base.man for the attributes of
the base class.

**config**
   | name=<plugin_name> device=<devices> [schema=<schema>]
   | configuration line

   name=
      |
      | This MUST be procdiskstats.

   schema=
      |
      | Optional schema name. It is intended that the same sampler on
        different nodes with different metrics have a different schema.
        If not specified, will default to \`procdiskstats\`.
   device=
      |
      | Comma-separated list of devices.

BUGS
=====================

No known bugs.

NOTES
======================

- This sampler is unsupported.

EXAMPLES
=========================

Within ldmsd_controller or a configuration file:

::

   load name=procdiskstats
   config name=procdiskstats producer=vm1_1 instance=vm1_1/procdiskstats component_id=1
   start name=procdiskstats interval=1000000

SEE ALSO
=========================

ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)

diff --git a/rtd/docs/source/sampler_man/Plugin_procinterrupts.rst b/rtd/docs/source/sampler_man/Plugin_procinterrupts.rst
new file mode 100644
index 000000000..d8aa92f2e
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_procinterrupts.rst
@@ -0,0 +1,71 @@
=====================
Plugin_procinterrupts
=====================

:Date: 10 Feb 2018

.. contents::
   :depth: 3
..

NAME
======================

Plugin_procinterrupts - man page for the LDMS procinterrupts plugin

SYNOPSIS
==========================

| Within ldmsd_controller or a configuration file:
| config name=procinterrupts [ <attr>=<value> ]

DESCRIPTION
=============================

With LDMS (Lightweight Distributed Metric Service), plugins for the
ldmsd (ldms daemon) are configured via ldmsd_controller or a
configuration file. The procinterrupts plugin provides info from
/proc/interrupts. The metric name will be irq.#CPU_NUMBER.

CONFIGURATION ATTRIBUTE SYNTAX
================================================

The procinterrupts plugin uses the sampler_base base class. This man
page covers only the configuration attributes, or those with default
values, specific to this plugin; see ldms_sampler_base.man for the
attributes of the base class.

**config**
   | name=<plugin_name> [schema=<schema>]
   | configuration line

   name=
      |
      | This MUST be procinterrupts.

   schema=
      |
      | Optional schema name. It is intended that the same sampler on
        different nodes with different metrics have a different schema.
        If not specified, will default to \`procinterrupts\`.

BUGS
======================

No known bugs.

EXAMPLES
==========================

Within ldmsd_controller or a configuration file:

::

   load name=procinterrupts
   config name=procinterrupts producer=vm1_1 instance=vm1_1/procinterrupts
   start name=procinterrupts interval=1000000

SEE ALSO
==========================

ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)

diff --git a/rtd/docs/source/sampler_man/Plugin_procnet.rst b/rtd/docs/source/sampler_man/Plugin_procnet.rst
new file mode 100644
index 000000000..eb9b758d0
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_procnet.rst
@@ -0,0 +1,76 @@
==============
Plugin_procnet
==============

:Date: 9 Apr 2021

.. contents::
   :depth: 3
..

NAME
===============

Plugin_procnet - man page for the LDMS procnet plugin

SYNOPSIS
===================

| Within ldmsd_controller or a configuration file:
| config name=procnet [common attributes] [exclude_ports=<port list>]

DESCRIPTION
======================

With LDMS (Lightweight Distributed Metric Service), plugins for the
ldmsd (ldms daemon) are configured via ldmsd_controller or a
configuration file. The procnet plugin provides network info from
/proc/net/dev, creating a different set for each device, reporting only
active devices, and reporting an active device only when counters
change.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+=========================================
+
+The procnet plugin uses the sampler_base base class. This man page
+covers only the configuration attributes specific to this plugin, or
+those with default values; see ldms_sampler_base(7) for the attributes
+of the base class.
+
+**config**
+   | name= exclude_ports=
+   | configuration line
+
+   name=
+      |
+      | This MUST be procnet.
+
+   exclude_ports=
+      |
+      | Comma separated list of ports to exclude.
+
+   schema=
+      |
+      | Optional schema name. If not specified, will default to
+        \`procnet\`.
+
+BUGS
+===============
+
+Interfaces reported and exclude_ports lists are each limited to 20.
+
+EXAMPLES
+===================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=procnet
+   config name=procnet producer=vm1_1 instance=vm1_1/procnet exclude_ports=lo
+   start name=procnet interval=1000000
+
+SEE ALSO
+===================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)
diff --git a/rtd/docs/source/sampler_man/Plugin_procnetdev.rst b/rtd/docs/source/sampler_man/Plugin_procnetdev.rst
new file mode 100644
index 000000000..b0a95b81c
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_procnetdev.rst
@@ -0,0 +1,77 @@
+=================
+Plugin_procnetdev
+=================
+
+:Date: 10 Dec 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+==================
+
+Plugin_procnetdev - man page for the LDMS procnetdev plugin
+
+SYNOPSIS
+======================
+
+| Within ldmsd_controller or a configuration file:
+| config name=procnetdev [ <attribute>=<value> ]
+
+DESCRIPTION
+=========================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The procnetdev plugin provides network info from
+/proc/net/dev.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+============================================
+
+The procnetdev plugin uses the sampler_base base class. This man page
+covers only the configuration attributes specific to this plugin, or
+those with default values; see ldms_sampler_base(7) for the attributes
+of the base class.
+
+**config**
+   | name= ifaces=
+   | configuration line
+
+   name=
+      |
+      | This MUST be procnetdev.
+
+   ifaces=
+      |
+      | CSV list of ifaces. Order matters. Non-existent ifaces will be
+        included and default to 0-value data.
+
+   schema=
+      |
+      | Optional schema name. It is intended that the same sampler on
+        different nodes with different metrics or ifaces have a
+        different schema. If not specified, will default to
+        \`procnetdev\`.
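+
+Because order matters and missing interfaces are filled with 0-value
+data, the ifaces list can be used to pin a fixed metric layout across
+heterogeneous nodes. A sketch (the interface names are illustrative):
+
+::
+
+   config name=procnetdev producer=vm1_1 instance=vm1_1/procnetdev ifaces=eth0,eth1,ib0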
+
+BUGS
+==================
+
+Interfaces list is limited to 20.
+
+EXAMPLES
+======================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=procnetdev
+   config name=procnetdev producer=vm1_1 instance=vm1_1/procnetdev ifaces=eth0,eth1
+   start name=procnetdev interval=1000000
+
+SEE ALSO
+======================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)
diff --git a/rtd/docs/source/sampler_man/Plugin_procnetdev2.rst b/rtd/docs/source/sampler_man/Plugin_procnetdev2.rst
new file mode 100644
index 000000000..37f39f743
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_procnetdev2.rst
@@ -0,0 +1,79 @@
+==================
+Plugin_procnetdev2
+==================
+
+:Date: 07 Jan 2022
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+===================
+
+Plugin_procnetdev2 - man page for the LDMS procnetdev2 plugin
+
+SYNOPSIS
+=======================
+
+| Within ldmsd_controller or a configuration file:
+| config name=procnetdev2 [ <attribute>=<value> ]
+
+DESCRIPTION
+==========================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The procnetdev2 plugin uses LDMS_V_LIST and
+LDMS_V_RECORD to provide network info from /proc/net/dev.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+=============================================
+
+The procnetdev2 plugin uses the sampler_base base class. This man page
+covers only the configuration attributes specific to this plugin, or
+those with default values; see ldms_sampler_base(7) for the attributes
+of the base class.
+
+**config**
+   | name= [ifaces=]
+   | configuration line
+
+   name=
+      |
+      | This MUST be procnetdev2.
+
+   ifaces=
+      |
+      | (Optional) A CSV list of interfaces to sample. If not
+        specified, all available interfaces in /proc/net/dev will be
+        reported. It is OK to specify non-existing interfaces in the
+        ifaces list.
+
+   schema=
+      |
+      | Optional schema name. It is intended that the same sampler on
+        different nodes with different metrics or ifaces have a
+        different schema. If not specified, will default to
+        \`procnetdev2\`.
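+
+Because ifaces is optional, a minimal configuration that reports every
+interface present in /proc/net/dev might look like this sketch:
+
+::
+
+   load name=procnetdev2
+   config name=procnetdev2 producer=vm1_1 instance=vm1_1/procnetdev2
+   start name=procnetdev2 interval=1000000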
+
+BUGS
+===================
+
+The maximum number of interfaces is limited to 32.
+
+EXAMPLES
+=======================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=procnetdev2
+   config name=procnetdev2 producer=vm1_1 instance=vm1_1/procnetdev2 ifaces=eth0,eth1
+   start name=procnetdev2 interval=1000000 offset=0
+
+SEE ALSO
+=======================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7),
+Plugin_procnetdev(7)
diff --git a/rtd/docs/source/sampler_man/Plugin_procnfs.rst b/rtd/docs/source/sampler_man/Plugin_procnfs.rst
new file mode 100644
index 000000000..49bc21ea3
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_procnfs.rst
@@ -0,0 +1,70 @@
+==============
+Plugin_procnfs
+==============
+
+:Date: 10 Feb 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+===============
+
+Plugin_procnfs - man page for the LDMS procnfs plugin
+
+SYNOPSIS
+===================
+
+| Within ldmsd_controller or a configuration file:
+| config name=procnfs [ <attribute>=<value> ]
+
+DESCRIPTION
+======================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The procnfs plugin provides info from
+/proc/net/rpc/nfs.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+=========================================
+
+The procnfs plugin uses the sampler_base base class. This man page
+covers only the configuration attributes specific to this plugin, or
+those with default values; see ldms_sampler_base(7) for the attributes
+of the base class.
+
+**config**
+   | name= [schema=]
+   | configuration line
+
+   name=
+      |
+      | This MUST be procnfs.
+
+   schema=
+      |
+      | Optional schema name. It is intended that the same sampler on
+        different nodes with different metrics have a different
+        schema. If not specified, will default to \`procnfs\`.
+
+BUGS
+===============
+
+No known bugs.
+
+EXAMPLES
+===================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=procnfs
+   config name=procnfs producer=vm1_1 instance=vm1_1/procnfs
+   start name=procnfs interval=1000000
+
+SEE ALSO
+===================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)
diff --git a/rtd/docs/source/sampler_man/Plugin_procstat.rst b/rtd/docs/source/sampler_man/Plugin_procstat.rst
new file mode 100644
index 000000000..1c49164e7
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_procstat.rst
@@ -0,0 +1,85 @@
+===============
+Plugin_procstat
+===============
+
+:Date: 03 Dec 2016
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+================
+
+Plugin_procstat - man page for the LDMS procstat plugin
+
+SYNOPSIS
+====================
+
+| Within ldmsd_controller or in a configuration file
+| config name=procstat [ <attribute>=<value> ]
+
+DESCRIPTION
+=======================
+
+The procstat plugin provides cpu utilization info from /proc/stat,
+allowing for hyperthreading and downed core variability. As
+hyperthreading might be variable and user selectable depending on
+system configuration, the maximum number of cores potentially
+appearing should be set in the plugin options with the maxcpu
+parameter. Cores not actually appearing will be reported as 0 values.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+==========================================
+
+See ldms_sampler_base(7) for the common sampler options.
+
+**config**
+   | maxcpu=
+   | configuration line
+
+   maxcpu=
+      |
+      | Values are 0 to N, where 0 logs only totalized data and N
+        reserves slots for N cores. If less than N cores are found,
+        0-values are reported. If more than N cores are found, they
+        are ignored with an INFO note in the log. Default is the
+        number of cores found locally when the sampler is started. If
+        machines monitored may have cores disabled or variable
+        hyperthreading status, set maxcpu to the most cores that will
+        be reported anywhere in the cluster.
+
+   sc_clk_tck=1
+      |
+      | Enable optional reporting of sysconf(\_SC_CLK_TCK), the
+        scheduler ticks-per-second defined at kernel build time as
+        CONFIG_HZ, collected from sysconf(3). Typically HPC systems
+        use 100, while 250, 300, 1000 may also occur.
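+
+As a sketch, a cluster whose largest nodes expose 64 hardware threads,
+with tick reporting enabled, might be configured as follows (producer
+and instance names are illustrative):
+
+::
+
+   config name=procstat producer=vm1_1 instance=vm1_1/procstat maxcpu=64 sc_clk_tck=1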
+
+DATA
+================
+
+This reports both interrupt count and time processing them. For
+detailed interrupt data by type, consider Plugin_procinterrupts(7).
+
+BUGS
+================
+
+Reporting all interrupts by name is not implemented.
+
+EXAMPLES
+====================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=procstat
+   config name=procstat producer=vm1_1 component_id=1 maxcpu=4 instance=vm1_1/procstat with_jobid=0
+   start name=procstat interval=1000000 offset=0
+
+SEE ALSO
+====================
+
+ldms_sampler_base(7), Plugin_procinterrupts(7), Kernel source
+fs/proc/stat.c and proc(5), ldmsd(8), ldmsd_controller(8)
diff --git a/rtd/docs/source/sampler_man/Plugin_procstat2.rst b/rtd/docs/source/sampler_man/Plugin_procstat2.rst
new file mode 100644
index 000000000..88f6896a4
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_procstat2.rst
@@ -0,0 +1,76 @@
+================
+Plugin_procstat2
+================
+
+:Date: 14 Jan 2022
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+=================
+
+Plugin_procstat2 - man page for the LDMS procstat2 plugin
+
+SYNOPSIS
+=====================
+
+| Within ldmsd_controller or a configuration file:
+| config name=procstat2 [ <attribute>=<value> ]
+
+DESCRIPTION
+========================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The procstat2 plugin provides data from
+/proc/stat.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+===========================================
+
+The procstat2 plugin uses the sampler_base base class. This man page
+covers only the configuration attributes specific to this plugin, or
+those with default values; see ldms_sampler_base(7) for the attributes
+of the base class.
+
+**config**
+   | name= [schema=]
+   | configuration line
+
+   name=
+      |
+      | This MUST be procstat2.
+
+   schema=
+      |
+      | Optional schema name. It is intended that the same sampler on
+        different nodes with different metrics have a different
+        schema. If not specified, will default to \`procstat2\`.
+
+   intr_max=
+      |
+      | (Optional) The maximum number of interrupt numbers supported
+        in intr_list. If not specified, intr_max will be the current
+        number of interrupts in the intr_list.
+
+BUGS
+=================
+
+No known bugs.
+
+EXAMPLES
+=====================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=procstat2
+   config name=procstat2 producer=vm1_1 instance=vm1_1/procstat2
+   start name=procstat2 interval=1000000
+
+SEE ALSO
+=====================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)
diff --git a/rtd/docs/source/sampler_man/Plugin_rdc_sampler.rst b/rtd/docs/source/sampler_man/Plugin_rdc_sampler.rst
new file mode 100644
index 000000000..8a25f553f
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_rdc_sampler.rst
@@ -0,0 +1,120 @@
+==================
+Plugin_rdc_sampler
+==================
+
+:Date: 1 Apr 2021
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+===================
+
+Plugin_rdc_sampler - man page for the LDMS rdc_sampler plugin
+
+SYNOPSIS
+=======================
+
+| Within ldmsd_controller or a configuration file:
+| config name=rdc_sampler [ <attribute>=<value> ]
+
+DESCRIPTION
+==========================
+
+The rdc_sampler plugin provides AMD gpu device data. Data sets may be
+wide or per-device. Plugins for the ldmsd (ldms daemon) are configured
+via ldmsd_controller or a configuration file.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+=============================================
+
+**config**
+   | name= [producer=] [instance=]
+     [component_id=] [schema=] [uid=]
+     [gid=] [perm=] [metrics=LIST]
+     [update_freq=MICROSEC] [max_keep_age=SEC] [max_keep_samples=N]
+   | configuration line
+
+   name=
+      |
+      | This MUST be rdc_sampler.
+
+   producer=
+      |
+      | The producer string value for the timing set.
+
+   instance=
+      |
+      | The set instance names will be suffixed by device number
+        (gpu%d).
+
+   schema=
+      |
+      | Optional schema base name. The default is rdc_sampler. The
+        name base is suffixed to create uniquely defined schema names
+        based on the plugin options specified.
+
+   component_id=
+      |
+      | Optional component identifier for the timing set. Defaults to
+        zero.
+
+   metrics=LIST
+      |
+      | The list of values to be collected as named in rdc_field_t
+        from rdc/rdc.h.
+
+   update_freq=MICROSEC
+      |
+      | An argument passed to rdc_field_watch.
+
+   max_keep_age=SEC
+      |
+      | An argument passed to rdc_field_watch.
+
+   max_keep_samples=N
+      |
+      | An argument passed to rdc_field_watch.
+
+   warmup=K
+      |
+      | Delay K cycles of update_freq length before attempting to read
+        data from the gpu.
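+
+For example, a configuration that collects temperature and power might
+look like the following sketch. The metric names here are assumptions
+drawn from rdc_field_t in rdc/rdc.h; verify them against your ROCm
+installation before use:
+
+::
+
+   config name=rdc_sampler producer=vm1_1 instance=vm1_1/rdc metrics=RDC_FI_GPU_TEMP,RDC_FI_POWER_USAGE update_freq=1000000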
+
+EXAMPLES
+=======================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=rdc_sampler
+   config name=rdc_sampler component_id=1
+   start name=rdc_sampler interval=1000000
+
+NOTES
+====================
+
+The exact schema name that will be generated can be determined using
+the ldms_rdc_schema_name utility. The data available may depend on the
+specific GPUs and their configuration.
+
+The rdc libraries loaded by the plugin may emit inconsequential error
+messages to stdout. Two such begin with "ERROR RdcLibraryLoader.cc"
+and "ERROR RdcMetricFetcherImpl.cc". The latter suggests you may have
+requested metrics unsupported by your hardware.
+
+BUGS
+===================
+
+At ldmsd exit, there is a race between sampler termination and the rdc
+library thread cleanup. This may lead to an exception being thrown in
+the library code that terminates ldmsd with a C++ exception message.
+
+SEE ALSO
+=======================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8),
+ldms_rdc_schema_name(1)
diff --git a/rtd/docs/source/sampler_man/Plugin_sampler_atasmart.rst b/rtd/docs/source/sampler_man/Plugin_sampler_atasmart.rst
new file mode 100644
index 000000000..d0eb73577
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_sampler_atasmart.rst
@@ -0,0 +1,87 @@
+=======================
+Plugin_sampler_atasmart
+=======================
+
+:Date: 18 Feb 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+========================
+
+Plugin_sampler_atasmart - man page for the LDMS sampler_atasmart plugin
+
+SYNOPSIS
+============================
+
+| Within ldmsd_controller or a configuration file:
+| config name=sampler_atasmart [ <attribute>=<value> ]
+
+DESCRIPTION
+===============================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The sampler_atasmart plugin provides disk info via
+the atasmart library.
+
+WARNING: This sampler is unsupported.
+
+ENVIRONMENT
+===============================
+
+To build this sampler, the atasmart library must be installed.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+==================================================
+
+The sampler_atasmart plugin uses the sampler_base base class. This man
+page covers only the configuration attributes specific to this plugin,
+or those with default values; see ldms_sampler_base(7) for the
+attributes of the base class.
+
+**config**
+   | name= [schema=] disks=
+   | configuration line
+
+   name=
+      |
+      | This MUST be sampler_atasmart.
+
+   schema=
+      |
+      | Optional schema name. It is intended that the same sampler on
+        different nodes with different metrics have a different
+        schema. If not specified, will default to \`sampler_atasmart\`.
+
+   disks
+      |
+      | A comma-separated list of disk names (e.g., /dev/sda,/dev/sda1)
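+
+For instance (the device paths are illustrative):
+
+::
+
+   config name=sampler_atasmart producer=vm1_1 instance=vm1_1/sampler_atasmart disks=/dev/sda,/dev/sdb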
+
+BUGS
+========================
+
+No known bugs.
+
+NOTES
+=========================
+
+- This sampler is unsupported.
+
+EXAMPLES
+============================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=sampler_atasmart
+   config name=sampler_atasmart producer=vm1_1 instance=vm1_1/sampler_atasmart component_id=1
+   start name=sampler_atasmart interval=1000000
+
+SEE ALSO
+============================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)
diff --git a/rtd/docs/source/sampler_man/Plugin_slingshot_info.rst b/rtd/docs/source/sampler_man/Plugin_slingshot_info.rst
new file mode 100644
index 000000000..804b73ae5
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_slingshot_info.rst
@@ -0,0 +1,71 @@
+=====================
+Plugin_slingshot_info
+=====================
+
+:Date: 1 May 2022
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+======================
+
+Plugin_slingshot_info - man page for the LDMS slingshot_info plugin
+
+SYNOPSIS
+==========================
+
+| Within ldmsd_controller or a configuration file:
+| config name=slingshot_info [ <attribute>=<value> ]
+
+DESCRIPTION
+=============================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The slingshot_info plugin provides a single metric
+set that contains a list of records. Each record contains all of the
+informational fields for a single slingshot NIC.
+
+The slingshot_info sampler plugin provides a fairly small set of
+general information about each slingshot NIC, including FRU
+description, serial number, etc. Likely users will want to sample this
+plugin relatively infrequently. For detailed slingshot NIC counter
+data, see the slingshot_metrics sampler plugin.
+
+The schema is named "slingshot_info" by default.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+================================================
+
+The slingshot_info plugin uses the sampler_base base class. This man
+page covers only the configuration attributes specific to this plugin,
+or those with default values; see ldms_sampler_base(7) for the
+attributes of the base class.
+
+**config**
+   | name=
+   | configuration line
+
+   name=
+      |
+      | This MUST be slingshot_info.
+
+EXAMPLES
+==========================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=slingshot_info
+   config name=slingshot_info producer=host1 instance=host1/slingshot_info
+   start name=slingshot_info interval=1000000 offset=0
+
+SEE ALSO
+==========================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7),
+Plugin_slingshot_metrics(7)
diff --git a/rtd/docs/source/sampler_man/Plugin_slingshot_metrics.rst b/rtd/docs/source/sampler_man/Plugin_slingshot_metrics.rst
new file mode 100644
index 000000000..9bd5c2577
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_slingshot_metrics.rst
@@ -0,0 +1,106 @@
+========================
+Plugin_slingshot_metrics
+========================
+
+:Date: 1 May 2022
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+=========================
+
+Plugin_slingshot_metrics - man page for the LDMS slingshot_metrics
+plugin
+
+SYNOPSIS
+=============================
+
+| Within ldmsd_controller or a configuration file:
+| config name=slingshot_metrics [ <attribute>=<value> ]
+
+DESCRIPTION
+================================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The slingshot_metrics plugin provides a single
+metric set that contains a list of records. Each record contains all
+of the metrics for a single slingshot NIC.
+
+The slingshot_metrics sampler plugin provides detailed counter metrics
+for each slingshot NIC.
+
+The schema is named "slingshot_metrics" by default.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+===================================================
+
+The slingshot_metrics plugin uses the sampler_base base class. This
+man page covers only the configuration attributes specific to this
+plugin, or those with default values; see ldms_sampler_base(7) for the
+attributes of the base class.
+
+**config**
+   | name= [counters=] [counters_file=]
+   | configuration line
+
+   name=
+      |
+      | This MUST be slingshot_metrics.
+
+   counters=
+      |
+      | (Optional) A CSV list of slingshot counter names. See Section
+        COUNTER NAMES for details. If neither this option nor
+        counters_file are specified, a default set of counters will
+        be used.
+
+   counters_file=
+      |
+      | (Optional) A path to a file that contains a list of counter
+        names, one per line. See Section COUNTER NAMES for details. A
+        line is considered a comment if the first character on the
+        line is a "#". If neither this option nor counters are
+        specified, a default set of counters will be used.
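+
+As a sketch, a counters_file might mix individual counters and group
+patterns, one per line (these particular selections are illustrative;
+see Section COUNTER NAMES below):
+
+::
+
+   # NIC counters of interest
+   ixe_rx_tcp_pkt
+   group:hni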
+
+   refresh_interval_sec=
+      |
+      | (Optional) The sampler caches the list of slingshot devices,
+        and that cache is refreshed at the beginning of a sample
+        cycle if the refresh interval time has been exceeded.
+        refresh_interval_sec sets the minimum number of seconds
+        between refreshes of the device cache. The default refresh
+        interval is 600 seconds.
+
+COUNTER NAMES
+==================================
+
+The names of the counters can be found in the slingshot/cassini header
+file cassini_cntr_def.h in the array c1_cntr_defs (specifically the
+strings in the "name" field of said array entries).
+
+In addition to the individual counter names, this plugin allows
+specifying entire groups of counters by using the counter name pattern
+"group:<group>", for instance, "group:hni". The available groups are:
+ext, pi_ipd, mb, cq, lpe, hni, ext2. These groups correspond with the
+enum c_cntr_group in the cassini_cntr_def.h file. Additionally, one
+may use "group:all", which simply includes all available counters.
+
+EXAMPLES
+=============================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=slingshot_metrics
+   config name=slingshot_metrics producer=host1 instance=host1/slingshot_metrics counters=ixe_rx_tcp_pkt,group:hni refresh_interval_sec=3600
+   start name=slingshot_metrics interval=1000000 offset=0
+
+SEE ALSO
+=============================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)
diff --git a/rtd/docs/source/sampler_man/Plugin_slurm_notifier.rst b/rtd/docs/source/sampler_man/Plugin_slurm_notifier.rst
new file mode 100644
index 000000000..cb0c80a84
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_slurm_notifier.rst
@@ -0,0 +1,78 @@
+=====================
+Plugin_slurm_notifier
+=====================
+
+:Date: 30 Sep 2019
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+======================
+
+Plugin_slurm_notifier - man page for the SPANK slurm_notifier plugin
+
+SYNOPSIS
+==========================
+
+Within plugstack.conf: **required**
+*OVIS_PREFIX*/*LIBDIR*/ovis-ldms/libslurm_notifier.so
+**stream=**\ *STREAM_NAME* **timeout=**\ *TIMEOUT_SEC* **[user_debug]**
+**client=**\ *XPRT*\ **:**\ *HOST*\ **:**\ *PORT*\ **:**\ *AUTH* ...
+ +DESCRIPTION +============================= + +**slurm_notifier** is a SPANK plugin that notifies **ldmsd** about job +events (e.g. job start, job termination) and related information (e.g. +job_id, task_id, task process ID). The notification is done over +**ldmsd_stream** publish mechanism. See SUBSCRIBERS below for plugins +known to consume the spank plugin messages. + +**stream=**\ *STREAM_NAME* specifies the name of publishing stream. The +default value is *slurm*. + +**timeout=**\ *TIMEOUT_SEC* is the number of seconds determining the +time-out of the LDMS connections (default *5*). + +**user_debug,** if present, enables sending certain plugin management +debugging messages to the user's slurm output. (default: disabled -- +slurm_debug2() receives the messages instead). + +**client=**\ *XPRT*\ **:**\ *HOST*\ **:**\ *PORT*\ **:**\ *AUTH* +specifies **ldmsd** to which **slurm_notifier** publishes the data. The +*XPRT* specifies the type of the transport, which includes **sock**, +**rdma**, **ugni**, and **fabric**. The *HOST* is the hostname or the IP +address that **ldmsd** resides. The *PORT* is the listening port of the +**ldmsd**. The *AUTH* is the LDMS authentication method that the +**ldmsd** uses, which are **munge**, or **none**. The **client** option +can be repeated to specify multiple **ldmsd**'s. + +SUBSCRIBERS +============================= + +The following plugins are known to process slurm_notifier messages: + +:: + + slurm_sampler (collects slurm job & task data) + slurm_sampler2 (collects slurm job & task data) + papi_sampler (collects PAPI data from tasks identified) + linux_proc_sampler (collects /proc data from tasks identified) + +EXAMPLES +========================== + +/etc/slurm/plugstack.conf: + + :: + + required /opt/ovis/lib64/ovis-ldms/libslurm_notifier.so stream=slurm timeout=5 client=sock:localhost:10000:munge client=sock:node0:10000:munge + +SEE ALSO +========================== + +**spank**\ (8), **Plugin_slurm_sampler**\ (7), +**Plugin_papi_sampler**\ (7), **Plugin_linux_proc_sampler**\ (7), +**ldmsd**\ (8), **ldms_quickstart**\ (7), diff --git a/rtd/docs/source/sampler_man/Plugin_slurm_sampler.rst b/rtd/docs/source/sampler_man/Plugin_slurm_sampler.rst new file mode 100644 index 000000000..06054ed1b --- /dev/null +++ b/rtd/docs/source/sampler_man/Plugin_slurm_sampler.rst @@ -0,0 +1,93 @@ +==================== +Plugin_slurm_sampler +==================== + +:Date: 30 Sep 2019 + +.. contents:: + :depth: 3 +.. + +NAME +===================== + +Plugin_slurm_sampler - man page for the LDMSD slurm_sampler plugin + +SYNOPSIS +========================= + +Within ldmsd_controller or a configuration file: **config** +**name=slurm_sampler** **producer=**\ *PRODUCER* +**instance=**\ *INSTANCE* [ **component_id=\ COMP_ID** ] [ +**stream=\ STREAM** ] [ **job_count=\ MAX_JOBS** ] [ +**task_count=\ MAX_TASKS** ] + +DESCRIPTION +============================ + +**slurm_sampler** is a sampler plugin that collects the information of +the Slurm jobs running on the node. It subscribes to the specified +**stream** to which the **slurm_notifier** SPANK plugin (see +**Plugin_slurm_notifier**\ (7)) publish Slurm job events (default +stream: *slurm*). The sampler supports multi-tenant jobs. + +The **job_count** option is the number of slots in the LDMS set +allocated for concurrent jobs. If the number of concurrent jobs on the +node is greater than **job_count**, the new job will occupy the slot of +the oldest job. If **job_count** is not specified, the default value is +*8*. 
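+
+For example, a node expected to host up to 16 concurrent jobs could
+reserve more slots when the plugin is configured (a sketch; producer
+and instance names are illustrative):
+
+::
+
+   config name=slurm_sampler producer=vm1_1 instance=vm1_1/slurm job_count=16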
+
+The **task_count** is the maximum number of tasks per job on the node.
+If not specified, it is *CPU_COUNT*. If the sampler fails to obtain
+*CPU_COUNT*, the default value is *64*.
+
+CONFIG OPTIONS
+===============================
+
+**name=slurm_sampler**
+   This MUST be slurm_sampler (the name of the plugin).
+
+**producer=**\ *PRODUCER*
+   The name of the data producer (e.g. hostname).
+
+**instance=**\ *INSTANCE*
+   The name of the set produced by this plugin. This option is
+   required.
+
+**component_id=**\ *COMPONENT_ID*
+   An integer identifying the component (default: *0*).
+
+**stream=**\ *STREAM*
+   The name of the LDMSD stream to get the job event data.
+
+**job_count=**\ *MAX_JOBS*
+   The number of slots to hold job information. If all slots are
+   occupied at the time a new job arrives, the oldest slot is reused.
+   The default value is *8*.
+
+**task_count=**\ *MAX_TASKS*
+   The number of slots for task information per job. If not
+   specified, the sampler will try to obtain the system CPU_COUNT and
+   use it as task_count. If that fails, the default value is *64*.
+
+BUGS
+=====================
+
+No known bugs.
+
+EXAMPLES
+=========================
+
+Plugin configuration example:
+
+::
+
+   load name=slurm_sampler
+   config name=slurm_sampler producer=${HOSTNAME} instance=${HOSTNAME}/slurm \
+          component_id=2 stream=slurm job_count=8 task_count=8
+   start name=slurm_sampler interval=1000000 offset=0
+
+SEE ALSO
+=========================
+
+**ldmsd**\ (8), **ldms_quickstart**\ (7), **ldmsd_controller**\ (8),
+**ldms_sampler_base**\ (7).
diff --git a/rtd/docs/source/sampler_man/Plugin_sysclassib.rst b/rtd/docs/source/sampler_man/Plugin_sysclassib.rst
new file mode 100644
index 000000000..d93c04b0b
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_sysclassib.rst
@@ -0,0 +1,73 @@
+=================
+Plugin_sysclassib
+=================
+
+:Date: 10 Feb 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+==================
+
+Plugin_sysclassib - man page for the LDMS sysclassib plugin
+
+SYNOPSIS
+======================
+
+| Within ldmsd_controller or in a configuration file
+| config name=sysclassib [ <attribute>=<value> ]
+
+DESCRIPTION
+=========================
+
+The sysclassib plugin provides IB metric information in raw and rate
+(per second) forms.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+============================================
+
+The sysclassib plugin uses the sampler_base base class. This man page
+covers only the configuration attributes specific to this plugin, or
+those with default values; see ldms_sampler_base(7) for the attributes
+of the base class.
+
+**config**
+   | name= [schema=] ports= [metrics_type=]
+   | configuration line
+
+   name=
+      |
+      | This MUST be sysclassib.
+
+   metrics_type=
+      |
+      | Values are 0 or 1. 0 = counter data only. 1 = include rate
+        data (per second) in addition. Default is 0.
+
+   ports=
+      |
+      | CSV list of the form CARD1.PORT1,CARD2.PORT2. Default is all
+        discovered values.
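+
+For instance, to sample only port 1 of two specific cards and include
+rate data (the card names are illustrative):
+
+::
+
+   config name=sysclassib producer=vm1_1 instance=vm1_1/sysclassib ports=mlx4_0.1,mlx5_0.1 metrics_type=1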
+
+BUGS
+==================
+
+No known bugs.
+
+EXAMPLES
+======================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=sysclassib
+   config name=sysclassib component_id=1 producer=vm1_1 instance=vm1_1/sysclassib metrics_type=1
+   start name=sysclassib interval=1000000 offset=0
+
+SEE ALSO
+======================
+
+ldms(7), Plugin_procnetdev(7), ldms_sampler_base(7)
diff --git a/rtd/docs/source/sampler_man/Plugin_syspapi_sampler.rst b/rtd/docs/source/sampler_man/Plugin_syspapi_sampler.rst
new file mode 100644
index 000000000..7aa35cb66
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_syspapi_sampler.rst
@@ -0,0 +1,135 @@
+======================
+Plugin_syspapi_sampler
+======================
+
+:Date: 30 Sep 2019
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+=======================
+
+Plugin_syspapi_sampler - man page for the LDMSD syspapi_sampler plugin
+
+SYNOPSIS
+===========================
+
+Within ldmsd_controller or a configuration file: **config**
+**name=syspapi_sampler** **producer=**\ *PRODUCER*
+**instance=**\ *INSTANCE* [ **schema=\ SCHEMA** ] [
+**component_id=\ COMPONENT_ID** ] [ **cfg_file=\ PATH** ] [
+**events=\ EVENTS** ] [ **cumulative=\ 0\ \|\ 1** ] [
+**auto_pause=\ 0\ \|\ 1** ]
+
+DESCRIPTION
+==============================
+
+**syspapi_sampler** collects system-wide hardware event counters using
+Linux perf event (see **perf_event_open**\ (2)), but uses PAPI event
+names. **libpapi** and **libpfm** are used to translate PAPI event
+names to Linux perf event attributes. For per-process (job) data
+collection, please see **Plugin_papi_sampler**. There are two
+approaches to define a list of events: 1) the **events** option, or
+2) a PAPI JSON config file. For the **events** option, simply list the
+events of interest separated by commas (e.g.
+events=PAPI_TOT_INS,PAPI_TOT_CYC). For the PAPI JSON config file
+(**cfg_file** option), the format of the file is as follows:
+
+::
+
+   {
+     "schema": "my_syspapi",
+     "events": [
+       ...
+     ]
+   }
+
+The **schema** is optional, but if specified in the JSON config file,
+it takes precedence over the schema name given at the **config**
+command. The **events** is a list of PAPI event names (strings).
+
+If both **cfg_file** and **events** options are given to the config
+command, the lists are concatenated. Please note that an event that
+appears on both lists will result in an error.
+
+**auto_pause**\ =\ *1* (which is the default) makes
+**syspapi_sampler** pause data sampling when it receives a
+notification from **papi_sampler** that a job is active, and resume
+data sampling when it receives a notification from **papi_sampler**
+that all jobs have terminated. This is to prevent perf system resource
+contention. We have seen all-0 counters on **papi_sampler** without
+any errors (possibly a silent error) when running it with an active
+**syspapi_sampler**.
+
+CONFIG OPTIONS
+=================================
+
+**name=syspapi_sampler**
+   This MUST be syspapi_sampler (the name of the plugin).
+
+**producer=**\ *PRODUCER*
+   The name of the data producer (e.g. hostname).
+
+**instance=**\ *INSTANCE*
+   The name of the set produced by this plugin.
+
+**schema=**\ *SCHEMA*
+   The optional schema name (default: syspapi_sampler). Please note
+   that the **"schema"** from the JSON **cfg_file** overrides this
+   option.
+
+**component_id=**\ *COMPONENT_ID*
+   An integer identifying the component (default: *0*).
+
+**cfg_file=**\ *PATH*
+   The path to the JSON-formatted config file. This is optional if
+   **events** is specified. Otherwise, this option is required.
+ +**events=**\ *EVENTS* + The comma-separated list of PAPI events of interest (e.g. + *PAPI_TOT_INS,PAPI_TOT_CYC*). This is optional if **cfg_file** is + specified. Otherwise, this option is required. + +**cumulative=**\ *0*\ **\|**\ *1* + *0* (default) for non-cumulative data sampling (reset after read), or + *1* for cumulative data sampling. + +**auto_pause=**\ *0*\ **\|**\ *1* + *0* to ignore **papi_sampler** pause/resume notification, or *1* + (default) to pause/resume according to notifications from + **papi_sampler**. + +BUGS +======================= + +No known bugs. + +EXAMPLES +=========================== + +Plugin configuration example: + + :: + + load name=syspapi_sampler + config name=syspapi_sampler producer=${HOSTNAME} \ + instance=${HOSTNAME}/syspapi component_id=2 \ + cfg_file=/tmp/syspapi.json + start name=syspapi_sampler interval=1000000 offset=0 + +JSON cfg_file example: + + :: + + { + "events": [ + "PAPI_TOT_INS", + "PAPI_TOT_CYC" + ] + } + +SEE ALSO +=========================== + +**Plugin_papi_sampler**\ (7), **ldmsd**\ (8), **ldms_quickstart**\ (7), +**ldmsd_controller**\ (8), **ldms_sampler_base**\ (7). diff --git a/rtd/docs/source/sampler_man/Plugin_tutorial_sampler.rst b/rtd/docs/source/sampler_man/Plugin_tutorial_sampler.rst new file mode 100644 index 000000000..2a0188d7a --- /dev/null +++ b/rtd/docs/source/sampler_man/Plugin_tutorial_sampler.rst @@ -0,0 +1,99 @@ +======================= +Plugin_tutorial_sampler +======================= + +:Date: 24 Oct 2019 + +.. contents:: + :depth: 3 +.. + +NAME +======================== + +Plugin_tutorial_sampler - man page for the LDMS tutorial_sampler plugin + +SYNOPSIS +============================ + +| Within ldmsd_controller or a configuration file: +| config name=tutorial_sampler [ = ] + +DESCRIPTION +=============================== + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. The tutorial_sampler plugin is a demo sampler +described in the LDMSCON2019 tutorial "LDMS v4: Sampler and Store +Writing". + +This sampler is a simplified version of test_sampler, with a fixed +number of sets and u64 data types only. Max sets is determined by +MAXSETS in the source. + +CONFIGURATION ATTRIBUTE SYNTAX +================================================== + +The tutorial_sampler plugin uses the sampler_base base class. This man +page covers only the configuration attributes, or those with default +values, specific to the this plugin; see ldms_sampler_base.man for the +attributes of the base class. + +**config** + | name= [schema=] [num_metrics=] + | configuration line + + name= + | + | This MUST be tutorial_sampler. + + num_metrics= + | + | Optional number of metrics for this set. Metrics will be U64. + Metric names will be 'metric\_%d'. If not specified, default + number of metrics is determined by DEFAULTNUMMETRICS in the + source. + + schema= + | + | Optional schema name. It is intended that any sets with + different metrics have a different schema. If not specified, + will default to \`tutorial_sampler\`. Therefore, if you are + creating multiple sets in this sampler, you will most likely + want to define schema for each set. + +BUGS +======================== + +No known bugs. 
+
+EXAMPLES
+============================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   config name=tutorial_sampler producer=localhost1 instance=localhost1/test1 schema=test1 component_id=1
+   config name=tutorial_sampler producer=localhost1 instance=localhost1/test2 schema=test2 component_id=2 num_metrics=5
+   config name=tutorial_sampler producer=localhost1 instance=localhost1/test3 schema=test3 component_id=1 num_metrics=2 job_set=localhost1/jobid
+   start name=tutorial_sampler interval=1000000
+
+::
+
+   > ldms_ls
+   localhost1/test1: consistent, last update: Thu Oct 24 10:55:14 2019 -0600 [223680us]
+   M u64 component_id 1
+   D u64 job_id       0
+   D u64 app_id       0
+   D u64 metric0      2
+   D u64 metric1      4
+   D u64 metric2      6
+   D u64 metric3      8
+   D u64 metric4      10
+   D u64 metric5      12
+   D u64 metric6      14
+   D u64 metric7      16
+   D u64 metric8      18
+   D u64 metric9      20
+   localhost1/test2: consistent, last update: Thu Oct 24 10:55:14 2019 -0600 [223699us]
+   M u64 component_id 2
+   D u64 job_id       0
+   D u64 app_id       0
+   D u64 metric0      4
+   D u64 metric1      8
+   D u64 metric2      12
+   D u64 metric3      16
+   D u64 metric4      20
+   localhost1/test3: consistent, last update: Thu Oct 24 10:55:14 2019 -0600 [223717us]
+   M u64 component_id 1
+   D u64 job_id       0
+   D u64 app_id       0
+   D u64 metric0      6
+   D u64 metric1      12
+
+SEE ALSO
+============================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7),
+test_sampler(7), Plugin_store_tutorial(7)
diff --git a/rtd/docs/source/sampler_man/Plugin_tx2mon.rst b/rtd/docs/source/sampler_man/Plugin_tx2mon.rst
new file mode 100644
index 000000000..0f41c1a73
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_tx2mon.rst
@@ -0,0 +1,185 @@
+=============
+Plugin_tx2mon
+=============
+
+:Date: 25 Dec 2020
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+==============
+
+Plugin_tx2mon - man page for the LDMS tx2mon plugin
+
+SYNOPSIS
+==================
+
+| Within ldmsd configuration
+| config name=tx2mon [ <attribute>=<value> ]
+
+DESCRIPTION
+=====================
+
+The tx2mon plugin provides cpu and system-on-chip information from
+/sys/bus/platform/devices/tx2mon/[socinfo, node_raw] and reports
+it in the same units as the tx2mon command-line utility.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+========================================
+
+The standard options from sampler_base apply. The specific options for
+tx2mon are listed here:
+
+**config**
+   | name=tx2mon [array=] [extra=] [auto-schema=]
+
+   schema=
+      |
+      | Optional schema name. It is required by most storage backends
+        that the same sampler on different nodes with different metric
+        subsets needs to have a unique schema name. Use auto-schema=1
+        instead of or in addition to schema to automatically meet the
+        backend requirement.
+
+   auto-schema=
+      |
+      | If true, change the schema name to tx2mon\_$X, where $X will
+        be a unique value derived from the data selection options. If
+        both schema and auto-schema=1 are given, the schema name given
+        is used as the base instead of "tx2mon".
+
+   array=
+      |
+      | For per-core data, report all array value elements if true.
+        Report only maximum and minimum values if false. The default
+        is false.
+
+   extra=
+      |
+      | For per-core data, report additional information of the
+        internal block frequencies and the set system metrics. These
+        additional values are static. If false, additional
+        information will not be reported. The default is false.
+
+METRICS
+=================
+
+The sampler_base standard metrics are included. The following data is
+reported in a set instance per socket.
+ +:: + + node Number of socket i from + /sys/bus/platform/devices/tx2mon/node_raw + +The metrics listed here are named as their respective fields in +tx2mon/mc_oper_region.h. Where applicable, metrics are converted to the +units listed here from the raw values. + +:: + + counter Snapshot counter of the cpu. + + + Include the following metrics when array=true: + freq_cpu[] Frequency reading of each core. + tmon_cpu[] Temperature reading of each core. (deg. C). + +Include the following metrics when array=false: + +:: + + freq_cpu_min Minimum value found in freq_cpu. + freq_cpu_max Maximum value found in freq_cpu. + tmon_cpu_min Minimum value found in tmon_cpu. (deg. C) + tmon_cpu_max Maximum value found in tmon_cpu. (deg. C) + +Include the following metrics unconditionally: + +:: + + tmon_soc_avg Average temperature on the SoC. (deg. C) + pwr_core Power consumed by all cores on the SoC. (Watt). + pwr_sram Power consumed by all internal SRAM on the SoC. (Watt). + pwr_mem Power consumed by the LLC ring on the SoC. (Watt) + pwr_soc Power consumed by SoC blocks that are misc. (Watt) + v_core Voltage consumed by all cores on the SoC. (V) + v_sram Voltage consumed by all internal SRAM on the SoC. (V) + v_mem Voltage consumed by the LLC ring on the SoC. (V) + v_soc Voltage consumed by SoC blocks that are misc. (V). + active_evt Provides a bit list of active events that are causing throttling. + Temperature Active event with a bit flag where 1 is true. + Power Active event with a bit flag where 1 is true. + External Active event with a bit flag where 1 is true. + Unk3 Active event with a bit flag where 1 is true. + Unk4 Active event with a bit flag where 1 is true. + Unk5 Active event with a bit flag where 1 is true. + temp_evt_cnt Total number of temperature events. + pwr_evt_cnt Total number of power events. + ext_evt_cnt Total number of exteral events. + temp_throttle_ms Time duration of all temperature events in ms. + pwr_throttle_ms Time duration of all power events in ms. + ext_throttle_ms Time duration of all external events in ms. + cpu_num Which processor the data comes from. + +Include the following metrics with extra=true: + +:: + + temp_abs_max Absolute maximum limit of temperature beyond + which the SoC will throttle voltage and frequency. + temp_soft_thresh Soft limit of temperature beyond which the SoC will + throttle voltage and frequency down. + temp_hard_thresh Hard limit of temperature beyond which the SoC will + throttle voltage and frequency down. + freq_mem_net Frequency reading of the SoC and ring connection. + freq_max Maximum limit of SoC frequency. Depends on the SKU. + freq_min Minimum limit of SoC frequency. Depends on the SKU. + freq_socs Internal block frequency of SOC South clock. (Mhz) + freq_socn Internal block frequency of SOC North clock. (Mhz) + +EXAMPLES +================== + +Within ldmsd_controller or a configuration file: + +:: + + load name=tx2mon + config name=tx2mon producer=vm1_1 component_id=1 instance=vm1_1/tx2mon + start name=tx2mon interval=1000000 + +NOTES +=============== + +By default, root privilege is required to read the data files produced +by tx2mon_kmod. The kernel module tx2mon_kmod must be loaded, e.g. by +"modprobe /lib/modules/$(uname -r)/extra/tx2mon_kmod.ko". + +The current generated schema names are: tx2mon, tx2mon_01, +tx2mon_11\_$n_core, and tx2mon_10\_$n_core, where the suffix is derived +as \_(array)(extra)[\_ncore]. "tx2mon" is used when tx2mon_00 would +occur. If present, $n_core is the size of the array metrics. 
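+
+As a worked example of this naming rule, assume a part whose per-core
+arrays have 28 elements:
+
+::
+
+   array=true,  extra=true   ->  tx2mon_11_28
+   array=true,  extra=false  ->  tx2mon_10_28
+   array=false, extra=true   ->  tx2mon_01
+   array=false, extra=false  ->  tx2mon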
+ +There is additional power consumed by cross-socket interconnect, PCIe, +DDR and other IOs that is not currently reported by this tool. + +tx2mon reports on the sensors monitored by the on-chip management +controller. Some of the on-chip components (such as the IO blocks) do +not have sensors and therefore the voltage and power measurements of +these blocks are not provided by tx2mon. + +On systems that are not arm 64 (aarch64 from uname), the sampler does +nothing. On systems that are aarch64 but missing +/sys/bus/platform/devices/tx2mon, the sampler issues an error about the +missing tx2mon kernel module. + +SEE ALSO +================== + +ldmsd(8), ldms_sampler_base + +:: diff --git a/rtd/docs/source/sampler_man/Plugin_variable.rst b/rtd/docs/source/sampler_man/Plugin_variable.rst new file mode 100644 index 000000000..c0514fbeb --- /dev/null +++ b/rtd/docs/source/sampler_man/Plugin_variable.rst @@ -0,0 +1,88 @@ +=============== +Plugin_variable +=============== + +:Date: 08 Jul 2020 + +.. contents:: + :depth: 3 +.. + +NAME +================ + +Plugin_variable - man page for the LDMS variable plugin + +SYNOPSIS +==================== + +| Within ldmsd_controller or a configuration file: +| config name=variable [ = ] + +DESCRIPTION +======================= + +The variable plugin provides test data with a periodically redefined +schema and set. Currently the period is every 4th sample. The data of +the sampler is monotonically increasing integers. The data set size +changes with each redefinition. + +CONFIGURATION ATTRIBUTE SYNTAX +========================================== + +The variable plugin does not use the sampler_base base class, but +follows the naming conventions of sampler_base except for schema and +instance name. + +**config** + | name= [schema=] + | configuration line + + name= + | + | This MUST be variable. + + schema= + | + | Optional schema name prefix. The string given will be suffixed + with an integer N in the range 1-9 to create the schema name. + The schema will also contain N integer metrics. + + instance= + | + | Optional instance name prefix. The string given will be suffixed + with an integer in the range 1-9 to create the instance name. If + not specified, will default prefix is \`$HOST/variable\`. + +NOTES +================= + +The intent of the sampler is to simulate any sampler which may under +some condition redefine the same instance name and schema name for a set +after properly retiring a different definition using the same names. It +is not for production use. + +To collect CSV data from this sampler, configure 9 store policies +matching ${schema}[1-9], since the current storage policy mechanism does +not allow matching multiple schemas. + +BUGS +================ + +No known bugs. + +EXAMPLES +==================== + +Within ldmsd_controller or a configuration file: + +:: + + load name=variable + config name=variable producer=vm1_1 instance=vm1_1/variable + start name=variable interval=1000000 + +SEE ALSO +==================== + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7) diff --git a/rtd/docs/source/sampler_man/Plugin_variorum_sampler.rst b/rtd/docs/source/sampler_man/Plugin_variorum_sampler.rst new file mode 100644 index 000000000..43f01b5a8 --- /dev/null +++ b/rtd/docs/source/sampler_man/Plugin_variorum_sampler.rst @@ -0,0 +1,101 @@ +======================= +Plugin_variorum_sampler +======================= + +:Date: 27 Jun 2022 + +.. contents:: + :depth: 3 +.. 
+
+NAME
+========================
+
+Plugin_variorum_sampler - man page for the LDMS Variorum plugin
+
+SYNOPSIS
+============================
+
+| Within ldmsd_controller or a configuration file:
+| config name=variorum_sampler [common attributes]
+
+DESCRIPTION
+===============================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The variorum_sampler plugin provides power data
+using the JSON API in Variorum, a vendor-neutral library that provides
+access to low-level hardware knobs. The sampler, when configured,
+automatically detects the number of sockets on the host machine and
+then provides, for each socket, an LDMS record containing power data.
+For each socket, the values provided are: node power consumption in
+Watts (identical across sockets); socket ID number; CPU power
+consumption in Watts; GPU power consumption in Watts (aggregated
+across all GPUs on the socket, and reported as -1 on unsupported
+platforms); and memory power consumption in Watts.
+
+The variorum sampler depends on Variorum 0.6.0 or higher and Jansson.
+The sampler cannot be built without these libraries. If either library
+is installed in a non-standard location, paths to the respective
+install directories should be provided to Autoconf using the
+--with-libjansson-prefix and/or --with-libvariorum-prefix flag.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+==================================================
+
+The variorum sampler plugin uses the sampler_base base class. This man
+page covers only the configuration attributes specific to this plugin,
+or those with default values; see ldms_sampler_base(7) for the
+attributes of the base class.
+
+**config**
+   | name= [schema=]
+   | configuration line
+
+   name=
+      |
+      | This MUST be variorum_sampler.
+
+   schema=
+      |
+      | Optional schema name. It is intended that the same sampler on
+        different nodes with different metrics have a different
+        schema. If not specified, will default to
+        \`variorum_sampler\`.
+
+BUGS
+========================
+
+No known bugs; however, if Variorum cannot access the hardware knobs,
+the sampler will be unable to access any data. This will result in an
+error being printed to the log file: "variorum_sampler: unable to
+obtain JSON object data". This error can be resolved by ensuring that
+hardware knob access is enabled using the requirements here:
+https://variorum.readthedocs.io/en/latest/HWArchitectures.html
+
+EXAMPLES
+============================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=variorum_sampler
+   config name=variorum_sampler producer=vm1_1 instance=vm1_1/variorum_sampler
+   start name=variorum_sampler interval=1000000
+
+AUTHORS
+===========================
+
+Jessica Hannebert (Colorado College, internship at Lawrence Livermore
+National Laboratory). Tapasya Patki (Lawrence Livermore National
+Laboratory). Kathleen Shoga (Lawrence Livermore National Laboratory).
+Stephanie Brink (Lawrence Livermore National Laboratory). Barry
+Rountree (Lawrence Livermore National Laboratory).
+
+SEE ALSO
+============================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)
diff --git a/rtd/docs/source/sampler_man/Plugin_vmstat.rst b/rtd/docs/source/sampler_man/Plugin_vmstat.rst
new file mode 100644
index 000000000..bbfd2f0ee
--- /dev/null
+++ b/rtd/docs/source/sampler_man/Plugin_vmstat.rst
@@ -0,0 +1,70 @@
+=============
+Plugin_vmstat
+=============
+
+:Date: 04 Dec 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+==============
+
+Plugin_vmstat - man page for the LDMS vmstat plugin
+
+SYNOPSIS
+==================
+
+| Within ldmsd_controller or in a configuration file
+| config name=vmstat [ <attribute>=<value> ]
+
+DESCRIPTION
+=====================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The vmstat plugin provides info from /proc/vmstat.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+========================================
+
+The vmstat plugin uses the sampler_base base class. This man page
+covers only the configuration attributes specific to this plugin, or
+those with default values; see ldms_sampler_base(7) for the attributes
+of the base class.
+
+**config**
+   | name= [schema=]
+   | configuration line
+
+   name=
+      |
+      | This MUST be vmstat.
+
+   schema=
+      |
+      | Optional schema name. It is intended that the same sampler on
+        different nodes with different metrics have a different
+        schema. If not specified, will default to \`vmstat\`.
+
+BUGS
+==============
+
+No known bugs.
+
+EXAMPLES
+==================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=vmstat
+   config name=vmstat producer=vm1_1 instance=vm1_1/vmstat
+   start name=vmstat interval=1000000
+
+SEE ALSO
+==================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)
diff --git a/rtd/docs/source/sampler_man/index.rst b/rtd/docs/source/sampler_man/index.rst
new file mode 100644
index 000000000..b19f60e71
--- /dev/null
+++ b/rtd/docs/source/sampler_man/index.rst
@@ -0,0 +1,8 @@
+Sampler Man Pages
+=================
+
+.. toctree::
+   :maxdepth: 1
+   :glob:
+
+   *
diff --git a/rtd/docs/source/sampler_man/ldms-ibnet-sampler-gen.rst b/rtd/docs/source/sampler_man/ldms-ibnet-sampler-gen.rst
new file mode 100644
index 000000000..349134bae
--- /dev/null
+++ b/rtd/docs/source/sampler_man/ldms-ibnet-sampler-gen.rst
@@ -0,0 +1,116 @@
+======================
+ldms-ibnet-sampler-gen
+======================
+
+:Date: 4 June 2020
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+=======================
+
+ldms-ibnet-sampler-gen - man page for the LDMS ibnet plugin support
+utility
+
+ldms-get-opa-network.sh - man page for the LDMS ibnet plugin support
+utility
+
+SYNOPSIS
+===========================
+
+ldms-ibnet-sampler-gen --samplers HOSTFILE --out OUTPREFIX [options]
+
+DESCRIPTION
+==============================
+
+The ldms-ibnet-sampler-gen script produces files splitting the ports
+in the netfile among the hosts listed in the samplers file. The input
+is expected to be the network dump of an approximately three-level FAT
+tree.
+
+OPTIONS
+==========================
+
+::
+
+   -h, --help           show the help message and exit
+   --out OUTPREFIX      prefix of output files
+   --net IBNDPFILE      file name of output collected from 'ibnetdiscover -p'
+   --opa OPAFILE        file name of output collected from 'ldms-get-opa-network.sh'
+   --samplers HOSTFILE  file listing samplers as named in the node name map, one per line.
+   --lidnames           dump lid,name map to stdout and exit.
+   --annotate           annotate output sampler assignment files with node-name-map
+                        strings and lists of unassigned switch ports.
+   --sharp PORT         port to exclude in topology calculations (for sharp)
+   --tier0              generate tier0-1 graphs
+   --tier1              generate tier1-2 graphs
+   --tier2              generate tier2-3 graphs
+   --circo-tiers CIRCO_PREFIX
+                        dump circo tier plots to files starting with prefix
+                        given CIRCO_PREFIX.
+   --sfdp-tiers SFDP_PREFIX
+                        dump sfdp tier plots to files starting with prefix
+                        given SFDP_PREFIX.
+   --info               print key intermediate results
+   --debug              print miscellaneous debug messages
+   --dump_sw            print switches parsed
+   --dump_ca            print HCA list parsed
+   --dump_links         print links parsed
+   --dump_tiers         print tiers discovered
+   --dump_parse         print parser debugging
+
+EXAMPLES
+===========================
+
+::
+
+   cat << EOF > cluster-samplers
+   admin1 qib0
+   admin2 qib0
+   admin3 qib0
+   EOF
+
+   ibnetdiscover -p > cluster-p-netdiscover
+
+   # check lids for being parsed right
+   ldms-ibnet-sampler-gen --lidnames --net cluster-p-netdiscover --samplers x --out x |
+     sort -k2 -t, > lid.host.txt
+
+   ldms-ibnet-sampler-gen --net cluster-p-netdiscover --samplers cluster-samplers --sharp 37 --annotate --out sbx
+
+::
+
+   cat << EOF > cluster-samplers
+   admin1 hfi1_0
+   admin2 hfi1_0
+   admin3 hfi1_0
+   EOF
+
+   ldms-get-opa-network.sh > cluster-opa-map
+
+   # check lids for being parsed right
+   ldms-ibnet-sampler-gen --lidnames --opa cluster-opa-map --samplers cluster-samplers --out x | sort -k2 -t, > lid.host.txt
+
+   ldms-ibnet-sampler-gen --opa cluster-opa-map --samplers cluster-samplers --out swx
+
+NOTES
+========================
+
+A Mellanox SHARP port appears as an HCA in a switch. Connections on
+the sharp port should be ignored for topology decomposition and
+sampler load balancing purposes, as they usually make the topology
+flat if included.
+
+This program does not directly invoke infiniband or omnipath
+utilities. It does invoke (and require) graphviz utilities if the
+tier, circo, or sfdp options are applied.
+
+Applying the --node-name-map option to ibnetdiscover when generating
+the net file makes the results more readable.
+
+SEE ALSO
+===========================
+
+Plugin_ibnet(7), circo, dot, ldms-get-opa-network, ibnetdiscover
diff --git a/rtd/docs/source/sampler_man/ldms-netlink-notifier.rst b/rtd/docs/source/sampler_man/ldms-netlink-notifier.rst
new file mode 100644
index 000000000..806adacf5
--- /dev/null
+++ b/rtd/docs/source/sampler_man/ldms-netlink-notifier.rst
@@ -0,0 +1,4 @@
+.. contents::
+   :depth: 3
+..
+
diff --git a/rtd/docs/source/sampler_man/ldms-notify.rst b/rtd/docs/source/sampler_man/ldms-notify.rst
new file mode 100644
index 000000000..806adacf5
--- /dev/null
+++ b/rtd/docs/source/sampler_man/ldms-notify.rst
@@ -0,0 +1,4 @@
+.. contents::
+   :depth: 3
+..
+
diff --git a/rtd/docs/source/sampler_man/ldms-sensors-config.rst b/rtd/docs/source/sampler_man/ldms-sensors-config.rst
new file mode 100644
index 000000000..f6ac04a51
--- /dev/null
+++ b/rtd/docs/source/sampler_man/ldms-sensors-config.rst
@@ -0,0 +1,94 @@
+===================
+ldms-sensors-config
+===================
+
+:Date: 15 Dec 2018
+
+.. contents::
+   :depth: 3
+..
+ +NAME +==================== + +ldms-sensors-config - generate LDMS filesingle plugin configuration +prototype + +SYNOPSIS +======================== + +ldms-sensors-config [--sensors=/path/to/sensors] +[--lscpu=/path/to/lscpu] [--test-lscpu=lscpu-log-file] +[--test-sensors=sensors-log-file] + +Run 'sensors' under strace to discover where some sensor files live on +the current system and generate a draft metric configuration file for +the LDMS filesingle sampler. + +DESCRIPTION +=========================== + +The ldms-sensors-config program generates a draft conf file for the +filesingle sampler. The user should tailor the selection, naming, data +storage type, and default values per Plugin_filesingle(7). + +OPTIONS +======================= + +--sensors= + | + | specify an alternate location of the sensors program. The default + is /usr/bin/sensors, and the PATH variable is not used to search + for alternatives. + +--nodash + | + | Replace all - characters in metric names with \_ characters. + +--lscpu= + | + | specify an alternate location of the lscpu program. The default is + /usr/bin/lscpu and the PATH variable is not used to search for + alternatives. + +--test-lscpu= + | + | Specify the location of a pre-collected strace log of lscpu to use + instead of lscpu run on the local system. Used for testing or + remote configuration. + +--test-sensors= + | + | Specify the location of a pre-collected strace log of sensors to + use instead of sensors run on the local system. Used for testing or + remote configuration. + +EXAMPLES +======================== + +The log file for sensors can be collected with: + +script -c 'strace -e trace=open,openat,read sensors -u' sensors.log + +The log file for lscpu can be collected with: + +script -c 'strace -e trace=open,openat lscpu' /tmp/lscpu.tmp \| grep +'^open.*cpuinfo_max_freq' > lscpu.log; rm /tmp/lscpu.tmp + +NOTES +===================== + +When using test input file(s), the live system data will be used if the +corresponding test file is not specified. + +Systems (kernels) lacking cpu frequency reporting produce no output from +lscpu. + +The use of --nodash is recommended for compatibility with downstream +analysis tools. White space appearing in metric names is unconditionally +transformed to \_. + +SEE ALSO +======================== + +sensors(1), lscpu(1), Plugin_filesingle(7), ldmsd. diff --git a/rtd/docs/source/sampler_man/ldms_dstat_schema_name.rst b/rtd/docs/source/sampler_man/ldms_dstat_schema_name.rst new file mode 100644 index 000000000..01ee3748c --- /dev/null +++ b/rtd/docs/source/sampler_man/ldms_dstat_schema_name.rst @@ -0,0 +1,49 @@ +====================== +ldms_dstat_schema_name +====================== + +:Date: 17 Nov 2020 + +.. contents:: + :depth: 3 +.. + +NAME +======================= + +ldms_dstat_schema_name - man page for the LDMS dstat plugin support +utility + +SYNOPSIS +=========================== + +ldms_dstat_schema_name + +DESCRIPTION +============================== + +The dstat plugin optionally generates a schema name including a short +hash of certain configuration data. ldms_dstat_schema_name provides the +user with the schema name the dstat plugin will generate for the given +options. + +CONFIGURATION ATTRIBUTE SYNTAX +================================================= + +See Plugin_dstat(7). 
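
Since the generated name varies with the options given, scripts that
configure stores can capture it rather than hard-coding it. The
following is a minimal sketch (assuming a POSIX shell, the utility on
PATH, and a hypothetical policy name):

::

    # capture the schema name the dstat plugin will generate
    SCHEMA=$(ldms_dstat_schema_name auto-schema=1 fd=1)

    # emit a matching storage policy line for ldmsd_controller
    echo "strgp_add name=dstat_policy plugin=store_sos container=dstat schema=$SCHEMA"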

EXAMPLES
===========================

::

    ldms_dstat_schema_name auto-schema=1 fd=1

    yields

    dstat_10

SEE ALSO
===========================

Plugin_dstat(7) diff --git a/rtd/docs/source/sampler_man/ldms_ibnet_schema_name.rst b/rtd/docs/source/sampler_man/ldms_ibnet_schema_name.rst new file mode 100644 index 000000000..b340837da --- /dev/null +++ b/rtd/docs/source/sampler_man/ldms_ibnet_schema_name.rst @@ -0,0 +1,76 @@
======================
ldms_ibnet_schema_name
======================

:Date: 4 June 2020

.. contents::
   :depth: 3
..

NAME
=======================

ldms_ibnet_schema_name - man page for the LDMS ibnet plugin support
utility

SYNOPSIS
===========================

ldms_ibnet_schema_name <plugin config options>

DESCRIPTION
==============================

The ibnet plugin generates a schema name including a hash of certain
configuration data. ldms_ibnet_schema_name provides the user with the
resulting name before running ldmsd so that store plugins can be
configured.

CONFIGURATION ATTRIBUTE SYNTAX
=================================================

See Plugin_ibnet(7).

EXAMPLES
===========================

::

    ldms_ibnet_schema_name node-name-map=/path/map timing=2 metric-conf=/path/metricsubsets schema=myibnet

    when file /path/metricsubsets contains

    extended
    xmtsl
    rcvsl
    xmtdisc
    rcverr
    oprcvcounters
    flowctlcounters
    vloppackets
    vlopdata
    vlxmitflowctlerrors/t
    vlxmitcounters/t
    swportvlcong
    rcvcc/t
    slrcvfecn
    slrcvbecn
    xmitcc/t
    vlxmittimecc
    smplctl/t

    yields

    myibnet_7fffe_tn

NOTES
========================

If the timing option is greater than 0, the name of the overall timing
set will be as for the result given with "\_timing" appended.

SEE ALSO
===========================

Plugin_ibnet(7) diff --git a/rtd/docs/source/sampler_man/ldms_rdc_schema_name.rst b/rtd/docs/source/sampler_man/ldms_rdc_schema_name.rst new file mode 100644 index 000000000..562696011 --- /dev/null +++ b/rtd/docs/source/sampler_man/ldms_rdc_schema_name.rst @@ -0,0 +1,63 @@
====================
ldms_rdc_schema_name
====================

:Date: 2 April 2021

.. contents::
   :depth: 3
..

NAME
=====================

ldms_rdc_schema_name - man page for the LDMS rdc_sampler plugin support
utility

SYNOPSIS
=========================

| ldms_rdc_schema_name -h
| ldms_rdc_schema_name [-d] <plugin config options>

DESCRIPTION
============================

The rdc_sampler plugin generates a schema name including a hash of
certain configuration data. ldms_rdc_schema_name provides the user with
the resulting name before running ldmsd so that store plugins can be
configured.

CONFIGURATION ATTRIBUTE SYNTAX
===============================================

See Plugin_rdc_sampler(7).

EXAMPLES
=========================

::

    # ldms_rdc_schema_name -h


    # ldms_rdc_schema_name metrics=base schema=myrdc_sampler | grep -v ERROR
    myrdc_sampler_51dcba58

    # ldms_rdc_schema_name metrics=xgmi
    rdc_sampler_device_e3e41d59

    # ldms_rdc_schema_name -d metrics=xgmi


NOTES
======================

The rdc libraries loaded by the plugin and the program may emit
inconsequential error messages to stdout. One such begins with
" ERROR RdcLibraryLoader.cc".
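
Because the inconsequential library messages go to stdout, scripted use
of the utility typically filters them before consuming the name, as the
examples above do. A minimal sketch (POSIX shell assumed; the policy
name is hypothetical):

::

    # capture the schema name, discarding rdc library noise
    SCHEMA=$(ldms_rdc_schema_name metrics=base schema=myrdc_sampler | grep -v ERROR)

    # use it to configure a matching storage policy
    echo "strgp_add name=rdc_policy plugin=store_sos container=rdc schema=$SCHEMA"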

SEE ALSO
=========================

Plugin_rdc_sampler(7) diff --git a/rtd/docs/source/sampler_man/netlink-notifier.rst b/rtd/docs/source/sampler_man/netlink-notifier.rst new file mode 100644 index 000000000..11c708367 --- /dev/null +++ b/rtd/docs/source/sampler_man/netlink-notifier.rst @@ -0,0 +1,205 @@
================
netlink-notifier
================

:Date: 25 June 2021

.. contents::
   :depth: 3
..

NAME
=================

ldms-netlink-notifier - Transmit Linux kernel netlink process life
messages to ldmsd streams.

ldms-notify - systemd service

SYNOPSIS
=====================

ldms-netlink-notifier [OPTION...]

DESCRIPTION
========================

The netlink-notifier generates JSON messages for ldmsd and JSON aware
LDMS samplers. Its messages are mostly compatible with those from the
slurm spank based notifier.

OPTIONS
====================

::

    -c           use task comm field for process name.
    -d           strip off directory path from process name.
    -D           specify run duration in seconds. If unspecified, run forever.
    -e           select which events to monitor.
    -E           equivalent to -e all.
    -g           show glyphs for event types in debug mode.
    -h           show this help.
    -i seconds   time (float) to sleep between checks for processes exceeding the short dir filter time.
                 If the -i value > the -m value, -i may effectively filter out additional processes.
    -j file      file to log json messages and transmission status.
    -l           force stdout line buffering.
    -L file      log to file instead of stdout.
    -r           run with real time FIFO scheduler (available on some kernels).
    -s           show short process name in debugging.
    -S           suppress stream message publication.
    -t           show debugging trace messages.
    -u umin      ignore processes with uid < umin
    -v lvl       log level for stream library messages. Higher is quieter. Error messages are >= 3.
    -q           run quietly
    -x           show extra process information.
    -X           equivalent to -Egrx.
    The ldmsd connection and commonly uninteresting or short-lived processes may be specified with the options or environment variables below.
    The 'short' options do not override the options that exclude processes entirely.
    --exclude-programs[=] change the default value of exclude-programs
                 When repeated, all values are concatenated.
                 If given with no value, the default (nullexe): is removed.
                 If not given, the default is used unless
                 the environment variable NOTIFIER_EXCLUDE_PROGRAMS is set.
    --exclude-dir-path[=] change the default value of exclude-dir-path
                 When repeated, all values are concatenated.
                 If given with no value, the default /sbin is removed.
                 If not given, the default is used unless
                 the environment variable NOTIFIER_EXCLUDE_DIR_PATH is set.
    --exclude-short-path[=] change the default value of exclude-short-path
                 When repeated, all values are concatenated.
                 If given with no value, the default /bin:/usr is removed.
                 If not given, the default is used unless
                 the environment variable NOTIFIER_EXCLUDE_SHORT_PATH is set.
    --exclude-short-time[=][val] change the default value of exclude-short-time.
                 If repeated, the last value given wins.
                 If given with no value, the default 1 becomes 0 unless
                 the environment variable NOTIFIER_EXCLUDE_SHORT_TIME is set.
    --stream[=] change the default value of stream.
                 If repeated, the last value given wins.
                 The default slurm is used if env NOTIFIER_LDMS_STREAM is not set.
    --xprt[=] change the default value of xprt.
                 If repeated, the last value given wins.
                 The default sock is used if env NOTIFIER_LDMS_XPRT is not set.
    --host[=] change the default value of host.
                 If repeated, the last value given wins.
                 The default localhost is used if env NOTIFIER_LDMS_HOST is not set.
    --port[=] change the default value of port.
                 If repeated, the last value given wins.
                 The default 411 is used if env NOTIFIER_LDMS_PORT is not set.
    --auth[=] change the default value of auth.
                 If repeated, the last value given wins.
                 The default munge is used if env NOTIFIER_LDMS_AUTH is not set.
    --reconnect[=] change the default value of reconnect.
                 If repeated, the last value given wins.
                 The default 600 is used if env NOTIFIER_LDMS_RECONNECT is not set.
    --timeout[=] change the default value of timeout.
                 If repeated, the last value given wins.
                 The default 1 is used if env NOTIFIER_LDMS_TIMEOUT is not set.
    --track-dir[=] change the pids published directory.
                 The default is used if env NOTIFIER_TRACK_DIR is not set.
                 The path given should be on a RAM-based file system for efficiency,
                 and it should not contain any files except those created by
                 this daemon. When enabled, track-dir will be populated even if
                 -S is used to suppress the stream output.
    --component_id= set the value of component_id.
                 If not set, the component_id field is not included in the stream formats produced.
    --ProducerName= set the value of ProducerName
                 If not set, the ProducerName field is not included in the stream formats produced.

ENVIRONMENT
========================

The following variables override defaults if a command line option is
not present, as described in the options section.

::

    NOTIFIER_EXCLUDE_PROGRAMS="(nullexe):"
    NOTIFIER_EXCLUDE_DIR_PATH=/sbin
    NOTIFIER_EXCLUDE_SHORT_PATH=/bin:/usr
    NOTIFIER_EXCLUDE_SHORT_TIME=1
    NOTIFIER_TRACK_DIR=/var/run/ldms-netlink-tracked
    NOTIFIER_LDMS_RECONNECT=600
    NOTIFIER_LDMS_TIMEOUT=1
    NOTIFIER_LDMS_STREAM=slurm
    NOTIFIER_LDMS_XPRT=sock
    NOTIFIER_LDMS_HOST=localhost
    NOTIFIER_LDMS_PORT=411
    NOTIFIER_LDMS_AUTH=munge

Omitting (nullexe): from NOTIFIER_EXCLUDE_PROGRAMS may cause
incomplete output related to processes no longer present. In exotic
circumstances, this may be desirable anyway.

FILES
==================

Users or other processes may discover which processes are the subject of
notifications by examining the files in

$NOTIFIER_TRACK_DIR/\*

For each pid started event which would be emitted to an LDMS stream, a
temporary file with the name of the pid is created in
NOTIFIER_TRACK_DIR. The file will contain the json event attempted. The
temporary file will be removed when the corresponding pid stopped event
is sent. These files are not removed when the notifier daemon exits.
Client applications may validate a file by checking the contents against
the /proc/$pid/stat content, if it exists. Invalid files should be
removed by clients or system scripts.

NOTES
==================

The core of this utility is derived from forkstat(8).

The output of this utility, if used to drive a sampler, usually needs to
be consumed on the same node.

If not used with a sampler, the --component_id or --ProducerName options
are needed to add a node identifier to the messages. Normally a
process-following sampler that creates sets will add the node identifier
automatically.

Options are still in development. Several options affect only the trace
output.
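
As the FILES section notes, stale tracking files can be detected by
checking for the corresponding /proc entry. A minimal cleanup sketch
(POSIX shell assumed; uses the default track directory from the
ENVIRONMENT section):

::

    for f in /var/run/ldms-netlink-tracked/*; do
        [ -e "$f" ] || continue
        pid=$(basename "$f")
        # if the process is gone, the tracked file is stale and may be removed
        [ -e "/proc/$pid/stat" ] || rm -f "$f"
    done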

EXAMPLES
=====================

Run for 30 seconds with screen and json.log test output connecting to
the ldmsd from 'ldms-static-test.sh blobwriter' test:

::

    netlink-notifier -t -D 30 -g -u 1 -x -e exec,clone,exit \
     -j json.log --exclude-dir-path=/bin:/sbin:/usr \
     --port=61061 --auth=none --reconnect=1

Run in a typical deployment (sock, munge, port 411, localhost, forever,
10 minute reconnect):

::

    netlink-notifier

Run in a systemd .service wrapper, excluding root owned processes:

::

    EnvironmentFile=-/etc/sysconfig/ldms-netlink-notifier.conf
    ExecStart=/usr/sbin/ldms-netlink-notifier -u 1 -x -e exec,clone,exit

Run in a systemd .service wrapper, excluding root owned processes, with
debugging files:

::

    EnvironmentFile=-/etc/sysconfig/ldms-netlink-notifier.conf
    ExecStart=/usr/sbin/ldms-netlink-notifier -u 1 -x -e exec,clone,exit -j /home/user/nl.json -L /home/user/nl.log -t --ProducerName=%H

SEE ALSO
=====================

forkstat(8), ldmsd(8), ldms-static-test(8) diff --git a/rtd/docs/source/store_man/Plugin_avro_kafka_store.rst b/rtd/docs/source/store_man/Plugin_avro_kafka_store.rst new file mode 100644 index 000000000..a847bf240 --- /dev/null +++ b/rtd/docs/source/store_man/Plugin_avro_kafka_store.rst @@ -0,0 +1,335 @@
=======================
Plugin_avro_kafka_store
=======================

:Date: 30 Mar 2023

.. contents::
   :depth: 3
..

NAME
=========================

avro_kafka_store - LDMSD avro_kafka_store plugin

SYNOPSIS
=============================

**config** **name=avro_kafka_store** **producer=PRODUCER**
**instance=INSTANCE** [ **topic=\ TOPIC_FMT** ] [ **encoding=\ JSON** ]
[ **encoding=\ AVRO** ] [ **kafka_conf=\ PATH** ] [
**serdes_conf=\ PATH** ]

DESCRIPTION
================================

**``avro_kafka_store``** implements a decomposition capable LDMS metric
data store. The **``avro_kafka_store``** plugin does not implement the
**``store``** function and must only be used with decomposition.

The plugin operates in one of two modes: *JSON* and *AVRO* (the
default). In *JSON* mode, each row is encoded as a JSON formatted text
string. In *AVRO* mode, each row is associated with an AVRO schema and
serialized using an AVRO Serdes.

When in *AVRO* mode, the plugin manages schemas in cooperation with an
Avro Schema Registry. The location of this registry is specified in a
configuration file or optionally on the **``config``** command line.

CONFIG OPTIONS
===================================

mode
   A string indicating the encoding mode: "JSON" will encode messages in
   JSON format, "AVRO" will encode messages using a schema and Avro
   Serdes. The default is "AVRO". The mode values are not case
   sensitive.

name
   Must be avro_kafka_store.

kafka_conf
   A path to a configuration file in Java property format. This
   configuration file is parsed and used to configure the Kafka
   kafka_conf_t configuration object. The format of this file and the
   supported attributes are available here:
   https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md.

serdes_conf
   A path to a configuration file in Java property format. This
   configuration file is parsed and used to configure the Avro Serdes
   serdes_conf_t configuration object. The only supported option for
   this file is serdes.schema.url.

TOPIC NAMES
===============================

The topic name to which messages are published is defined by the
**topic** configuration parameter.
The parameter specifies a string that
is a *format specifier* similar to a printf() format string. If the
**topic** is not specified, it defaults to "%S" which is the format
specifier for the set schema name.

The '%' character introduces a *format specifier* that will be
substituted in the topic format string to create the topic name. The
format specifiers are as follows:

%F
   The format in which the message is serialized: "json" or "avro".

%S
   The set parameter's *schema* name.

%I
   The instance name of the set, e.g. "orion-01/meminfo".

%P
   The set parameter's *producer* name, e.g. "orion-01".

%u
   The user-name string for the owner of the set. If the user-name is
   not known on the system, the user-id is used.

%U
   The user-id (uid_t) for the owner of the set.

%g
   The group-name string for the group of the set. If the group-name is
   not known on the system, the group-id is used.

%G
   The group-id (gid_t) for the group of the set.

%a
   The access/permission bits for the set formatted as a string, e.g.
   "-rw-rw----".

%A
   The access/permission bits for the set formatted as an octal number,
   e.g. 0440.

Note that a topic name must only consist of a combination of the
characters [a-zA-Z0-9\\.\_\\-]. In order to ensure that the format
specifier above will not produce invalid topic names, any character that
results from a format specifier substitution that is not in the valid
list will be substituted with a '.'.

STRGP
=========================

The avro_kafka_store is used with a storage policy that specifies
avro_kafka_store as the plugin parameter.

The *schema*, *instance*, *producer* and *flush* strgp_add parameters
have no effect on how data is stored. If the *container* parameter is
set to any value other than an empty string, it will override the
bootstrap.servers Kafka configuration parameter in the kafka_conf file
if present.

JSON Mode
=============================

JSON mode encodes messages as self describing text objects. Each message
is a JSON dictionary based on the following template:

::

    {
      "<attr-name>" : <attr-value>,
      "<attr-name>" : <attr-value>,
      ...
    }

Each row in the decomposition is encoded as shown. The **attr-value**
types are mapped to either quoted strings, floating-point, or integers
as defined by the source metric type in the LDMS metric set.
The mapping
is as follows:

+------------------+----------------------+------------------------+
| **Metric Type**  | **Format Specifier** | **Description**        |
+------------------+----------------------+------------------------+
| LDMS_V_TIMESTAMP | %u.%06u              | Floating point number  |
|                  |                      | in seconds             |
+------------------+----------------------+------------------------+
| LDMS_V_U8        | %hhu                 | Unsigned integer       |
+------------------+----------------------+------------------------+
| LDMS_V_S8        | %hhd                 | Signed integer         |
+------------------+----------------------+------------------------+
| LDMS_V_U16       | %hu                  | Unsigned integer       |
+------------------+----------------------+------------------------+
| LDMS_V_S16       | %hd                  | Signed integer         |
+------------------+----------------------+------------------------+
| LDMS_V_U32       | %u                   | Unsigned integer       |
+------------------+----------------------+------------------------+
| LDMS_V_S32       | %d                   | Signed integer         |
+------------------+----------------------+------------------------+
| LDMS_V_U64       | %lu                  | Unsigned integer       |
+------------------+----------------------+------------------------+
| LDMS_V_S64       | %ld                  | Signed integer         |
+------------------+----------------------+------------------------+
| LDMS_V_FLOAT     | %.9g                 | Floating point         |
+------------------+----------------------+------------------------+
| LDMS_V_DOUBLE    | %.17g                | Floating point         |
+------------------+----------------------+------------------------+
| LDMS_V_STRING    | "%s"                 | Double quoted string   |
+------------------+----------------------+------------------------+
| LDMS_V_ARRAY_xxx | [ v0, v1, ... ]      | Comma separated value  |
|                  |                      | list surrounded by     |
|                  |                      | '[]'                   |
+------------------+----------------------+------------------------+

Example JSON Object
-------------------

{"timestamp":1679682808.001751,"component_id":8,"dev_name":"veth1709f8b","rx_packets":0,"rx_err_packets":0,"rx_drop_packets":0,"tx_packets":858,"tx_err_packets":0,"tx_drop_packets":0}

Avro Mode
=============================

In Avro mode, LDMS metric set values are first converted to Avro values.
The table below describes how each LDMS metric set value is represented
by an Avro value.

Each row in the decomposition is encoded as a sequence of Avro values.
The target Avro type is governed by the Avro schema.
The mapping is as
follows:

+-------------------+---------------+--------------------------------+
| **Metric Type**   | **Avro Type** | **Description**                |
+-------------------+---------------+--------------------------------+
| LDMS_V_TIMESTAMP  | AVRO_INT32    | Seconds portion of timestamp   |
|                   |               | value is stored in the Avro    |
|                   |               | integer                        |
+-------------------+---------------+--------------------------------+
| LDMS_V_TIMESTAMP  | AVRO_INT64    | tv_secs \* 1000 + tv_usecs /   |
|                   |               | 1000 is stored in Avro long    |
|                   |               | integer                        |
+-------------------+---------------+--------------------------------+
| LDMS_V_TIMESTAMP  | AVRO_RECORD   | Seconds portion is stored in   |
|                   |               | seconds portion of record,     |
|                   |               | usecs is stored in the         |
|                   |               | micro-seconds portion of the   |
|                   |               | record                         |
+-------------------+---------------+--------------------------------+
| LDMS_V_U8         | AVRO_INT32    | avro_value_set_int             |
+-------------------+---------------+--------------------------------+
| LDMS_V_S8         | AVRO_INT32    | avro_value_set_int             |
+-------------------+---------------+--------------------------------+
| LDMS_V_U16        | AVRO_INT32    | avro_value_set_int             |
+-------------------+---------------+--------------------------------+
| LDMS_V_S16        | AVRO_INT32    | avro_value_set_int             |
+-------------------+---------------+--------------------------------+
| LDMS_V_U32        | AVRO_INT64    | avro_value_set_long            |
+-------------------+---------------+--------------------------------+
| LDMS_V_S32        | AVRO_INT32    | avro_value_set_int             |
+-------------------+---------------+--------------------------------+
| LDMS_V_U64        | AVRO_INT64    | avro_value_set_long            |
+-------------------+---------------+--------------------------------+
| LDMS_V_S64        | AVRO_INT64    | avro_value_set_long            |
+-------------------+---------------+--------------------------------+
| LDMS_V_FLOAT      | AVRO_FLOAT    | avro_value_set_float           |
+-------------------+---------------+--------------------------------+
| LDMS_V_DOUBLE     | AVRO_DOUBLE   | avro_value_set_double          |
+-------------------+---------------+--------------------------------+
| LDMS_V_CHAR_ARRAY | AVRO_STRING   | avro_value_set_string          |
+-------------------+---------------+--------------------------------+
| LDMS_V_ARRAY_xxx  | AVRO_ARRAY    | Comma separated value list of  |
|                   |               | primitive type surrounded by   |
|                   |               | '[]'                           |
+-------------------+---------------+--------------------------------+

Schema Creation
---------------

Each row in the LDMS metric set presented for storage is used to
generate an Avro schema definition. The table above shows the Avro types
that are used to store each LDMS metric type. Note that currently, all
LDMS_V_TIMESTAMP values in a metric set are stored as the Avro logical
type "timestamp-millis" and encoded as an Avro long.

Unsigned types are currently encoded as signed types. The case that
could cause issues is LDMS_V_U64, which when encoded as AVRO_LONG may
result in a negative number. One way to deal with this is to encode
these as AVRO_BYTES[8] and let the consumer perform the appropriate
cast. This, however, seems identical to simply encoding it as a signed
long and allowing the consumer to cast the signed long to an unsigned
long.

Schema Registration
-------------------

The Avro schemas are generated from the row instances presented to the
commit() storage strategy routine. The **schema_name** that is contained
in the row instance is used to search for a serdes schema.
This name is
first searched for in a local RBT (red-black tree) and if not found, the
Avro Schema Registry is consulted. If the schema is not present in the
registry, a new Avro schema is constructed per the table above,
registered with the schema registry and stored in the local cache.

Encoding
--------

After the schema is located, constructed, and/or registered for the row,
the schema in conjunction with libserdes is used to binary encode the
Avro values for each column in the row. Once encoded, the message is
submitted to Kafka.

Client Side Decoding
--------------------

Consumers of topics encoded with libserdes will need to perform the
above procedure in reverse. The message received via Kafka will have the
schema-id present in the message header. The client will use this
schema-id to query the Schema registry for a schema. Once found, the
client will construct a serdes from the schema definition and use this
serdes to decode the message into Avro values.

EXAMPLES
=============================

kafka_conf Example File
------------------------

   ::

       # Lines beginning with '#' are considered comments.
       # Comments and blank lines are ignored.

       # Specify the location of the Kafka broker
       bootstrap.servers=localhost:9092

serdes_conf Example File
-------------------------

   ::

       # Specify the location of the Avro Schema registry. This can be overridden
       # on the strgp_add line with the "container" strgp_add option if it is
       # set to anything other than an empty string
       serdes.schema.url=https://localhost:9092

Example strgp_add command
-------------------------

   ::

       strgp_add name=aks plugin=avro_kafka_store container=kafka-broker.int:9092 decomposition=aks-decomp.conf
       strgp_start name=aks

Example plugin configuration
----------------------------

   ::

       config name=avro_kafka_store encoding=avro kafka_conf=/etc/kafka.conf serdes_conf=/etc/serdes.conf topic=ldms.%S
       strgp_start name=aks

NOTES
=========================

This man page is a work in progress.

SEE ALSO
============================

**ldmsd**\ (8), **ldmsd_controller**\ (8), **ldmsd_decomposition**\ (7),
**ldms_quickstart**\ (7) diff --git a/rtd/docs/source/store_man/Plugin_darshan_stream_store.rst b/rtd/docs/source/store_man/Plugin_darshan_stream_store.rst new file mode 100644 index 000000000..6195c1611 --- /dev/null +++ b/rtd/docs/source/store_man/Plugin_darshan_stream_store.rst @@ -0,0 +1,102 @@
============================
Plugin_darshan_stream_store
============================

:Date: 26 September 2021

.. contents::
   :depth: 3
..

NAME
============================

Plugin_darshan_stream_store - LDMS darshan_stream_store plugin

SYNOPSIS
================================

| Within ldmsd_controller or a configuration file:
| config name=darshan_stream_store [ <attribute>=<value> ]

DESCRIPTION
===================================

With LDMS (Lightweight Distributed Metric Service), plugins for the
ldmsd (ldms daemon) are configured via ldmsd_controller or a
configuration file. The darshan_stream_store plugin writes out a single
darshan json stream's data to a SOS container. The input data produced
by the LDMS darshan plugin consists of two types of messages: "MOD" for
module data and "MET" for meta data. Both message types are saved into
the same SOS container.

CONFIGURATION ATTRIBUTE SYNTAX
======================================================

**config**
   | name=darshan_stream_store path=<path> stream=<stream> [mode=<mode_t>]
   | configuration line

   name=<plugin_name>
      |
      | This MUST be darshan_stream_store.

   path=<path>
      |
      | The path to the root of the SOS container store (should be
        created by the user)

   stream=<stream>
      |
      | stream to which to subscribe.

   mode=<mode_t>
      |
      | The container permission mode for create (defaults to 0660).

INPUT JSON FORMAT
=========================================

The input json has a "type" field, and this type is used to select the
message type between module data and meta data.

A MOD darshan JSON example is shown below:

{"job_id":6582,"rank":0,"ProducerName":"nid00021","file":"N/A","record_id":6222542600266098259,"module":"POSIX","type":"MOD","max_byte":16777215,"switches":0,"cnt":1,"op":"writes_segment_0","seg":[{"off":0,"len":16777216,"dur":0.16,"timestamp":1631904596.737955}]}

A MET darshan JSON example is shown below:

Some fields are set to -1 if they don't have data for that message type.

BUGS
============================

No known bugs.

NOTES
=============================

This store is in development and may be changed at any time.

Only supports one stream.

EXAMPLES
================================

Within ldmsd_controller or a configuration file:

::

    load name=darshan_stream_store
    config name=darshan_stream_store path=/tmp/darshan_stream stream=darshanConnector

    prdcr_add name=localhost1 host=localhost type=active xprt=sock port=52001 interval=20000000
    prdcr_subscribe stream=darshanConnector regex=localhost*
    prdcr_start name=localhost1

SEE ALSO
================================

ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7),
darshan_publisher, darshan_sampler, parser.pl (has perlpod),
Plugin_darshan_cat_publisher(7) diff --git a/rtd/docs/source/store_man/Plugin_store_flatfile.rst b/rtd/docs/source/store_man/Plugin_store_flatfile.rst new file mode 100644 index 000000000..71b96fe0b --- /dev/null +++ b/rtd/docs/source/store_man/Plugin_store_flatfile.rst @@ -0,0 +1,107 @@
=====================
Plugin_store_flatfile
=====================

:Date: 24 Oct 2019

.. contents::
   :depth: 3
..

NAME
======================

Plugin_store_flatfile - man page for the LDMS store_flatfile plugin

SYNOPSIS
==========================

| Within ldmsd_controller script or a configuration file:
| load name=store_flatfile
| config name=store_flatfile path=datadir
| strgp_add plugin=store_flatfile [ <attribute>=<value> ]

DESCRIPTION
=============================

The flatfile store generates one file per metric with time, producer,
component id, and value columns separated by spaces. The file name is
$datadir/$container/$schema/$metric_name.

STRGP_ADD ATTRIBUTE SYNTAX
============================================

The strgp_add sets the policies being added. This line determines the
output files via identification of the container and schema.

**strgp_add**
   | plugin=store_flatfile name=<policy_name> schema=<schema>
     container=<container>
   | ldmsd_controller strgp_add line

   plugin=<plugin_name>
      |
      | This MUST be store_flatfile.

   name=<policy_name>
      |
      | The policy name for this strgp.

   container=<container>
      |
      | The container and the schema determine where the output files
        will be written (see path above). They also are used to match
        any action=custom configuration, e.g. node/meminfo.

   schema=<schema>
      |
      | The container and schema determine where the output files will
        be written (see path above).

NOTES
=======================

-  As of LDMS Version 4.3.3 there is a change in behavior. Previously
   there was a choice that the value associated with a metric was its
   udata, rather than the component_id. In the code the variable name
   used was 'comp_id', even though it wasn't necessarily input as such
   in the sampler. This code now explicitly gets the component_id by
   name.

-  We expect to develop additional options controlling output files and
   output file format.

-  There is no option to quote string values, handle rollover, or handle
   buffering.

-  There is a maximum of 20 concurrent flatfile stores.

BUGS
======================

-  Numeric array metrics are not presently supported.

EXAMPLES
==========================

Within ldmsd_controller or in a configuration file

::

    load name=store_flatfile
    config name=store_flatfile path=/XXX/datadir

    # log only Active from the meminfo sampler
    strgp_add name=store_flatfile_meminfo plugin=store_flatfile schema=meminfo container=flat
    strgp_prdcr_add name=store_flatfile_meminfo regex=localhost1
    strgp_metric_add name=store_flatfile_meminfo metric=Active
    strgp_start name=store_flatfile_meminfo regex=localhost1

    # log all from vmstat
    strgp_add name=store_flatfile_vmstat plugin=store_flatfile schema=vmstat container=flat
    strgp_prdcr_add name=store_flatfile_vmstat regex=localhost1
    strgp_start name=store_flatfile_vmstat regex=localhost1

SEE ALSO
==========================

ldmsd(8), ldms_quickstart(7), ldmsd_controller(8) diff --git a/rtd/docs/source/store_man/Plugin_store_slurm.rst b/rtd/docs/source/store_man/Plugin_store_slurm.rst new file mode 100644 index 000000000..674a3df1f --- /dev/null +++ b/rtd/docs/source/store_man/Plugin_store_slurm.rst @@ -0,0 +1,104 @@
==================
Plugin_store_slurm
==================

:Date: 30 Sep 2019

.. contents::
   :depth: 3
..

NAME
===================

Plugin_store_slurm - man page for the LDMSD store_slurm plugin

SYNOPSIS
=======================

Within ldmsd_controller or a configuration file: **load**
**name=store_slurm**

**config** **name=store_slurm** **path=**\ *STORE_ROOT_PATH* [
**verbosity=\ (0\ \|\ 1\ \|\ 2)** ]

**strgp_add** **name=**\ *STRGP_NAME* **plugin=store_slurm**
**container=**\ *CONTAINER* **schema=mt-slurm**

**strgp_prdcr_add** **name=**\ *STRGP_NAME* **regex=**\ *PRDCR_REGEX*

DESCRIPTION
==========================

**store_slurm** is an LDMSD storage plugin that stores job data from
**slurm_sampler** specifically, and must not be used with other data.

PLUGIN CONFIG OPTIONS
====================================

**name=store_slurm**
   This MUST be store_slurm (the name of the plugin).

**path=**\ *STORE_ROOT_PATH*
   The path to the root of the store. The SOS container for each schema
   specified by the storage policy (**strgp**) will be placed in the
   *STORE_ROOT_PATH* directory.

**verbosity=(**\ *0*\ **\|**\ *1*\ **\|**\ *2*\ **)**

   *0*
      (default) for SUMMARY verbosity level. The storage plugin only
      stores a single entry for each job.

   *1*
      for RANK verbosity level. The storage plugin stores a job data
      entry for each rank (process) in the job.

   *2*
      for TIME (the most verbose) verbosity level. The storage plugin
      stores job data entries every time the slurm_sampler set is
      updated.
      In this verbosity level, there would be many job
      entries that are identical in everything except for the timestamp.

STORAGE POLICY
=============================

An LDMSD storage plugin is like a storage driver that provides only a
storing mechanism. A storage policy (**strgp**) is the glue binding data
sets from various producers to a container of a storage plugin.

The **strgp_add** command defines a new storage policy, identified by
**name**. The **plugin** attribute tells the storage policy which
storage plugin to work with. The **schema** attribute identifies the
LDMS schema the data set of which is consumed by the storage policy. The
**container** attribute identifies a container inside the storage plugin
that will store data.

The **schema** for **store_slurm** is always *mt-slurm* as
**slurm_sampler** restricts "mt-slurm" as its schema name.

**strgp_prdcr_add** is a command to specify producers that feed data to
the storage policy.

BUGS
===================

No known bugs.

EXAMPLES
=======================

Plugin configuration + prdcr example:

   ::

       load name=store_slurm
       config name=store_slurm path=/var/store verbosity=1
       strgp_add name=slurm_strgp plugin=store_slurm container=slurm schema=mt-slurm
       strgp_prdcr_add name=slurm_strgp regex=.*

SEE ALSO
=======================

**Plugin_slurm_sampler**\ (7), **ldmsd**\ (8), **ldms_quickstart**\ (7),
**ldmsd_controller**\ (8), **ldms_sampler_base**\ (7). diff --git a/rtd/docs/source/store_man/Plugin_store_sos.rst b/rtd/docs/source/store_man/Plugin_store_sos.rst new file mode 100644 index 000000000..b4a20e59c --- /dev/null +++ b/rtd/docs/source/store_man/Plugin_store_sos.rst @@ -0,0 +1,346 @@
====================
Plugin_store_sos
====================

:Date: 21 Dec 2015

.. contents::
   :depth: 3
..

NAME
====

Plugin_store_sos - man page for the LDMS store_sos plugin

SYNOPSIS
========

| Within ldmsd_controller script:
| ldmsd_controller> load name=store_sos
| ldmsd_controller> config name=store_sos path=<path>
| ldmsd_controller> strgp_add plugin=store_sos [ <attribute>=<value> ]

DESCRIPTION
===========

With LDMS (Lightweight Distributed Metric Service), store plugins for
the ldmsd (ldms daemon) are configured via the ldmsd_controller. The
store_sos plugin is a SOS store.

To build the store_sos, build with the following flag: **--enable_sos**

STORE_SOS INIT CONFIGURATION ATTRIBUTE SYNTAX
=============================================

**config**
   | name=<plugin_name> path=<path>
   | ldmsd_controller configuration line

   name=<plugin_name>
      |
      | This MUST be store_sos.

   path=<path>
      |
      | The store will be put into a directory whose root is specified
        by the path argument. This directory must exist; the store will
        be created. The full path to the store will be
        <path>/<container>. The schema(s) determine the schemas of the
        database. Container and schema are set when the strgp is added.

STRGP_ADD ATTRIBUTE SYNTAX
==========================

The strgp_add sets the policies being added. This line identifies the
container and schema for a store.

**strgp_add**
   | plugin=store_sos name=<policy_name> schema=<schema>
     container=<container> [decomposition=<decomp_file>]
   | ldmsd_controller strgp_add line

   plugin=<plugin_name>
      |
      | This MUST be store_sos.

   name=<policy_name>
      |
      | The policy name for this strgp.

   container=<container>
      |
      | The container and schema define the store as described above
        (see path).

   schema=<schema>
      |
      | The container and schema define the store as described above
        (see path).
        You can have multiples of the same path and
        container, but with different schema (which means they will have
        different metrics) and they will be stored in the same store.

   decomposition=<decomp_file>
      |
      | Optionally use set-to-row decomposition with the specified
        configuration file in JSON format. See more about decomposition
        in ldmsd_decomposition(7).

USING SOS COMMANDS TO MANAGE PARTITIONS
=======================================

Some of the basic sos commands are given below. SOS tools will be built
into XXX. Any command given with no arguments will return usage info.

**sos_part_query**
   |
   | List the partitions defined in a container.

**sos_part_create**
   | -C <path> [-s <state>] part_name
   | Create a partition.

   **-C** *<path>*
      |
      | Path to the container

   **-s** *state*
      |
      | State of the new partition (case insensitive). Default is
        OFFLINE. Optional parameter. Valid options are:

      -  PRIMARY: all new allocations go in this partition

      -  ONLINE: objects are accessible, but the partition does not grow

      -  OFFLINE: object references are invalid; the partition may be moved
         or deleted.

   **part_name**
      |
      | Name of the partition

**sos_part_delete**
   | -C <path> name
   | Delete a partition in a container. The partition must be in the
     OFFLINE state to be deleted.

   **-C** *<path>*
      |
      | Path to the container

   **name**
      |
      | Name of the partition

**sos_part_modify**
   | -C <path> [-s <state>] part_name
   | Modify the state of a partition.

   **-C** *<path>*
      |
      | Path to the container

   **-s** *state*
      |
      | State of the new partition (case insensitive). Default is
        OFFLINE. Optional parameter. Valid options are:

      -  PRIMARY: all new allocations go in this partition

      -  ONLINE: objects are accessible, but the partition does not grow

      -  OFFLINE: object references are invalid; the partition may be moved
         or deleted.

   **part_name**
      |
      | Name of the partition

**sos_part_move**
   | -C <path> -p <new_path> part_name
   | Move a partition to another storage location.

   **-C** *<path>*
      |
      | Path to the container

   **-p** *<new_path>*
      |
      | The new path.

   **part_name**
      |
      | Name of the partition

USING SOS COMMANDS TO LOOK AT DATA IN A PARTITION
=================================================

sos_cmd can be used to get data from an sos instance. Some relevant
command options are below. Example usage is in the example section.

**sos_cmd**
   | -C <path> -l
   | Print a directory of the schemas.

   **-C** *<path>*
      |
      | Path to the container

**sos_cmd**
   | -C <path> -i
   | Show debug information for the container

   **-C** *<path>*
      |
      | Path to the container

**sos_cmd**
   | -C <path> -q -S <schema> -X <index> -V <var> -V <var> ....
   | Print data from a container

   **-C** *<path>*
      |
      | Path to the container

   **-q**
      Used to query

   **-S** *<schema>*
      |
      | Schema querying against

   **-X** *<index>*
      |
      | Variable that is indexed to use in the query.

   **-V** *<var>*
      |
      | One or more vars to output.

NOTES
=====

-  The configuration lines do not allow specification of the partition,
   that is done automatically (by default this is the epoch timestamp).

-  Management of partitions is done outside of LDMS (e.g., a cron script
   that calls creation of new partitions and changes from PRIMARY to
   ONLINE).

BUGS
====

No known bugs.
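
As noted above, partition management is handled outside of LDMS. A
cron-driven rotation could look like the following sketch (the container
path and date-based partition naming are hypothetical; the commands are
described in the sections above, and GNU date is assumed for the
yesterday computation):

::

    #!/bin/sh
    # create a new primary partition named by date and demote yesterday's
    CONT=/XXX/storedir/sos
    NEW=$(date +%Y%m%d)
    OLD=$(date -d yesterday +%Y%m%d)
    sos_part_create -C "$CONT" -s PRIMARY "$NEW"
    sos_part_modify -C "$CONT" -s ONLINE "$OLD"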
+ +EXAMPLES +======== + +Configuring store_sos: +---------------------- + +:: + + ldmsd_controller> load name=store_sos + ldmsd_controller> config name=store_sos path=/XXX/storedir + ldmsd_controller> strgp_add name=sos_mem_policy plugin=store_sos container=sos schema=meminfo + +Querying a container's partitions: +---------------------------------- + +:: + + $ sos_part /NVME/0/SOS_ROOT/Test + Partition Name RefCount Status Size Modified Accessed Path + -------------------- -------- ---------------- -------- ---------------- ---------------- ---------------- + 00000000 3 ONLINE 1M 2015/08/25 13:49 2015/08/25 13:51 /SOS_STAGING/Test + 00000001 3 ONLINE 2M 2015/08/25 11:54 2015/08/25 13:51 /NVME/0/SOS_ROOT/Test + 00000002 3 ONLINE 2M 2015/08/25 11:39 2015/08/25 13:51 /NVME/0/SOS_ROOT/Test + 00000003 3 ONLINE PRIMARY 2M 2015/08/25 11:39 2015/08/25 13:51 /NVME/0/SOS_ROOT/Test + +Looking at a container's directory: +----------------------------------- + +Variables that are options for -X in the sos_cmd will have indexed = 1 + +:: + + $ sos_cmd -C /NVME/0/LDMS -l + schema : + name : aries_nic_mmr + schema_sz : 1944 + obj_sz : 192 + id : 129 + -attribute : timestamp + type : TIMESTAMP + idx : 0 + indexed : 1 + offset : 8 + -attribute : comp_time + type : UINT64 + idx : 1 + indexed : 1 + offset : 16 + -attribute : job_time + type : UINT64 + idx : 2 + indexed : 1 + offset : 24 + -attribute : component_id + type : UINT64 + idx : 3 + indexed : 0 + offset : 32 + -attribute : job_id + type : UINT64 + idx : 4 + indexed : 0 + offset : 40 + -attribute : AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_PKTS + type : UINT64 + idx : 5 + indexed : 0 + offset : 48 + -attribute : AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_FLITS + type : UINT64 + idx : 6 + indexed : 0 + offset : 56 + -attribute : AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_STALLED + type : UINT64 + idx : 7 + indexed : 0 + offset : 64 + ... + +Looking at variable values in a container: +------------------------------------------ + +:: + + $ sos_cmd -C /NVME/0/LDMS -q -S aries_nic_mmr -X timestamp -V timestamp -V AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_PKTS + timestamp AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_PKTS + -------------------------------- ------------------ + 1447449560.003480 1642207034 + 1447449630.002155 1642213993 + 1447449630.003115 88703749 + 1447449630.003673 74768272 + 1447449640.002818 74768367 + 1447449640.003201 88703844 + 1447449640.003249 1642214024 + 1447449650.002885 74768402 + 1447449650.003263 1642214059 + 1447449650.003325 88703874 + 1447449660.002954 74768511 + 1447449660.003308 1642214174 + 1447449660.003444 88703993 + 1447449670.003015 74768547 + 1447449670.003361 1642214205 + 1447449670.003601 88704024 + 1447449680.003081 74768582 + +SEE ALSO +======== + +ldms(7), Plugin_store_csv(7), ldmsd_decomposition(7) diff --git a/rtd/docs/source/store_man/Plugin_store_timescale.rst b/rtd/docs/source/store_man/Plugin_store_timescale.rst new file mode 100644 index 000000000..06a846870 --- /dev/null +++ b/rtd/docs/source/store_man/Plugin_store_timescale.rst @@ -0,0 +1,175 @@ +====================== +Plugin_store_timescale +====================== + +:Date: 24 Oct 2019 + +.. contents:: + :depth: 3 +.. 

NAME
=======================

Plugin_store_timescale - man page for the LDMS store_timescale plugin

SYNOPSIS
===========================

| Within ldmsd_controller script or a configuration file:
| load name=store_timescale
| strgp_add name=<policy_name> plugin=store_timescale container=<container>
  schema=<schema>
| strgp_prdcr_add name=<policy_name> regex=.\*
| strgp_start name=<policy_name>

DESCRIPTION
==============================

With LDMS (Lightweight Distributed Metric Service), plugins for the
ldmsd (ldms daemon) are configured via ldmsd_controller or a
configuration file. The store_timescale plugin is a store developed by
Shanghai Jiao Tong University HPC Center to store collected data in
TimescaleDB.

This store is a simplified version of store_influx.

STORE_TIMESCALE CONFIGURATION ATTRIBUTE SYNTAX
=================================================================

**config**
   | name=<plugin_name> user=<user> pwfile=<path>
     hostaddr=<host> port=<port> dbname=<dbname>
     measurement_limit=<N>
   | ldmsd_controller configuration line

   name=<plugin_name>
      |
      | This MUST be store_timescale.

   user=<user>
      |
      | This option is required; it will be used as the user name to
        connect to timescaledb.

   pwfile=<path>
      |
      | This option is required; the file must have the content
        secretword=<password>, and the password will be used as the
        password to connect to timescaledb.

   hostaddr=<host>
      |
      | This option is required; it will be used as the IP address of
        timescaledb to connect to.

   port=<port>
      |
      | This option is required; it will be used as the port number of
        timescaledb to connect to.

   dbname=<dbname>
      |
      | This option is required; it will be used as the timescaledb
        database name to connect to.

   measurement_limit=<N>
      |
      | This is optional; it specifies the maximum length of the sql
        statement to create table or insert data into timescaledb;
        default 8192.

STRGP_ADD ATTRIBUTE SYNTAX
=============================================

The strgp_add sets the policies being added. This line determines the
output files via identification of the container and schema.

**strgp_add**
   | plugin=store_timescale name=<policy_name> schema=<schema>
     container=<container>
   | ldmsd_controller strgp_add line

   plugin=<plugin_name>
      |
      | This MUST be store_timescale.

   name=<policy_name>
      |
      | The policy name for this strgp.

   container=<container>
      |
      | The container and the schema determine where the output files
        will be written (see path above).

   schema=<schema>
      |
      | The container and the schema determine where the output files
        will be written (see path above). You can have multiples of the
        same sampler, but with different schema (which means they will
        have different metrics) and they will be stored in different
        containers (and therefore files).

STORE COLUMN ORDERING
========================================

This store generates output columns in a sequence influenced by the
sampler data registration. Specifically, the column ordering is

   Time, Time_usec, ProducerName, <sampled metric>\*

The column sequence of the sampled metrics is the order in which the
metrics are added into the metric set by the sampler.

NOTES
========================

None.

BUGS
=======================

None known.
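
The pwfile named in the configuration is a plain text file holding only
the secretword entry. A minimal sketch (the path, user, and connection
values are hypothetical; port 5432 is the usual PostgreSQL/TimescaleDB
default):

::

    # contents of /etc/ldms/timescale.pw (restrict permissions, e.g. 0600)
    secretword=mypassword

    # matching configuration line
    config name=store_timescale user=ldms pwfile=/etc/ldms/timescale.pw hostaddr=127.0.0.1 port=5432 dbname=ldmsdb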

EXAMPLES
===========================

Within ldmsd_controller or in a ldmsd command script file

::

    load name=store_timescale


    strgp_add name=store_tutorial1 plugin=store_timescale schema=test1 container=tutorial_sampler1


    strgp_prdcr_add name=store_tutorial1 regex=.*


    strgp_start name=store_tutorial1


    strgp_add name=store_tutorial2 plugin=store_timescale schema=test2 container=tutorial_sampler2


    strgp_prdcr_add name=store_tutorial2 regex=.*


    strgp_start name=store_tutorial2


    strgp_add name=store_tutorial3 plugin=store_timescale schema=test3 container=tutorial_sampler3


    strgp_prdcr_add name=store_tutorial3 regex=.*


    strgp_start name=store_tutorial3

SEE ALSO
===========================

ldmsd(8), ldms_quickstart(7), ldmsd_controller(8),
Plugin_tutorial_sampler(7), Plugin_store_csv(7) diff --git a/rtd/docs/source/store_man/Plugin_store_tutorial.rst b/rtd/docs/source/store_man/Plugin_store_tutorial.rst new file mode 100644 index 000000000..5959adff4 --- /dev/null +++ b/rtd/docs/source/store_man/Plugin_store_tutorial.rst @@ -0,0 +1,164 @@
=====================
Plugin_store_tutorial
=====================

:Date: 24 Oct 2019

.. contents::
   :depth: 3
..

NAME
======================

Plugin_store_tutorial - man page for the LDMS store_tutorial plugin

SYNOPSIS
==========================

| Within ldmsd_controller script or a configuration file:
| load name=store_tutorial
| config name=store_tutorial path=<path>
| strgp_add name=<policy_name> plugin=store_tutorial container=<container>
  schema=<schema>
| strgp_prdcr_add name=<policy_name> regex=.\*
| strgp_start name=<policy_name>

DESCRIPTION
=============================

With LDMS (Lightweight Distributed Metric Service), plugins for the
ldmsd (ldms daemon) are configured via ldmsd_controller or a
configuration file. The store_tutorial plugin is a demo store described
in the LDMSCON2019 tutorial "LDMS v4: Sampler and Store Writing".

This store is a simplified version of store_csv, handling only U64
metrics, producing no header, and with no rollover.

STORE_TUTORIAL CONFIGURATION ATTRIBUTE SYNTAX
===============================================================

**config**
   | name=<plugin_name> path=<path>
   | ldmsd_controller configuration line

   name=<plugin_name>
      |
      | This MUST be store_tutorial.

   path=<path>
      |
      | This option is required; the config line or the options file
        must supply a default value. The output files will be put into a
        directory whose root is specified by the path argument. This
        directory must exist; the subdirectories and files will be
        created. The full path to the output files will be
        <path>/<container>/<schema>. Container and schema are set when
        the strgp is added.

STRGP_ADD ATTRIBUTE SYNTAX
============================================

The strgp_add sets the policies being added. This line determines the
output files via identification of the container and schema.

**strgp_add**
   | plugin=store_tutorial name=<policy_name> schema=<schema>
     container=<container>
   | ldmsd_controller strgp_add line

   plugin=<plugin_name>
      |
      | This MUST be store_tutorial.

   name=<policy_name>
      |
      | The policy name for this strgp.

   container=<container>
      |
      | The container and the schema determine where the output files
        will be written (see path above).

   schema=<schema>
      |
      | The container and the schema determine where the output files
        will be written (see path above). You can have multiples of the
        same sampler, but with different schema (which means they will
        have different metrics) and they will be stored in different
        containers (and therefore files).

STORE COLUMN ORDERING
=======================================

This store generates output columns in a sequence influenced by the
sampler data registration. Specifically, the column ordering is

   Time, Time_usec, ProducerName, <sampled metric>\*

The column sequence of the sampled metrics is the order in which the
metrics are added into the metric set by the sampler.

NOTES
=======================

None.

BUGS
======================

None known.

EXAMPLES
==========================

Within ldmsd_controller or in a ldmsd command script file

::

    load name=store_tutorial


    config name=store_tutorial path=/tmp/store


    strgp_add name=store_tutorial1 plugin=store_tutorial schema=test1 container=tutorial_sampler1


    strgp_prdcr_add name=store_tutorial1 regex=.*


    strgp_start name=store_tutorial1


    strgp_add name=store_tutorial2 plugin=store_tutorial schema=test2 container=tutorial_sampler2


    strgp_prdcr_add name=store_tutorial2 regex=.*


    strgp_start name=store_tutorial2


    strgp_add name=store_tutorial3 plugin=store_tutorial schema=test3 container=tutorial_sampler3


    strgp_prdcr_add name=store_tutorial3 regex=.*


    strgp_start name=store_tutorial3

| > ls /tmp/store
| tutorial_sampler1 tutorial_sampler2 tutorial_sampler3
| > more /tmp/store/tutorial_sampler1/test1
| 1571943275.194664,194664,localhost1,1,0,0,13,26,39,52,65,78,91,104,117,130
| 1571943276.195789,195789,localhost1,1,0,0,14,28,42,56,70,84,98,112,126,140
| 1571943277.196916,196916,localhost1,1,0,0,15,30,45,60,75,90,105,120,135,150
| 1571943278.198051,198051,localhost1,1,0,0,16,32,48,64,80,96,112,128,144,160
| 1571943279.199184,199184,localhost1,1,0,0,17,34,51,68,85,102,119,136,153,170

SEE ALSO
==========================

ldmsd(8), ldms_quickstart(7), ldmsd_controller(8),
Plugin_tutorial_sampler(7), Plugin_store_csv(7) diff --git a/rtd/docs/source/store_man/Plugin_stream_csv_store.rst b/rtd/docs/source/store_man/Plugin_stream_csv_store.rst new file mode 100644 index 000000000..0a80eedb3 --- /dev/null +++ b/rtd/docs/source/store_man/Plugin_stream_csv_store.rst @@ -0,0 +1,261 @@
=======================
Plugin_stream_csv_store
=======================

:Date: 03 Oct 2021

.. contents::
   :depth: 3
..

NAME
========================

Plugin_stream_csv_store - man page for the LDMS stream_csv_store plugin

SYNOPSIS
============================

| Within ldmsd_controller or a configuration file:
| config name=stream_csv_store [ <attribute>=<value> ]

DESCRIPTION
===============================

With LDMS (Lightweight Distributed Metric Service), plugins for the
ldmsd (ldms daemon) are configured via ldmsd_controller or a
configuration file. The stream_csv_store plugin is a DEVELOPMENTAL store
that writes out a single stream's data: in csv format if the input type
is a well-known json format, or as raw messages if the input type is
str. The input type will be determined by the hello_cat_publisher or
similar.

CONFIGURATION ATTRIBUTE SYNTAX
==================================================

**config**
   | name=stream_csv_store path=<path> container=<container>
     stream=<stream> [flushtime=<N>] [buffer=<0/1>] [rolltype=<rolltype>
     rollover=<rollover> rollagain=<rollagain>]
   | configuration line

   name=<plugin_name>
      |
      | This MUST be stream_csv_store.

   path=<path>
      |
      | path to the directory of the csv output file

   container=<container>
      |
      | directory of the csv output file

   stream=<stream>
      |
      | csv list of streams to which to subscribe.

   flushtime=<N>
      |
      | Flush any file that has not received data on its stream in the
        last N sec.
        This is asynchronous to any buffering or rollover
        that is occurring. The minimum time, if enabled, is 120 sec.
        This will occur again at this interval if there is still no data
        received.

   buffer=<0/1>
      |
      | Optional buffering of the output. 0 to disable buffering, 1 to
        enable it with autosize (default)

   rolltype=<rolltype>
      |
      | By default, the store does not roll over and the data is written
        to a continuously open filehandle. Rolltype and rollover are used
        in conjunction to enable the store to manage rollover, including
        flushing before rollover. The header will be rewritten when a
        roll occurs. Valid options are:

      1
         |
         | wake approximately every rollover seconds and roll. Rollover
           is suppressed if no data at all has been written.

      2
         |
         | wake daily at rollover seconds after midnight (>=0) and roll.
           Rollover is suppressed if no data at all has been written.

      3
         |
         | roll after approximately rollover records are written.

      4
         |
         | roll after approximately rollover bytes are written.

      5
         |
         | wake at rollover seconds after midnight (>=0) and roll, then
           repeat every rollagain (> rollover) seconds during the day.
           For example "rollagain=3600 rollover=0 rolltype=5" rolls
           files hourly. Rollover is suppressed if no data at all has
           been written.

   rollover=<rollover>
      |
      | Rollover value controls the frequency of rollover (e.g., number
        of bytes, number of records, time interval, seconds after
        midnight). Note that these values are estimates due to the
        nature of thread wake-ups. Also, for rolltypes 3 and 4, there is
        a minimum delay of ROLL_LIMIT_INTERVAL seconds between rollovers
        no matter how fast the data is being received, which may lead to
        larger than expected data files for small values of rollover.

JSON FORMAT AND OUTPUT HEADER AND FORMAT
============================================================

The json is expected to be something like:

::

    {"foo":1, "bar":2, "zed-data":[{"count":1, "name":"xyz"},{"count":2, "name":"abc"}]}

Note the brackets. There will be at most one list. It is expected that
each dictionary in the list will have the same item names. Everything
else must be singleton data items.

The header is generated from the first json ever received. If that first
json is missing the list, or if the list has no entries, then list data
will not appear in the header and will not be parsed in subsequent data
lines. The header values will be the singleton names (e.g., foo, bar),
and a list will be broken up into an item per dictionary item with
names listname:dictname (e.g., zed_data:count, zed_data:name).

There can be any number of dictionaries in a list. Data lines with
multiple dictionaries will be written out in the csv as separate lines,
with the singleton items repeated in each line like:

::

    #foo,bar,zed-data:count,zed-data:name
    1,2,1,xyz
    1,2,2,abc

There will be a header in every output file (can be more than 1 output
file because of rollover).

STORE OUTPUT FILENAME
=========================================

The filename will be '<streamname>.<timestamp>' (e.g., foo.123456789).
The timestamp is determined when the store is started or rolled over and
the file is created. That may be considerably earlier than when data is
streamed to the store.

STORE COLUMN ORDERING
=========================================

There is only column ordering for 'json' format. There is no column
ordering for 'str' format. 'str' format will always be written out, no
matter what the 'json' header keys may be.
The json order is arbitrary.

TIMING INFORMATION
======================================

Options for timing information are currently driven by #defines in the
source code.

TIMESTAMP_STORE
   |
   | Set by #define or #undef TIMESTAMP_STORE. This will write out an
     absolute timestamp in the file as the last item in the csv and is
     called 'store_recv_time' in the header. The timestamp is only
     obtained once, when the function is entered (e.g., if a data line
     has multiple dicts, this will result in multiple output lines each
     of which will have the same additional timestamp value). Both
     string and json are timestamped.

STREAM_CSV_DIAGNOSTICS
   |
   | Set by #define or #undef STREAM_CSV_DIAGNOSTICS. This will write
     out diagnostic info to the log when stream_cb is called.

BUGS
========================

No known bugs.

NOTES
=========================

This store is in development and may be changed at any time.

Supports more than one stream. There is currently no performance
guidance about the number of streams or the amount of data.

There is no way to know if a stream will actually be used or if a final
value is received. Therefore, this store will need to be restarted if
you want to use it with a new stream or if you want to use the same
stream name, but with different fields in the json.

It is possible that with buffering, if a stream's sends are ended, there
may still be unflushed data in a file.

There is no way to remove a stream from the index nor to unsubscribe.
That is, there is nothing akin to the open_store and close_store pair
of an actual store plugin. Note that this is in development and options
are changing. For example, RESET functionality has been removed and
flushtime functionality has changed.

Note the restrictions on the data input above, and how they affect the
header.

EXAMPLES
============================

Within ldmsd_controller or a configuration file:

::

   load name=stream_csv_store
   config name=stream_csv_store path=XYZ/store container=csv stream=foo buffer=1
   # don't call anything else on the store; the store action is called by a callback triggered by the stream.
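   # A hypothetical variant (not from the original man page) adding the
   # rollover options described above, rolling the output file hourly:
   # config name=stream_csv_store path=XYZ/store container=csv stream=foo buffer=1 rolltype=1 rollover=3600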

   prdcr_add name=localhost1 host=localhost type=active xprt=sock port=52001 interval=20000000
   prdcr_subscribe stream=foo regex=localhost*
   prdcr_start name=localhost1

Testdata:

::

   cat XXX/testdata.txt
   {"job-id" : 10364, "rank" : 1, "kokkos-perf-data" : [ {"name" : "SPARTAFOO0", "count": 0, "time": 0.0000},{"name" : "SPARTAFOO1", "count": 1, "time": 0.0001},{"name" : "SPARTAFOO2", "count": 2, "time": 0.0002},{"name" : "SPARTAFOO3", "count": 3, "time": 0.0003},{"name" : "SPARTAFOO4", "count": 4, "time": 0.0004},{"name" : "SPARTAFOO5", "count": 5, "time": 0.0005},{"name" : "SPARTAFOO6", "count": 6, "time": 0.0006},{"name" : "SPARTAFOO7", "count": 7, "time": 0.0007},{"name" : "SPARTAFOO8", "count": 8, "time": 0.0008},{"name" : "SPARTAFOO9", "count": 9, "time": 0.0009}] }

Publish:

::

   ldmsd_stream_publish -x sock -h localhost -p 52001 -s foo -t json -f XXX/testdata.txt -a

Output:

::

   cat XYZ/store/csv/foo.1614306320
   rank,job-id,kokkos-perf-data:time,kokkos-perf-data:name,kokkos-perf-data:count,store_recv_time
   1,10364,0.000000,"SPARTAFOO0",0,1614306329.167736
   1,10364,0.000100,"SPARTAFOO1",1,1614306329.167736
   1,10364,0.000200,"SPARTAFOO2",2,1614306329.167736
   1,10364,0.000300,"SPARTAFOO3",3,1614306329.167736
   1,10364,0.000400,"SPARTAFOO4",4,1614306329.167736
   1,10364,0.000500,"SPARTAFOO5",5,1614306329.167736
   1,10364,0.000600,"SPARTAFOO6",6,1614306329.167736
   1,10364,0.000700,"SPARTAFOO7",7,1614306329.167736
   1,10364,0.000800,"SPARTAFOO8",8,1614306329.167736
   1,10364,0.000900,"SPARTAFOO9",9,1614306329.167736

SEE ALSO
============================

ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7),
ldmsd_stream_publish(7), Plugin_hello_sampler(7)

diff --git a/rtd/docs/source/store_man/index.rst b/rtd/docs/source/store_man/index.rst
new file mode 100644
index 000000000..d96eb6013
--- /dev/null
+++ b/rtd/docs/source/store_man/index.rst
@@ -0,0 +1,8 @@
Store Plugin Man Pages
======================

.. toctree::
   :maxdepth: 1
   :glob:

   *

diff --git a/rtd/docs/source/store_man/ldms_csv_time_drops.rst b/rtd/docs/source/store_man/ldms_csv_time_drops.rst
new file mode 100644
index 000000000..47fe683e5
--- /dev/null
+++ b/rtd/docs/source/store_man/ldms_csv_time_drops.rst
@@ -0,0 +1,169 @@
===================
ldms_csv_time_drops
===================

:Date: 07 Jul 2022

.. contents::
   :depth: 3
..

NAME
====================

ldms_csv_time_drops - LDMS CSV data quality check

SYNOPSIS
========================

| ldms_csv_time_drops <file> [<file> ...]
| ldms_csv_time_drops_range <file> [<file> ...]

DESCRIPTION
===========================

LDMS CSV store file quality checker. For each input file, the interval,
gaps and duplicates in the data are reported. When multiple files are
given, they must be given in chronological order of the data contained.

INTERVAL
========================

The interval is determined per file by examining the rounded time
differences of sequential samples on each host and taking the most
common value. Zero-length intervals are ignored. If more than one 'most
common' interval is found across the hosts of a single file, the maximum
interval seen in any file is reported as 'interval' and the minimum is
reported as 'short_interval'.

GAPS
====================

Gaps in the data are computed using the assumption of a uniform sampling
interval across all hosts on the aggregate timestamp data from all the
input files. A missing file or daemon down-time within the range of the
data set will appear as a gap.
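The gap computation can be illustrated with a minimal Python sketch
(illustrative only, not the tool itself; it assumes the four-column
input layout described in the INPUT section below):

::

   # gap_sketch.py -- illustrative only; not the ldms_csv_time_drops tool.
   # Assumes columns: Time,Time_usec,ProducerName,component_id
   import csv, sys
   from collections import Counter, defaultdict

   times = defaultdict(list)
   with open(sys.argv[1]) as f:
       for row in csv.reader(f):
           if not row or row[0].startswith('#'):
               continue                      # rows beginning with # are headers
           times[row[2]].append(float(row[0]))

   # Most common rounded interval across hosts; zero-length diffs ignored.
   diffs = Counter(round(b - a)
                   for ts in times.values()
                   for a, b in zip(ts, ts[1:]) if round(b - a) > 0)
   interval = diffs.most_common(1)[0][0]

   # Report per-host gaps assuming a uniform sampling interval.
   for host, ts in sorted(times.items()):
       s = sorted(set(ts))
       for a, b in zip(s, s[1:]):
           missing = round((b - a) / interval) - 1
           if missing > 0:
               print(f"{host} is missing {missing} steps between {a:.6f} and {b:.6f}")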

DUPLICATES
==========================

An identical timestamp reappearing on the same host will be reported.
The later time reported for a duplicate is the latest time seen across
any host in the same file preceding the line location of the duplicate.

INPUT
=====================

The LDMS csv store column format is assumed, in particular that the
first column is the timestamp and any row beginning with # is a header
to be ignored. Columns 1-4 are assumed to be

::

   Time,Time_usec,ProducerName,component_id

OUTPUT FORMATS
==============================

Per-file summary:

::

   lines <count>
   oldest <time>
   newest <time>
   interval <seconds> seconds

If multiple intervals are found in a file:

::

   short_interval <seconds> seconds

Per-gap output for ldms_csv_time_drops_range:

::

   <host> is missing <N> steps between
   <start-time>
   and <end-time>

Per-gap output for ldms_csv_time_drops:

::

   <host> missing <time>

Duplicates are reported as:

::

   <host> <time> written again at <later-time>

BUGS
====================

Sub-second intervals are not supported.

EXAMPLES
========================

For input test.csv containing:

::

   1.1,100000,host1,1
   1.1,100000,host2,2
   1.1,100000,host3,3
   2.1,100000,host1,1
   2.1,100000,host2,2
   3.1,100000,host1,1
   3.1,100000,host2,2
   3.1,100000,host3,3
   4.1,100000,host1,1
   4.1,100000,host3,3
   5.1,100000,host1,1
   2.1,100000,host1,1
   5.1,100000,host2,2
   5.1,100000,host3,3

   output of 'ldms_csv_time_drops test.csv'

   lines 14
   oldest 1.100000
   newest 5.100000
   interval 1 seconds
   host1 2.000001 written again at 5.000001
   host2 missing 4
   host3 missing 2

   output of 'ldms_csv_time_drops_range test.csv'

   lines 14
   oldest 1.100000
   newest 5.100000
   interval 1 seconds
   host1 2.100000 written again at 5.100000
   host2 is missing 1 steps between
   3.100000
   and 5.100000
   host3 is missing 1 steps between
   1.100000
   and 3.100000


   Find the interval of data in a file foo.csv

   ldms_csv_time_drops foo.csv | grep ^interval

SEE ALSO
========================

Plugin_store_csv(7)

diff --git a/rtd/docs/source/ug.rst b/rtd/docs/source/ug.rst
new file mode 100644
index 000000000..c371f1fb2
--- /dev/null
+++ b/rtd/docs/source/ug.rst
@@ -0,0 +1,36 @@
LDMS User's Group
==============================

The LDMS User's Group meets every other Monday at noon (Mountain time).
Sign up for meeting announcements using the information below.
The LDMS mailing lists are hosted by LLNL at listserv.llnl.gov. The currently available lists are:

   ldms-announce - A low-traffic, announcements-only mailing list.


Subscribing to the list
-----------------------

To subscribe to one of the mailing lists, send an email to listserv@listserv.llnl.gov. The body of the email should be (note: copy and paste may introduce non-printing characters; please type out the message below):

   subscribe LISTNAME YOURFIRSTNAME YOURLASTNAME

For instance:

   subscribe ldms-announce YOURFIRSTNAME YOURLASTNAME

Unsubscribing from the list
---------------------------

To unsubscribe from one of the mailing lists, send an email to listserv@listserv.llnl.gov.
The body of the email should be:

   unsubscribe LISTNAME

For instance:

   unsubscribe ldms-announce


LDMS User Group Meeting Notes
-----------------------------
All notes for our biweekly meetings can be found here: :doc:`Meeting Notes <ug_notes>`

diff --git a/rtd/docs/source/ug_notes.rst b/rtd/docs/source/ug_notes.rst
new file mode 100644
index 000000000..1a16a90b6
--- /dev/null
+++ b/rtd/docs/source/ug_notes.rst
@@ -0,0 +1,13 @@
August 14, 2023
----------------


July 31, 2023
----------------


July 7, 2023
----------------

June 5, 2023
----------------

diff --git a/rtd/files/ldmscon2023_directory.zip b/rtd/files/ldmscon2023_directory.zip
new file mode 100644
index 000000000..43938275b
Binary files /dev/null and b/rtd/files/ldmscon2023_directory.zip differ

diff --git a/rtd/man2rst/Plugin_app_sampler.rst b/rtd/man2rst/Plugin_app_sampler.rst
new file mode 100644
index 000000000..79459414f
--- /dev/null
+++ b/rtd/man2rst/Plugin_app_sampler.rst
@@ -0,0 +1,311 @@
==================
Plugin_app_sampler
==================

:Date: 30 Sep 2019

.. contents::
   :depth: 3
..

NAME
====================

ldmsd_app_sampler - LDMSD app_sampler plugin

SYNOPSIS
========================

**config** **name=app_sampler** **producer=**\ *PRODUCER*
**instance=**\ *INSTANCE* [ **schema=**\ *SCHEMA* ] [
**component_id=**\ *COMPONENT_ID* ] [ **stream=**\ *STREAM_NAME* ] [
**metrics=**\ *METRICS* ] [ **cfg_file=**\ *PATH* ]

DESCRIPTION
===========================

**``app_sampler``** collects metrics from **``/proc/<PID>``** according
to the current SLURM jobs/tasks running on the system.
**``app_sampler``** depends on the **``slurm_notifier``** SPANK plugin
to send SLURM job/task events over **``ldmsd_stream``** (**``stream``**
option, default: slurm). A set is created per task when the task starts,
named in the following format: **``PRODUCER_NAME/JOB_ID/TASK_PID``**.
The set is deleted when the task exits.

By default **``app_sampler``** samples all available metrics (see
**``LIST OF METRICS``** section). Users may down-select the list of
metrics to monitor by specifying the **``metrics``** option (a
comma-separated string) or writing a JSON configuration file and
specifying the **``cfg_file``** option (see **``EXAMPLES``** section).

CONFIG OPTIONS
==============================

name
   Must be app_sampler.

producer
   The name of the data producer (e.g. hostname).

instance
   This is required by sampler_base but is not used by app_sampler. So,
   this can be any string but must be present.

schema
   The optional schema name (default: app_sampler).

component_id
   An integer identifying the component (default: *0*).

stream
   The name of the **``ldmsd_stream``** to listen to for SLURM job
   events (default: slurm).

metrics
   The comma-separated list of metrics to monitor. The default is ''
   (empty), which is equivalent to monitoring ALL metrics.

cfg_file
   The alternative config file in JSON format. The file is expected to
   have an object that may contain the following attributes:

..

   ::


      {
          "stream": "STREAM_NAME",
          "metrics": [ METRICS ]
      }

The default values are assumed for the attributes that are not
specified. Attributes other than 'stream' and 'metrics' are ignored.

If the **``cfg_file``** is given, the **``stream``** and **``metrics``**
options are ignored.
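For instance, a cfg_file that both renames the stream and down-selects
metrics might look like the following (the stream name ``mystream`` and
the metric choices are illustrative only):

::

   {
       "stream": "mystream",
       "metrics": [ "stat_pid", "stat_utime", "stat_stime" ]
   }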
+ +LIST OF METRICS +=============================== + + :: + + /* from /proc/[pid]/cmdline */ + cmdline_len, + cmdline, + + /* the number of open files */ + n_open_files, + + /* from /proc/[pid]/io */ + io_read_b, + io_write_b, + io_n_read, + io_n_write, + io_read_dev_b, + io_write_dev_b, + io_write_cancelled_b, + + /* /proc/[pid]/oom_score */ + oom_score, + + /* /proc/[pid]/oom_score_adj */ + oom_score_adj, + + /* path of /proc/[pid]/root */ + root, + + + /* /proc/[pid]/stat */ + stat_pid, + stat_comm, + stat_state, + stat_ppid, + stat_pgrp, + stat_session, + stat_tty_nr, + stat_tpgid, + stat_flags, + stat_minflt, + stat_cminflt, + stat_majflt, + stat_cmajflt, + stat_utime, + stat_stime, + stat_cutime, + stat_cstime, + stat_priority, + stat_nice, + stat_num_threads, + stat_itrealvalue, + stat_starttime, + stat_vsize, + stat_rss, + stat_rsslim, + stat_startcode, + stat_endcode, + stat_startstack, + stat_kstkesp, + stat_kstkeip, + stat_signal, + stat_blocked, + stat_sigignore, + stat_sigcatch, + stat_wchan, + stat_nswap, + stat_cnswap, + stat_exit_signal, + stat_processor, + stat_rt_priority, + stat_policy, + stat_delayacct_blkio_ticks, + stat_guest_time, + stat_cguest_time, + stat_start_data, + stat_end_data, + stat_start_brk, + stat_arg_start, + stat_arg_end, + stat_env_start, + stat_env_end, + stat_exit_code, + + /* from /proc/[pid]/status */ + status_name, + status_umask, + status_state, + status_tgid, + status_ngid, + status_pid, + status_ppid, + status_tracerpid, + status_uid, + status_real_user, + status_eff_user, + status_sav_user, + status_fs_user, + status_gid, + status_real_group, + status_eff_group, + status_sav_group, + status_fs_group, + status_fdsize, + status_groups, + status_nstgid, + status_nspid, + status_nspgid, + status_nssid, + status_vmpeak, + status_vmsize, + status_vmlck, + status_vmpin, + status_vmhwm, + status_vmrss, + status_rssanon, + status_rssfile, + status_rssshmem, + status_vmdata, + status_vmstk, + status_vmexe, + status_vmlib, + status_vmpte, + status_vmpmd, + status_vmswap, + status_hugetlbpages, + status_coredumping, + status_threads, + status_sig_queued, + status_sig_limit, + status_sigpnd, + status_shdpnd, + status_sigblk, + status_sigign, + status_sigcgt, + status_capinh, + status_capprm, + status_capeff, + status_capbnd, + status_capamb, + status_nonewprivs, + status_seccomp, + status_speculation_store_bypass, + status_cpus_allowed, + status_cpus_allowed_list, + status_mems_allowed, + status_mems_allowed_list, + status_voluntary_ctxt_switches, + status_nonvoluntary_ctxt_switches, + + /* /proc/[pid]/syscall */ + syscall, + + /* /proc/[pid]/timerslack_ns */ + timerslack_ns, + + /* /proc/[pid]/wchan */ + wchan, + +BUGS +==================== + +No known bugs. + +EXAMPLES +======================== + +Example 1 +---------- + +Get everyting: + + :: + + config name=app_sampler + +Example 2 +---------- + +Down-select and with non-default stream name: + + :: + + config name=app_sampler metrics=stat_pid,stat_utime stream=mystream + +Example 3 +---------- + +Down-select using config file, using default stream: + + :: + + config name=app_sampler cfg_file=cfg.json + +.. + + :: + + # cfg.json + { + "metrics" : [ + "stat_pid", + "stat_utime" + ] + } + +NOTES +==================== + +Some of the optionally collected data might be security sensitive. 

The status_uid and status_gid values can alternatively be collected as
"status_real_user", "status_eff_user", "status_sav_user",
"status_fs_user", "status_real_group", "status_eff_group",
"status_sav_group", "status_fs_group". These string values are most
efficiently collected if both the string values and the numeric values
are collected.

SEE ALSO
=======================

**ldmsd**\ (8), **ldms_quickstart**\ (7), **ldmsd_controller**\ (8),
**ldms_sampler_base**\ (7), **proc**\ (5), **sysconf**\ (3),
**environ**\ (3).

diff --git a/rtd/man2rst/Plugin_aries_linkstatus.rst b/rtd/man2rst/Plugin_aries_linkstatus.rst
new file mode 100644
index 000000000..268ab7da8
--- /dev/null
+++ b/rtd/man2rst/Plugin_aries_linkstatus.rst
@@ -0,0 +1,147 @@
=======================
Plugin_aries_linkstatus
=======================

:Date: 4 Jan 2018

.. contents::
   :depth: 3
..

NAME
========================

Plugin_aries_linkstatus - man page for the linkstatus plugin for Cray
Aries systems

SYNOPSIS
============================

| Within ldmsd_controller or in a configuration file
| config name=aries_linkstatus [ <attribute>=<value> ]

DESCRIPTION
===============================

With LDMS (Lightweight Distributed Metric Service), plugins for the
ldmsd (ldms daemon) are configured via ldmsd_controller or a
configuration file. aries_linkstatus reads the send and recv status
information from where it is exposed via gpcdr.

Note that the cray_system_sampler variants have the capability to gather
linkstatus information provided by gpcdr using the configuration and
flag for HSN. For XE/XK systems, linkstatus metrics are reasonably
gathered as part of the cray_gemini_r sampler's gathering of the link
aggregated network counter values. However, for XC (Aries) systems, we
recommend gathering the network counter metrics via the aries_nic_mmr
and aries_rtr_mmr samplers (which use the ioctls) and the link status
metrics via this sampler (which reads from the filesystem location where
gpcdr exposes these values). To reduce the overhead, then, we recommend
that this sampler collect at lower frequencies than the network counter
samplers.

The aries_linkstatus sampler is built and used independently of the
cray_system_sampler variants and of the aries_mmr samplers.

To build the aries_linkstatus sampler, build with the following flag:
**--enable-aries_linkstatus**

The output format is as follows: there is an array metric of length 8
hex values for each tile row. Therefore, there are 5 metrics for each of
send and receive, associated with tiles 00X-01Y. The send and receive
metrics associated with r1, for example, correspond to the 8 values for
tiles 010 - 017.

CONFIGURATION ATTRIBUTE SYNTAX
==================================================

The aries_linkstatus plugin uses the sampler_base base class. This man
page covers only the configuration attributes, or those with default
values, specific to this plugin; see ldms_sampler_base.man for the
attributes of the base class.

**config**
   | name=<plugin_name> file_send=<file_send> file_recv=<file_recv>
     [schema=<sname>]
   | configuration line

   name=<plugin_name>
      |
      | This MUST be aries_linkstatus.

   file_send=<file_send>
      |
      | Location of the file with the sendlinkstatus metrics, as
        specified in the gpcdr configuration file. In the Cray-provided
        default gpcdr configuration, this will be
        /sys/devices/virtual/gni/gpcdr0/metricsets/linksendstatus/metrics.

   file_recv=<file_recv>
      |
      | Location of the file with the recvlinkstatus metrics, as
        specified in the gpcdr configuration file. In the Cray-provided
        default gpcdr configuration, this will be
        /sys/devices/virtual/gni/gpcdr0/metricsets/linkrecvstatus/metrics.

   schema=<sname>
      |
      | Optional schema name. It is intended that the same sampler on
        different nodes with different metrics have a different schema.
        This will default to aries_linkstatus, if unspecified.

NOTES
=========================

- The file_send and file_recv can be the same file, if gpcdr is
  configured that way. However, the sampler will do a separate pass
  over the file for each type of metric.

- The linkstatus metrics are not anticipated to change frequently. In
  order to reduce overhead, since the metrics are read from the
  filesystem location where gpcdr exposes these values, it is
  recommended that this sampler collect at lower frequencies than the
  network counter samplers. Reasonable intervals are on the order of
  minutes.

- This sampler is for Cray Aries systems only, due to the differing
  format of the names of the linkstatus metrics for Aries vs Gemini. It
  could be extended to handle both.

BUGS
========================

No known bugs.

EXAMPLES
============================

1) aries_linkstatus: Within ldmsd_controller or in a configuration file:

::

   load name=aries_linkstatus
   config name=aries_linkstatus producer=64 instance=nid00064/aries_linkstatus file_send=/sys/devices/virtual/gni/gpcdr0/metricsets/linksendstatus/metrics file_recv=/sys/devices/virtual/gni/gpcdr0/metricsets/linkrecvstatus/metrics
   start name=aries_linkstatus interval=10000000

::

   #ldms_ls -h nid00064 -x ugni -p 411 -l nid00064/aries_linkstatus
   localhost1/aries_linkstatus: consistent, last update: Tue Sep 26 11:35:51 2017 [811278us]
   M u64 component_id 1
   D u64 job_id 0
   D u8[] sendlinkstatus_r0 0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00
   D u8[] sendlinkstatus_r1 0x03,0x03,0x00,0x00,0x00,0x00,0x00,0x00
   D u8[] sendlinkstatus_r2 0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03
   D u8[] sendlinkstatus_r3 0x00,0x00,0x00,0x03,0x03,0x03,0x03,0x03
   D u8[] sendlinkstatus_r4 0x03,0x03,0x00,0x03,0x03,0x03,0x03,0x03
   D u8[] recvlinkstatus_r0 0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00
   D u8[] recvlinkstatus_r1 0x03,0x03,0x00,0x00,0x00,0x00,0x00,0x00
   D u8[] recvlinkstatus_r2 0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03
   D u8[] recvlinkstatus_r3 0x00,0x00,0x00,0x03,0x03,0x03,0x03,0x03
   D u8[] recvlinkstatus_r4 0x03,0x03,0x00,0x03,0x03,0x03,0x03,0x03

SEE ALSO
============================

ldmsd(7), ldms_sampler_base(7), Plugin_cray_system_sampler_variants(7),
Plugin_aries_mmr(7), ldms_quickstart(7), ldmsd_controller(8)

diff --git a/rtd/man2rst/Plugin_aries_mmr.rst b/rtd/man2rst/Plugin_aries_mmr.rst
new file mode 100644
index 000000000..a16bd7d58
--- /dev/null
+++ b/rtd/man2rst/Plugin_aries_mmr.rst
@@ -0,0 +1,143 @@
================
Plugin_aries_mmr
================

:Date: 05 Jan 2020

.. contents::
   :depth: 3
..

NAME
=================

Plugin_aries_mmr - man page for the aries_mmr sampler and variants.

SYNOPSIS
=====================

| Within ldmsd_controller or in a configuration file
| config name=aries_mmr [ <attribute>=<value> ]
| config name=aries_nic_mmr [ <attribute>=<value> ]
| config name=aries_rtr_mmr [ <attribute>=<value> ]

DESCRIPTION
========================

With LDMS (Lightweight Distributed Metric Service), plugins for the
ldmsd (ldms daemon) are configured via ldmsd_controller or a
configuration file.
The aries_XXX_mmr sampler variants provide aries
network counter information. The particular counters to be read are
specified by configuration files. No functional combinations of the
counters are supported (i.e., the sampler does not sum or scale values).

The aries_XXX_mmr samplers depend on Cray's libgpcd, built with aries
options. This library has been released by Cray in CLE6 and later. You
cannot build this sampler if you do not have the libraries and headers.
If you have the code to build the library, be sure to build with
**CFLAGS=-fPIC**

The difference between the variants is that aries_nic_mmr will skip any
counters in the inputfile that do NOT begin with AR_NIC\_; aries_rtr_mmr
does the opposite; and aries_mmr does NO name filtering.

Different types of metrics are added to separate gpcd_contexts. The
order of the metrics in the output is the contexts in a particular
order, with the metrics in each context as they are specified in the
file.

For the config file, all counter names must be fully spelled out (i.e.,
the sampler does not resolve the shorthand given in the documentation
for the counters).

To build any of the aries_mmr samplers, build with the following flags:
**--enable-aries_mmr**
**--with-aries-libgpcd=<lib_path>,<include_path>**

CONFIGURATION ATTRIBUTE SYNTAX
===========================================

The aries_mmr plugin uses the sampler_base base class. This man page
covers only the configuration attributes, or those with default values,
specific to this plugin; see ldms_sampler_base.man for the
attributes of the base class.

**config**
   | name=<plugin_name> file=<file> [aries_rtr_id=<rtrid>
     schema=<sname>]
   | configuration line

   name=<plugin_name>
      |
      | This MUST be aries_mmr, aries_nic_mmr, or aries_rtr_mmr.

   schema=<sname>
      |
      | Optional schema name. It is intended that the same sampler on
        different nodes with different metrics have a different schema.
        This will default to cray_aries_r or cray_gemini_r as
        appropriate, if unspecified.

   aries_rtr_id=<rtrid>
      |
      | Optional aries router identifier. Defaults to a 0-length string.

   file=<file>
      |
      | Configuration file of aries performance counter names that will
        be added in exactly as they are specified. At least one file
        must be specified.

NOTES
==================

- This is entirely independent of the cray_aries_r_sampler.

- At the moment, no functions of the data (either in the sampler or in
  a store) are supported.

- Counters whose names do not resolve are left out.

- If you start this sampler on a node for which the counters cannot be
  obtained (e.g., an external login node), the set may still get
  created, however the sample function will fail and the plugin will be
  stopped.

- A non-sampler, standalone version of this code is in the Source in
  util/aries/mmr_reader. It is not built via the build.

- These samplers may change at any time.

BUGS
=================

No known bugs.
+ +EXAMPLES +===================== + +| > cat metrics.txt +| #RAW METRICS +| AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_FLITS + +| +| AR_NIC_RSPMON_NPT_EVENT_CNTR_NL_FLITS +| # this is a test +| AR_RTR_1_2_INQ_PRF_INCOMING_FLIT_VC0 + +| load name=aries_mmr +| config name=aries_mmr producer=localhost2 + instance=localhost2/aries_mmr schema=aries_mmr + file=/home/XXX/metrics.txt +| start name=aries_mmr interval=1000000 + +> ldms_ls localhost2/aries_mmr: consistent, last update: Wed Oct 28 +08:48:36 2015 [153343us] u64 0 AR_RTR_1_2_INQ_PRF_INCOMING_FLIT_VC0 u64 +5968204876 AR_NIC_RSPMON_NPT_EVENT_CNTR_NL_FLITS u64 4182142522 +AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_FLITS + +SEE ALSO +===================== + +ldmsd(8), ldms_sampler_base(7), Plugin_cray_sampler_variants(7), +Plugin_aries_linkstatus(7), ldms_quickstart(7), ldmsd_controller(8) diff --git a/rtd/man2rst/Plugin_aries_mmr_configurable.rst b/rtd/man2rst/Plugin_aries_mmr_configurable.rst new file mode 100644 index 000000000..8a3c53112 --- /dev/null +++ b/rtd/man2rst/Plugin_aries_mmr_configurable.rst @@ -0,0 +1,292 @@ +============================= +Plugin_aries_mmr_configurable +============================= + +:Date: 12 Apr 2020 + +.. contents:: + :depth: 3 +.. + +NAME +============================== + +Plugin_aries_mmr_configurable - man page for the aries_mmr_configurable +sampler. + +SYNOPSIS +================================== + +| Within ldmsd_controller or in a configuration file +| config name=aries_mmr_configurable [ = ] + +DESCRIPTION +===================================== + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. The aries_mmr_configurable sampler provides aries +network counter information. It is intended to be used for reading and +optionally resetting the configuable counters, however there is nothing +that currently restricts this. + +The particular counters to be read and set are specified by +configuration files. No functional combinations of the counters are +supported (i.e., does not sum or scale values). The available counter +names can be discovered by: gpcd_print_valid_tile_mmrs(); +gpcd_print_valid_nic_mmrs(); gpcd_print_valid_tile_filtering_mmrs(); +gpcd_print_valid_tile_static_mmrs(); + +A utility providing this service is built as check_mmr_configurable into +bin. The counters are described in Cray's Aries Hardware Counters +Document S-0045. Counters described in that document with ':' extensions +cannot be called by the ':' name in this sampler; rather the counter has +to be read by the base name as hex and the fields separated out by mask, +which is beyond the capability of this sampler. + +The aries_XXX_mmr samplers depend on Cray's libgpcd, built with aries +options. This library has been released by Cray in CLE6 and later. You +cannot build this sampler if you do not have the libraries and headers. +If you have the code to build the library, be sure to build the library +with **CFLAGS=-fPIC** + +The set and read metrics are added to separate gpcd_contexts. The order +of the metrics in the output is the contexts in a particular order, with +the metrics in each context as they are specified in the file. The +counters for read and set can only be specified once and cannot be +changed. The counters to be set can be reset to their configured values +at any time by issuing the action=reset command to configure. 
+ +For the config file, all counter names must be fully spelled out (i.e., +does not resolve the shorthand given in the documentation for the +counters). + +To build the aries_mmr_configurable sampler, build with the following +flags: **--enable-aries_mmr** +**--with-aries-libgpcd=,** + +**!!!WARNING!!!** Cray does not recommend use of the configurable +counters outside of CrayPAT. Use this Plugin at your own risk. +**!!!WARNING!!!** + +CONFIGURATION COMMANDS ORDER +====================================================== + +Configuration commands are intended to be issued in the following order: + +- load + +- config action=initialize + +- config action=finalize + +- start + +The following config commands can be issued anytime after the start in +any order + +- config action=reset + +- config action=ls + +CONFIGURATION ATTRIBUTE SYNTAX +======================================================== + +The aries_mmr_configurable plugin uses the sampler_base base class. This +man page covers only the configuration attributes, or those with default +values, specific to the this plugin; see ldms_sampler_base.man for the +attributes of the base class. + +**config** + | name= action= [ = ...] + | configuration line + + name= + | + | This MUST be aries_mmr_configurable + + action= + | + | Options are initialize, finalize, reset, and ls: + + **initialize** + | [schema= setfile= rtrid=] readfile= + | initialize the plugin. sampler_base configuration arguments + should be specified at this point. + + setfile= + | + | Optional configuration file with the counter value assignment + options. + | Format: "name,type,default_value" one entry per line. + | Type is 'H' for Hex or anything else to default to uint64_t. + | Value should be written out in standard decimal or hex + (leading 0x) format. + | Blanklines and comments (specfied by leading #) are allowed. + | The sampler uses gpcd_lookup_mmr_by_name, so only the names + that are in the 'valid' sets specified by the gpcd library + are allowed. As of this writing those can be obtained by: + gpcd_print_valid_tile_mmrs(); gpcd_print_valid_nic_mmrs(); + gpcd_print_valid_tile_filtering_mmrs(); + gpcd_print_valid_tile_static_mmrs(); + + These are printed out in the utility check_mmr_configurable. + + readfile= + | + | Configuration file with the names of the counters to read. + | Format "name,type" one entry per line. + | Type is 'H' for Hex or anything else to default to uint64_t. + Hex values are written out as a char array. + | Blanklines and comments (specfied by leading #) are allowed. + | The sampler uses gpcd_lookup_mmr_by_name, so only the names + that are in the 'valid' sets specified by the gpcd library + are allowed. As of this writing those can be obtained by: + gpcd_print_valid_tile_mmrs(); gpcd_print_valid_nic_mmrs(); + gpcd_print_valid_tile_filtering_mmrs(); + gpcd_print_valid_tile_static_mmrs(); + + These are printed out in the utility check_mmr_configurable. + + rtrid= + | + | Optional unique rtr string identifier (e.g., c0-0c0s0a0). + Defaults to 0 length string. + + schema= + | + | Optional schema name. Defaults to 'aries_mmr_configurable'. + + **finalize** + | + | Creates the mmr_contexts, sets the set counters to the + configured values, and creates the set. Takes no arguments. If + finalize fails, all state is cleared and the plugin can be + configured again. + + **ls** + | + | Prints out the set counter names and their configured values and + also the read counter names. Takes no arguments. 
+ + **reset** + | + | Resets the set counters to their configured values. Takes no + arguments. + +NOTES +=============================== + +- See WARNINGS above. + +- This is entirely independent of the cray_aries_r_sampler. + +- At the moment, no functions of the data (either in the sampler or in + a store) are supported. + +- Counters whose names do not resolve are left out. + +- If you start this sampler on a node for which the counters cannot be + obtained (e.g., an external login node), the set may still get + created, however the sample function will fail and the plugin will be + stopped. + +- While the names are checked to be in the valid set (see note above), + there is nothing that checks that the value that you choose to write + to a counter is valid. + +- If writing the counters is not enabled, this plugin must be run as + root in order to call the gpcd command that enables writing the + counters. + +- This sampler may change at any time. + +BUGS +============================== + +- There is an unavoidable race condition if someone out of band disable + permissions of writing the counters in between the check in this + sampler and the actual write. + +- Because the sampler needs to write this will toggle on the write + ability for anyone. + +EXAMPLES +================================== + +| > more setconf.txt +| AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_FLITS,U,0 +| AR_NIC_ORB_CFG_NET_RSP_HIST_OVF,H,0xFF +| AR_NIC_ORB_CFG_NET_RSP_HIST_1,H,0x000A000500010000 + +| > more readconf.txt +| AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_FLITS,U +| AR_NIC_ORB_CFG_NET_RSP_HIST_OVF,H +| AR_NIC_ORB_CFG_NET_RSP_HIST_1,H +| AR_NIC_ORB_PRF_NET_RSP_HIST_BIN01,H +| AR_NIC_ORB_PRF_NET_RSP_HIST_BIN23,H +| AR_NIC_ORB_PRF_NET_RSP_HIST_BIN45,H +| AR_NIC_ORB_PRF_NET_RSP_HIST_BIN67,H + +| load name=aries_mmr_configurable +| config name=aries_mmr_configurable producer=localhost1 + instance=localhost1/aries_mmr schema=aries_mmr_configurable + setfile=XXX/setconf.txt readfile=XXX/Build/readconf.txt component_id=1 + action=initialize aries_rtr_id=c0-0c0a0 +| config name=aries_mmr_configurable action=finalize +| config name=aries_mmr_configurable action=ls +| start name=aries_mmr_configurable interval=5000000 + +| >ldms_ls +| localhost1/aries_mmr: consistent, last update: Sun Apr 12 19:04:00 + 2020 -0600 [290661us] +| M u64 component_id 1 +| D u64 job_id 0 +| D u64 app_id 0 +| M char[] aries_rtr_id "c0-0c0a0" +| D u64 AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_FLITS 30756 +| D char[] AR_NIC_ORB_CFG_NET_RSP_HIST_OVF "0x0" +| D char[] AR_NIC_ORB_CFG_NET_RSP_HIST_1 "0xa000500010000" +| D char[] AR_NIC_ORB_PRF_NET_RSP_HIST_BIN01 "0xcb400000d6b" +| D char[] AR_NIC_ORB_PRF_NET_RSP_HIST_BIN23 "0x0" +| D char[] AR_NIC_ORB_PRF_NET_RSP_HIST_BIN45 "0x0" +| D char[] AR_NIC_ORB_PRF_NET_RSP_HIST_BIN67 "0x0" + +| Also in the logs from the action=ls: +| Sun Apr 12 19:03:55 2020: INFO : Name default R/S +| Sun Apr 12 19:03:55 2020: INFO : + ------------------------------------------------ -------------------- + ----- +| Sun Apr 12 19:03:55 2020: INFO : + AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_FLITS N/A R +| Sun Apr 12 19:03:55 2020: INFO : AR_NIC_ORB_CFG_NET_RSP_HIST_OVF N/A R +| Sun Apr 12 19:03:55 2020: INFO : AR_NIC_ORB_CFG_NET_RSP_HIST_1 N/A R +| Sun Apr 12 19:03:55 2020: INFO : AR_NIC_ORB_PRF_NET_RSP_HIST_BIN01 N/A + R +| Sun Apr 12 19:03:55 2020: INFO : AR_NIC_ORB_PRF_NET_RSP_HIST_BIN23 N/A + R +| Sun Apr 12 19:03:55 2020: INFO : AR_NIC_ORB_PRF_NET_RSP_HIST_BIN45 N/A + R +| Sun Apr 12 19:03:55 2020: INFO : AR_NIC_ORB_PRF_NET_RSP_HIST_BIN67 N/A + R +| 
Sun Apr 12 19:03:55 2020: INFO :
  AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_FLITS 0 S
| Sun Apr 12 19:03:55 2020: INFO : AR_NIC_ORB_CFG_NET_RSP_HIST_OVF 0xff
  S
| Sun Apr 12 19:03:55 2020: INFO : AR_NIC_ORB_CFG_NET_RSP_HIST_1
  0xa000500010000 S

| At any time action=ls or action=reset can be called via
  ldmsd_controller:
| > more aries_mmr_configurable_controller_reset.sh
| #!/bin/bash
| echo "config name=aries_mmr_configurable action=reset"
| exit
| > ldmsd_controller --host localhost --port=${port1} -a munge --script
  "XXX/aries_mmr_configurable_controller_reset.sh"

SEE ALSO
==================================

ldmsd(8), ldms_sampler_base(7), Plugin_cray_sampler_variants(7),
Plugin_aries_linkstatus(7), ldms_quickstart(7), Plugin_aries_mmr(7),
Plugin_aries_rtr_mmr(7), Plugin_aries_nic_mmr(7), ldmsd_controller(8)

diff --git a/rtd/man2rst/Plugin_avro_kafka_store.rst b/rtd/man2rst/Plugin_avro_kafka_store.rst
new file mode 100644
index 000000000..a847bf240
--- /dev/null
+++ b/rtd/man2rst/Plugin_avro_kafka_store.rst
@@ -0,0 +1,335 @@
=======================
Plugin_avro_kafka_store
=======================

:Date: 30 Mar 2023

.. contents::
   :depth: 3
..

NAME
=========================

avro_kafka_store - LDMSD avro_kafka_store plugin

SYNOPSIS
=============================

**config** **name=avro_kafka_store** **producer=**\ *PRODUCER*
**instance=**\ *INSTANCE* [ **topic=**\ *TOPIC_FMT* ] [
**encoding=**\ *JSON|AVRO* ] [ **kafka_conf=**\ *PATH* ] [
**serdes_conf=**\ *PATH* ]

DESCRIPTION
================================

**``avro_kafka_store``** implements a decomposition-capable LDMS metric
data store. The **``avro_kafka_store``** plugin does not implement the
**``store``** function and must only be used with decomposition.

The plugin operates in one of two modes: *JSON* and *AVRO* (the
default). In *JSON* mode, each row is encoded as a JSON formatted text
string. In *AVRO* mode, each row is associated with an AVRO schema and
serialized using an AVRO Serdes.

When in *AVRO* mode, the plugin manages schema in cooperation with an
Avro Schema Registry. The location of this registry is specified in a
configuration file or optionally on the **``config``** command line.

CONFIG OPTIONS
=================================

mode
   A string indicating the encoding mode: "JSON" will encode messages in
   JSON format, "AVRO" will encode messages using a schema and Avro
   Serdes. The default is "AVRO". The mode values are not case
   sensitive.

name
   Must be avro_kafka_store.

kafka_conf
   A path to a configuration file in Java property format. This
   configuration file is parsed and used to configure the Kafka
   kafka_conf_t configuration object. The format of this file and the
   supported attributes are available here:
   https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md.

serdes_conf
   A path to a configuration file in Java property format. This
   configuration file is parsed and used to configure the Avro Serdes
   serdes_conf_t configuration object. The only supported option for
   this file is serdes.schema.url.

TOPIC NAMES
===============================

The topic name to which messages are published is defined by the
**topic** configuration parameter. The parameter specifies a string that
is a *format specifier* similar to a printf() format string. If the
**topic** is not specified, it defaults to "%S", which is the format
specifier for the set schema name.
+ +The '%' character introduces a *format specifier* that will be +substituted in the topic format string to create the topic name. The +format specifiers are as follows: + +%F + The format in which the message is serialized: "json" or "avro". + +%S + The set parameter's *schema* name. + +%I + The instance name of the set, e.g. "orion-01/meminfo". + +%P + The set parameter's *producer* name, e.g. "orion-01." + +%u + The user-name string for the owner of the set. If the user-name is + not known on the system, the user-id is used. + +%U + The user-id (uid_t) for the owner of the set. + +%g + The group-name string for the group of the set. If the group-name is + not known on the system, the group-id is used. + +%G + The group-id (gid_t) for the group of the set. + +%a + The access/permission bits for the set formatted as a string, e.g. + "-rw-rw----". + +%A + The access/permission bits for the set formatted as an octal number, + e.g. 0440. + +Note that a topic name must only consist of a combination of the +characters [a-zA-Z0-9\\.\_\\-]. In order to ensure that the format +specifier above will not produce invalid topic names, any character that +results from a format specifier substitution that is not in the valid +list will be substituted with a '.'. + +STRGP +========================= + +The avro_kafka_store is used with a storage policy that specifies +avro_kafka_store as the plugin parameter. + +The *schema*, *instance*, *producer* and *flush* strgp_add parameters +have no affect on how data is stored. If the *container* parameter is +set to any value other than an empty string, it will override the +bootstrap.servers Kafka configuration parameter in the kafka_conf file +if present. + +JSON Mode +============================= + +JSON mode encodes messages as self describing text objects. Each message +is a JSON dictionary based on the following template: RS 4 + +:: + + { + "" : , + "" : , + ... + } + +Each row in the decomposition is encoded as shown. The **attr-value** +types are mapped to either quoted strings, floating-point, or integers +as defined by the source metric type in the LDMS metric set. 
The mapping +is as follows: + ++------------------+----------------------+------------------------+ +| **Metric Type** | **Format Specifier** | **Description** | ++------------------+----------------------+------------------------+ +| LDMS_V_TIMESTAMP | %u.%06u | Floating point number | +| | | in seconds | ++------------------+----------------------+------------------------+ +| LDMS_V_U8 | %hhu | Unsigned integer | ++------------------+----------------------+------------------------+ +| LDMS_V_S8 | %hhd | Signed integer | ++------------------+----------------------+------------------------+ +| LDMS_V_U16 | %hu | Unsigned integer | ++------------------+----------------------+------------------------+ +| LDMS_V_S16 | %hd | Signed integer | ++------------------+----------------------+------------------------+ +| LDMS_V_U32 | %u | Unsigned integer | ++------------------+----------------------+------------------------+ +| LDMS_V_S32 | %d | Signed integer | ++------------------+----------------------+------------------------+ +| LDMS_V_U64 | %lu | Unsigned integer | ++------------------+----------------------+------------------------+ +| LDMS_V_S64 | %ld | Signed integer | ++------------------+----------------------+------------------------+ +| LDMS_V_FLOAT | %.9g | Floating point | ++------------------+----------------------+------------------------+ +| LDMS_V_DOUBLE | %.17g | Floating point | ++------------------+----------------------+------------------------+ +| LDMS_V_STRING | "%s" | Double quoted string | ++------------------+----------------------+------------------------+ +| LDMS_V_ARRAY_xxx | [ v0, v1, ... ] | Comma separated value | +| | | list surrounding by | +| | | '[]' | ++------------------+----------------------+------------------------+ + +Example JSON Object +------------------- + +{"timestamp":1679682808.001751,"component_id":8,"dev_name":"veth1709f8b","rx_packets":0,"rx_err_packets":0,"rx_drop_packets":0,"tx_packets":858,"tx_err_packets":0,"tx_drop_packets":0} + +Avro Mode +============================= + +In Avro mode, LDMS metric set values are first converted to Avro values. +The table below describes how each LDMS metric set value is represented +by an Avro value. + +Each row in the decomposition is encoded as a sequence of Avro values. +The target Avro type is governed by the Avro schema. 
The mapping is as +follows: + ++-------------------+---------------+--------------------------------+ +| **Metric Type** | **Avro Type** | **Description** | ++-------------------+---------------+--------------------------------+ +| LDMS_V_TIMESTAMP | AVRO_INT32 | Seconds portion of timestamp | +| | | value is stored in the Avro | +| | | integer | ++-------------------+---------------+--------------------------------+ +| LDMS_V_TIMESTAMP | AVRO_INT64 | tv_secs + 1000 \* tv_usecs is | +| | | stored in Avro long integer | ++-------------------+---------------+--------------------------------+ +| LDMS_V_TIMESTAMP | AVRO_RECORD | Seconds portion is stored in | +| | | seconds portion of record, | +| | | usecs is stored in the | +| | | micro-seconds portion of the | +| | | record | ++-------------------+---------------+--------------------------------+ +| LDMS_V_U8 | AVRO_INT32 | avro_value_set_int | ++-------------------+---------------+--------------------------------+ +| LDMS_V_S8 | AVRO_INT32 | avro_value_set_int | ++-------------------+---------------+--------------------------------+ +| LDMS_V_U16 | AVRO_INT32 | avro_value_set_int | ++-------------------+---------------+--------------------------------+ +| LDMS_V_S16 | AVRO_INT32 | avro_value_set_int | ++-------------------+---------------+--------------------------------+ +| LDMS_V_U32 | AVRO_INT64 | avro_value_set_long | ++-------------------+---------------+--------------------------------+ +| LDMS_V_S32 | AVRO_INT32 | avro_value_set_int | ++-------------------+---------------+--------------------------------+ +| LDMS_V_U64 | AVRO_INT64 | avro_value_set_long | ++-------------------+---------------+--------------------------------+ +| LDMS_V_S64 | AVRO_INT64 | avro_value_set_long | ++-------------------+---------------+--------------------------------+ +| LDMS_V_FLOAT | AVRO_FLOAT | avro_value_set_float | ++-------------------+---------------+--------------------------------+ +| LDMS_V_DOUBLE | AVRO_DOUBLE | avro_value_set_double | ++-------------------+---------------+--------------------------------+ +| LDMS_V_CHAR_ARRAY | AVRO_STRING | avro_value_set_string | ++-------------------+---------------+--------------------------------+ +| LDMS_V_ARRAY_xxx | AVRO_ARRAY | Comma separated value list or | +| | | primitive type surrounded by | +| | | '[]' | ++-------------------+---------------+--------------------------------+ + +Schema Creation +--------------- + +Each row in the LDMS metric set presented for storage is used to +generate an Avro schema definition. The table above shows the Avro types +that are used to store each LDMS metric type. Note that currently, all +LDMS_V_TIMESTAMP values in a metric set are stored as the Avro logical +type "timestamp-millis" and encoded as an Avro long. + +Unsigned types are currently encoded as signed types. The case that +could cause issues is LDMS_V_U64 which when encoded as AVRO_LONG will +result in a negative number. One way to deal with this is to encode +these as AVRO_BYTES[8] and let the consumer perform the appropriate +cast. This, however, seems identical to simply encoding it as a signed +long and allow the consumer to cast the signed long to an unsigned long. + +Schema Registration +------------------- + +The Avro schema are generated from the row instances presented to the +commit() storage strategy routine. The **schema_name** that is contained +in the row instance is used to search for a serdes schema. 
This name is
first searched for in a local RBT and, if not found, the Avro Schema
Registry is consulted. If the schema is not present in the registry, a
new Avro schema is constructed per the table above, registered with the
schema registry, and stored in the local cache.

Encoding
--------

After the schema is located, constructed, and/or registered for the row,
the schema in conjunction with libserdes is used to binary-encode the
Avro values for each column in the row. Once encoded, the message is
submitted to Kafka.

Client Side Decoding
--------------------

Consumers of topics encoded with libserdes will need to perform the
above procedure in reverse. The message received via Kafka will have the
schema-id present in the message header. The client will use this
schema-id to query the Schema Registry for a schema. Once found, the
client will construct a serdes from the schema definition and use this
serdes to decode the message into Avro values.

EXAMPLES
=============================

kafka_conf Example File
------------------------

 ::

   # Lines beginning with '#' are considered comments.
   # Comments and blank lines are ignored.

   # Specify the location of the Kafka broker
   bootstrap.servers=localhost:9092

serdes_conf Example File
-------------------------

 ::

   # Specify the location of the Avro Schema Registry. This can be overridden
   # on the strgp_add line with the "container" strgp_add option if it is
   # set to anything other than an empty string
   serdes.schema.url=https://localhost:9092

Example strgp_add command
-------------------------

 ::

   strgp_add name=aks plugin=avro_kafka_store container=kafka-broker.int:9092 decomposition=aks-decomp.conf
   strgp_start name=aks

Example plugin configuration
----------------------------

 ::

   config name=avro_kafka_store encoding=avro kafka_conf=/etc/kafka.conf serdes_conf=/etc/serdes.conf topic=ldms.%S
   strgp_start name=aks

NOTES
=========================

This man page is a work in progress.

SEE ALSO
============================

**ldmsd**\ (8), **ldmsd_controller**\ (8), **ldmsd_decomposition**\ (7),
**ldms_quickstart**\ (7)

diff --git a/rtd/man2rst/Plugin_blob_stream_writer.rst b/rtd/man2rst/Plugin_blob_stream_writer.rst
new file mode 100644
index 000000000..46f8c7475
--- /dev/null
+++ b/rtd/man2rst/Plugin_blob_stream_writer.rst
@@ -0,0 +1,133 @@
=========================
Plugin_blob_stream_writer
=========================

:Date: 15 Jun 2021

.. contents::
   :depth: 3
..

NAME
==========================

Plugin_blob_stream_writer - man page for the LDMS blob_stream_writer
plugin

SYNOPSIS
==============================

| Within ldmsd_controller or a configuration file:
| config name=blob_stream_writer [ <attribute>=<value> ]

DESCRIPTION
=================================

With LDMS (Lightweight Distributed Metric Service), plugins for the
ldmsd (ldms daemon) are configured via ldmsd_controller or a
configuration file. The blob_stream_writer plugin writes out raw stream
messages and offsets of the messages in separate files. Messages are not
appended with '\\n' or '\\0'. Multiple streams may be specified.

CONFIGURATION ATTRIBUTE SYNTAX
====================================================

**config**
   | name=blob_stream_writer path=<path> container=<container>
     stream=<stream> [debug=1] [timing=1]
   | configuration line

   name=<plugin_name>
      |
      | This MUST be blob_stream_writer.

   path=<path>
      |
      | path to the directory of the output files

   container=<container>
      |
      | directory of the output file

   stream=<stream>
      |
      | stream to which to subscribe. This argument may be repeated.
        Each stream will be written in a separate file pair.

   debug=1
      |
      | Enable logging of messages stored to the log file.

   timing=1
      |
      | Enable writing timestamps to a separate file.

OUTPUT FORMAT
===================================

There is no requirement that any message be in the same format as any
other.

The writer writes all messages received to a file pair:

::

   $path/$container/$stream.OFFSET.$create_time
   $path/$container/$stream.DAT.$create_time

where OFFSET is the byte offsets into the corresponding .DAT of the
messages seen on the stream.

Each byte offset is written as a little-endian 64 bit number. Data read
from .OFFSET should be converted to host order with le64toh.

Both DAT and OFFSET files begin with an 8 byte magic number: blobdat\\0
and bloboff\\0, respectively.

Optionally (if timing=1 is given) the additional file
$path/$container/$stream.TIMING.$create_time is created containing
binary timestamps corresponding to the messages. The TIMING file begins
with an 8 byte magic number: blobtim\\0. Each time is the delivery time
to the plugin performing the blob storage. Each timestamp is written to
the .TIMING file as a binary pair (tv_sec, tv_usec) with each value
stored as a little-endian 64 bit value which should be read and then
converted with le64toh.

NOTES
===========================

This writer is in development and may be changed at any time.

Cannot support stream=.\* as there is no corresponding regex
subscription policy currently available in the C stream API.

The config operation may be called at any time or repeated. The start
and stop operations will start and stop storage of all streams.

The plugin appears in C code as a sampler plugin, since the storage
policy and store plugin interfaces are set-oriented and no sets are
involved here.

EXAMPLES
==============================

Within ldmsd_controller or a configuration file:

::

   load name=blob_stream_writer
   config name=blob_stream_writer path=/writer/streams container=${CLUSTER} stream=foo stream=slurm stream=kokkos
   start name=blob_stream_writer

Examining offsets in a shell:

::

   od -A d -t u8 -j 8 -w8 slurm.OFFSET.1624033344 | sed -e 's/[0-9,A-F,a-f]* *//'

Examining timestamps in a shell:

::

   od -A d -j 8 -t u8

SEE ALSO
==============================

ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), le64toh(3), fseek(3),
od(1)

diff --git a/rtd/man2rst/Plugin_clock.rst b/rtd/man2rst/Plugin_clock.rst
new file mode 100644
index 000000000..f5af75541
--- /dev/null
+++ b/rtd/man2rst/Plugin_clock.rst
@@ -0,0 +1,70 @@
============
Plugin_clock
============

:Date: 18 Feb 2018

.. contents::
   :depth: 3
..

NAME
=============

Plugin_clock - man page for the LDMS clock plugin

SYNOPSIS
=================

| Within ldmsd_controller or a configuration file:
| config name=clock [ <attribute>=<value> ]

DESCRIPTION
====================

The clock plugin provides a counter of samples taken since it started.
This is of pedagogical interest and useful for detecting situations
where a sample is missed either in being taken or in transmission.

CONFIGURATION ATTRIBUTE SYNTAX
=======================================

The clock plugin uses the sampler_base base class.
This man page covers
only the configuration attributes, or those with default values,
specific to this plugin; see ldms_sampler_base.man for the
attributes of the base class.

**config**
   | name=<plugin_name> [schema=<sname>]
   | configuration line

   name=<plugin_name>
      |
      | This MUST be clock

   schema=<schema>
      |
      | Optional schema name. It is intended that the same sampler on
        different nodes with different metrics have a different schema.
        If not specified, will default to \`clock\`.

BUGS
=============

No known bugs.

EXAMPLES
=================

Within ldmsd_controller or a configuration file:

::

   load name=clock
   config name=clock producer=vm1_1 instance=vm1_1/clock
   start name=clock interval=1000000 offset=0

SEE ALSO
=================

ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)

diff --git a/rtd/man2rst/Plugin_coretemp.rst b/rtd/man2rst/Plugin_coretemp.rst
new file mode 100644
index 000000000..2065e627e
--- /dev/null
+++ b/rtd/man2rst/Plugin_coretemp.rst
@@ -0,0 +1,55 @@
===============
Plugin_coretemp
===============

:Date: 3 May 2022

.. contents::
   :depth: 3
..

NAME
================

Plugin_coretemp - An LDMS sampler plugin that monitors CPU temperature
data

SYNOPSIS
====================

| Within ldmsd_controller or a configuration file:
| load name=coretemp
| config name=coretemp producer=<producer> instance=<instance> component_id=<component_id>

DESCRIPTION
=======================

The coretemp sampler collects information from the Linux coretemp module
through files located in /sys/devices/platform. Files in this directory
are walked recursively and regular expressions are used to select
entries produced by the Linux coretemp module.

See the Linux modprobe(8) command for information on how to load Linux
modules.

CONFIGURATION ATTRIBUTE SYNTAX
==========================================

See man Plugin_base.

EXAMPLES
====================

Within ldmsd_controller or a configuration file:

::

   load name=coretemp
   config name=coretemp producer=vm1_1 instance=vm1_1/coretemp
   start name=coretemp interval=1000000 offset=0

SEE ALSO
====================

ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7),
Plugin_filesingle(7)

diff --git a/rtd/man2rst/Plugin_cray_dvs_sampler.rst b/rtd/man2rst/Plugin_cray_dvs_sampler.rst
new file mode 100644
index 000000000..7788bea20
--- /dev/null
+++ b/rtd/man2rst/Plugin_cray_dvs_sampler.rst
@@ -0,0 +1,108 @@
=======================
Plugin_cray_dvs_sampler
=======================

:Date: 05 Feb 2018

.. contents::
   :depth: 3
..

NAME
========================

Plugin_cray_dvs_sampler - man page for the LDMS cray_dvs_sampler plugin

SYNOPSIS
============================

| Within ldmsd_controller or a configuration file:
| config name=cray_dvs_sampler [ <attribute>=<value> ]

DESCRIPTION
===============================

With LDMS (Lightweight Distributed Metric Service), plugins for the
ldmsd (ldms daemon) are configured via ldmsd_controller or a
configuration file.

The cray_dvs_sampler plugin provides memory info from
/proc/fs/dvs/mount/[mount-id]/stats. A separate metric set is produced
for each mount point. Metric set names are of the form \`XXX'.

See section \`DATA AND THE CONFIGURATION FILE' for information on the
variables and configuration file.

This sampler is for Cray systems only.

CONFIGURATION ATTRIBUTE SYNTAX
==================================================

The cray_dvs_sampler plugin uses the sampler_base base class.
+SEE ALSO
+====================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7),
+Plugin_filesingle(7)
diff --git a/rtd/man2rst/Plugin_cray_dvs_sampler.rst b/rtd/man2rst/Plugin_cray_dvs_sampler.rst
new file mode 100644
index 000000000..7788bea20
--- /dev/null
+++ b/rtd/man2rst/Plugin_cray_dvs_sampler.rst
@@ -0,0 +1,108 @@
+=======================
+Plugin_cray_dvs_sampler
+=======================
+
+:Date: 05 Feb 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+========================
+
+Plugin_cray_dvs_sampler - man page for the LDMS cray_dvs_sampler plugin
+
+SYNOPSIS
+============================
+
+| Within ldmsd_controller or a configuration file:
+| config name=cray_dvs_sampler [ <attribute>=<value> ]
+
+DESCRIPTION
+===============================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file.
+
+The cray_dvs_sampler plugin provides DVS statistics from
+/proc/fs/dvs/mount/[mount-id]/stats. A separate metric set is produced
+for each mount point. Metric set names are of the form \`XXX'.
+
+See section \`DATA AND THE CONFIGURATION FILE' for information on the
+variables and configuration file.
+
+This sampler is for Cray systems only.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+==================================================
+
+The cray_dvs_sampler plugin uses the sampler_base base class. This man
+page covers only the configuration attributes, or those with default
+values, specific to this plugin; see ldms_sampler_base.man for the
+attributes of the base class.
+
+**config**
+   | name=<plugin_name> [schema=<schema> conffile=<path>]
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be cray_dvs_sampler
+
+   schema=<schema>
+      |
+      | Optional schema name. It is intended that the same sampler on
+        different nodes with different metrics have a different schema.
+        If not specified, will default to \`cray_dvs_sampler\`.
+
+   conffile=<path>
+      |
+      | Optional path to the configuration file
+
+DATA AND THE CONFIGURATION FILE
+===================================================
+
+| The data source is /proc/fs/dvs/mount/[mount-id]/stats. This file
+  consists of a number of lines of the format
+| variablename: v1 v2 ... vN
+
+The number of values varies between 1 and 6. Each line will then
+produce between 1 and 6 metrics with names of the form variablename
+appended by an additional string associated with the interpretation of
+that value (e.g., min, err).
+
+By default, this sampler will collect all the variables for all mount
+points. The number of metrics can be downselected by using a
+configuration file (see the conffile argument). The format of this file
+is one variablename per line; comments start with '#' and blank lines
+are skipped. Note that the variablename from the data line is what is
+specified in the configuration file, not the metric names associated
+with that variablename in the data source file. As a result, all
+metrics associated with a given line in the dvs stats source are
+included or excluded together.
+
+NOTES
+=========================
+
+- In the config, the sampler is called cray_dvs_sampler. Also the
+  library is called libcray_dvs_sampler. However, the source file is
+  dvs_sampler.c
+
+- This sampler is for Cray systems only.
+
+BUGS
+========================
+
+None known.
+
+EXAMPLES
+============================
+
+TBD
+
+SEE ALSO
+============================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)
diff --git a/rtd/man2rst/Plugin_daos_sampler.rst b/rtd/man2rst/Plugin_daos_sampler.rst
new file mode 100644
index 000000000..b8d742adb
--- /dev/null
+++ b/rtd/man2rst/Plugin_daos_sampler.rst
@@ -0,0 +1,117 @@
+===================
+Plugin_daos_sampler
+===================
+
+:Date: 28 Apr 2022
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+====================
+
+Plugin_daos_sampler - man page for the LDMS DAOS sampler plugin
+
+SYNOPSIS
+========================
+
+| Within ldmsd_controller or a configuration file:
+| load name=daos_sampler
+| config name=daos_sampler producer=${HOSTNAME}
+| start name=daos_sampler interval=1000000
+
+DESCRIPTION
+===========================
+
+The daos_sampler plugin collects DAOS telemetry from local DAOS I/O
+Engine instances.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+==============================================
+
+The daos_sampler plugin uses the sampler_base base class. This man page
+only covers the configuration attributes specific to this plugin; see
+ldms_sampler_base.man for the attributes of the base class.
+
+   name=<plugin_name>
+      |
+      | This MUST be daos_sampler.
+
+   producer=$HOSTNAME
+      |
+      | The $HOSTNAME variable provides a good unique producer ID.
+
+   engine_count=2
+      |
+      | The default is 2; don't change it unless the number of
+        per-server engines is different.
+
+   target_count=8
+      |
+      | The default is 8; don't change it unless the number of targets
+        per engine is different. A configuration sketch combining these
+        options follows this list.
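+For reference, a minimal configuration sketch using the options above
+(the engine and target counts are illustrative; match them to the
+server being monitored):
+
+::
+
+   load name=daos_sampler
+   config name=daos_sampler producer=${HOSTNAME} engine_count=1 target_count=16
+   start name=daos_sampler interval=1000000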
+**SAMPLE FORMAT**
+
+The DAOS telemetry is exposed as a set of trees, with the system name
+as the root:
+
+::
+
+   $system/$rank/$target - Per-engine target metrics not associated with a pool
+   $system/$rank/$pool - Per-engine top-level pool metrics
+   $system/$rank/$pool/$target - Per-engine target metrics associated with a pool
+
+Under each tree is a set of metrics in either counter or gauge format.
+Counters are monotonically-increasing uint64 values; gauges are
+instantaneous-read uint64 values that can vary up or down. Certain
+gauge metrics may have associated statistics in min/max/count/mean/stddev
+format.
+
+**EXAMPLE SAMPLER USAGE**
+
+Start ldmsd as usual, for example:
+
+::
+
+   $ ldmsd -m1MB -x sock:10444 -F -c /path/to/sampler.conf
+
+NOTE: The default memory size (512KB) may be too small for the number
+of metrics collected. Larger sizes may be specified for a large number
+of pools.
+
+Once ldmsd is running, it is possible to check that the DAOS telemetry
+appears in the output of ldms_ls, for example:
+
+::
+
+   $ ldms_ls -h localhost -x sock -p 10444 -l
+   daos_server/0/0: consistent, last update: Wed Aug 25 18:40:25 2021 +0000 [653335us]
+   M char[]     system          "daos_server"
+   M u32        rank            0
+   M u32        target          0
+   D u64        io/latency/update/256B            0
+   D u64        io/latency/update/256B/min        0
+   D u64        io/latency/update/256B/max        0
+   D u64        io/latency/update/256B/samples    0
+   D d64        io/latency/update/256B/mean       0.000000
+   D d64        io/latency/update/256B/stddev     0.000000
+   D u64        io/latency/update/32KB            611
+   D u64        io/latency/update/32KB/min        611
+   D u64        io/latency/update/32KB/max        611
+   D u64        io/latency/update/32KB/samples    1
+   D d64        io/latency/update/32KB/mean       611.000000
+   D d64        io/latency/update/32KB/stddev     0.000000
+   D u64        io/latency/update/64KB            0
+   D u64        io/latency/update/64KB/min        0
+   D u64        io/latency/update/64KB/max        0
+   D u64        io/latency/update/64KB/samples    0
+   D d64        io/latency/update/64KB/mean       0.000000
+   D d64        io/latency/update/64KB/stddev     0.000000
+   D u64        io/latency/update/128KB           1018
+   D u64        io/latency/update/128KB/min       567
+   D u64        io/latency/update/128KB/max       1214
+   D u64        io/latency/update/128KB/samples   8
+   D d64        io/latency/update/128KB/mean      828.000000
+   D d64        io/latency/update/128KB/stddev    238.011404
diff --git a/rtd/man2rst/Plugin_darshan_stream_store.rst b/rtd/man2rst/Plugin_darshan_stream_store.rst
new file mode 100644
index 000000000..6195c1611
--- /dev/null
+++ b/rtd/man2rst/Plugin_darshan_stream_store.rst
@@ -0,0 +1,102 @@
+============================
+Plugin_darshan_stream_store
+============================
+
+:Date: 26 September 2021
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+============================
+
+Plugin_darshan_stream_store - LDMS darshan_stream_store plugin
+
+SYNOPSIS
+================================
+
+| Within ldmsd_controller or a configuration file:
+| config name=darshan_stream_store [ <attribute>=<value> ]
+
+DESCRIPTION
+===================================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The darshan_stream_store plugin writes out a single
+darshan json stream's data to a SOS container. The input data produced
+by the LDMS darshan plugin consists of two types of messages: "MOD" for
+module data and "MET" for meta data. Both messages are saved into the
+same SOS container.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+======================================================
+
+**config**
+   | name=darshan_stream_store path=<path> stream=<stream> [mode=<mode>]
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be darshan_stream_store.
+   path=<path>
+      |
+      | The path to the root of the SOS container store (should be
+        created by the user)
+
+   stream=<stream>
+      |
+      | stream to which to subscribe.
+
+   mode=<mode>
+      |
+      | The container permission mode for create (defaults to 0660).
+
+INPUT JSON FORMAT
+=========================================
+
+The input json has a "type" field, and this type is used to select the
+message type between module data and meta data.
+
+A MOD darshan JSON example is shown below:
+
+{"job_id":6582,"rank":0,"ProducerName":"nid00021","file":"N/A","record_id":6222542600266098259,"module":"POSIX","type":"MOD","max_byte":16777215,"switches":0,"cnt":1,"op":"writes_segment_0","seg":[{"off":0,"len":16777216,"dur":0.16,"timestamp":1631904596.737955}]}
+
+A MET darshan JSON example is shown below:
+
+Some fields are set to -1 if they don't have data for that message
+type.
+
+BUGS
+============================
+
+No known bugs.
+
+NOTES
+=============================
+
+This store is in development and may be changed at any time.
+
+Only one stream is currently supported.
+
+EXAMPLES
+================================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=darshan_stream_store
+   config name=darshan_stream_store path=/tmp/darshan_stream stream=darshanConnector
+
+   prdcr_add name=localhost1 host=localhost type=active xprt=sock port=52001 interval=20000000
+   prdcr_subscribe stream=darshanConnector regex=localhost*
+   prdcr_start name=localhost1
+
+SEE ALSO
+================================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7),
+darshan_publisher, darshan_sampler, parser.pl (has perlpod),
+Plugin_darshan_cat_publisher(7)
diff --git a/rtd/man2rst/Plugin_dcgm_sampler.rst b/rtd/man2rst/Plugin_dcgm_sampler.rst
new file mode 100644
index 000000000..5e2942fb9
--- /dev/null
+++ b/rtd/man2rst/Plugin_dcgm_sampler.rst
@@ -0,0 +1,83 @@
+===================
+Plugin_dcgm_sampler
+===================
+
+:Date: 1 May 2019
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+====================
+
+Plugin_dcgm_sampler - man page for the LDMS dcgm_sampler plugin
+
+SYNOPSIS
+========================
+
+| Within ldmsd_controller or a configuration file:
+| config name=dcgm_sampler [ <attribute>=<value> ]
+
+DESCRIPTION
+===========================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The dcgm_sampler plugin provides a metric set for
+each DCGM-compatible Nvidia GPU on the system. The schema is named
+"dcgm" by default.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+==============================================
+
+**config**
+   | name=<plugin_name> interval=<interval(us)> [fields=<fields>]
+     [schema=<schema_name>] [job_set=<metric set name>]
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be dcgm_sampler.
+
+   interval=<interval(us)>
+      |
+      | The sampling interval. This MUST be set to the same value that
+        is set on the "start" line, otherwise behavior is undetermined.
+
+   fields=<fields>
+      |
+      | <fields> is a comma-separated list of integers representing
+        DCGM field numbers that the plugin should watch. By default the
+        plugin will watch fields 150,155.
+
+   schema=<schema_name>
+      |
+      | The schema name defaults to "dcgm", but it can be renamed at
+        the user's choice.
+
+   job_set=<metric set name>
+      |
+      | The name of the metric set that contains the job id information
+        (default=job_id)
+
+BUGS
+====================
+
+No known bugs.
+ +EXAMPLES +======================== + +Within ldmsd_controller or a configuration file: + +:: + + load name=dcgm_sampler + config name=dcgm_sampler interval=1000000 fields=150,155,1001,1002,1003 schema=dcgmfav5 + start name=dcgm_sampler interval=1000000 + +SEE ALSO +======================== + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7) diff --git a/rtd/man2rst/Plugin_dstat.rst b/rtd/man2rst/Plugin_dstat.rst new file mode 100644 index 000000000..c09096bcf --- /dev/null +++ b/rtd/man2rst/Plugin_dstat.rst @@ -0,0 +1,152 @@ +============ +Plugin_dstat +============ + +:Date: 4 Nov 2020 + +.. contents:: + :depth: 3 +.. + +NAME +============= + +Plugin_dstat - man page for the LDMS dstat plugin + +SYNOPSIS +================= + +| Within ldmsd_controller +| config name=dstat [ = ] + +DESCRIPTION +==================== + +The dstat plugin provides ldmsd process information from +/proc/self/[io,stat,statm,fd]. + +CONFIGURATION ATTRIBUTE SYNTAX +======================================= + +**config** + | name= component_id= [io=] [stat=] + [statm=] [mmalloc=] [fd=] [fdtypes=] + set= + | configuration line + + name= + | + | This MUST be dstat. + + producer= + | + | The producer string value. + + instance= + | + | The name of the metric set. + + schema= + | + | Optional schema name. It is required by most storage backends + that the same sampler on different nodes with different metric + subsets needs to have a unique schema name. Use auto-schema=1 + instead of schema to automatically meet the backend requirement. + + auto-schema= + | + | If true, change the schema name to dstat\_$X, where $X will be a + unique hex value derived from the data selection options. If + both schema and auto-schema are given, for + backward-compatibility auto-schema is ignored for the dstat + plugin. + + component_id= + | + | The component id numerical value. + + io= + | + | Include the metrics from /proc/self/io. + + stat= + | + | Include the metrics from /proc/self/stat. + + tick= + | + | Include the sc_clk_tck from sysconf(3) as a metric. + + statm= + | + | Include the metrics from /proc/self/statm. + + mmalloc= + | + | Include the mmap memory usage metric from LDMS mmalloc. + + fd= + | + | Include the number of open file descriptors found in + /proc/self/fd. + + fdtypes= + | + | Include the number and types of open file descriptors found in + /proc/self/fd. This option may have high overhead on aggregators + with many open connections. + +DATA +============= + +This reports metrics from /proc/self/[io,stat,statm] by default. If +specific subsets are named (io=true), then unnamed sets are suppressed. +Units on the /proc metric values are documented in the man pages. The +unit of the mmalloc metric is bytes. + +EXAMPLES +================= + +Within ldmsd_controller or a configuration file: + +:: + + load name=dstat + config name=dstat producer=vm1_1 component_id=1 instance=vm1_1/dstat + start name=dstat interval=1000000 + +NOTES +============== + +See proc(5) for the definitions of all the metrics except sc_clk_tck and +fd data. Metrics which are invariant (other than pids and sc_clk_tck) +are not included. Where naming is potentially ambiguous and a more +specific name is used in /proc/self/status for the same metrics, the +name from /proc/self/status is used. + +Requesting mmalloc or fd or fdtypes (any of which may be high overhead) +requires explicitly requesting it and all others which are wanted. + +The numbers listed in /proc/self/fd/ are symbolic links. 
The "types" reported are based on the names pointed to by the links as
+follows:
+
+::
+
+   fd_count        total number of open file descriptors.
+   fd_max          highest file number.
+   fd_socket       count of link targets starting with "socket:"
+   fd_dev          count of link targets starting with "/dev:"
+   fd_anon_inode   count of link targets starting with "anon_inode:"
+   fd_pipe         count of link targets starting with "pipe:"
+   fd_path         count of link targets starting with . or / but not /dev.
+
+On most HPC Linux systems sc_clk_tck is 100 Hz. Less common values are
+250, 300, and 1000.
+
+This is the LDMSD answer to the ancient question "Quis custodiet ipsos
+custodes?"
+
+SEE ALSO
+=================
+
+proc(5), ldmsd(8), sysconf(3)
diff --git a/rtd/man2rst/Plugin_edac.rst b/rtd/man2rst/Plugin_edac.rst
new file mode 100644
index 000000000..e82811fe6
--- /dev/null
+++ b/rtd/man2rst/Plugin_edac.rst
@@ -0,0 +1,108 @@
+===========
+Plugin_edac
+===========
+
+:Date: 18 Feb 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+============
+
+Plugin_edac - man page for the LDMS edac plugin
+
+SYNOPSIS
+================
+
+| Within ldmsd_controller
+| config name=edac [ <attribute>=<value> ]
+
+DESCRIPTION
+===================
+
+The edac plugin provides memory error information from
+/sys/devices/system/edac for correctable and uncorrectable errors.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+======================================
+
+The edac plugin uses the sampler_base base class. This man page covers
+only the configuration attributes, or those with default values,
+specific to this plugin; see ldms_sampler_base.man for the attributes
+of the base class.
+
+**config**
+   | name=<plugin_name> max_mc=<max_mc> max_csrow=<max_csrow>
+     [schema=<schema>]
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be edac.
+
+   schema=<schema>
+      |
+      | Optional schema name. It is intended that the same sampler on
+        different nodes with different metrics have a different schema.
+        If not specified, will default to edac.
+
+   max_mc=<max_mc>
+      |
+      | The number of mc's in /sys/devices/system/edac/mc. Typically
+        this number is 2.
+
+   max_csrow=<max_csrow>
+      |
+      | The number of csrows in a single mc. For example, the value
+        should be 4 when the largest csrow looks like:
+        /sys/devices/system/edac/mc/mc0/csrow3. Typically this number
+        is 8, but it can vary depending on the system.
+
+DATA
+============
+
+This reports counts for both correctable and uncorrectable errors per
+mc and per csrow. It also reports the seconds since reset per mc.
+
+EXAMPLES
+================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=edac
+   config name=edac producer=vm1_1 component_id=1 instance=vm1_1/edac max_mc=2 max_csrow=4
+   start name=edac interval=1000000
+
+NOTES
+=============
+
+An upper limit on metric set size is enforced. Configuring to collect
+too many registers will generate an error detailing the compiled size
+limit. This limit is only adjustable in the source code.
+
+For more detailed background information, see
+www.kernel.org/doc/Documentation/edac.txt and
+www.kernel.org/doc/Documentation/ABI/testing/sysfs-devices-edac.
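+When choosing max_mc and max_csrow for a system, counting the sysfs
+entries directly is a quick sanity check. A sketch, using the standard
+edac locations named above:
+
+::
+
+   # number of memory controllers -> max_mc
+   ls -d /sys/devices/system/edac/mc/mc* | wc -l
+   # csrow entries under one controller -> max_csrow
+   ls -d /sys/devices/system/edac/mc/mc0/csrow*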
+AUTHORS
+===============
+
+Kathleen Shoga (Lawrence Livermore National Laboratory). Ported to
+LDMS v3 by Benjamin Allan. Ported to LDMS v4 by Ann Gentile.
+
+ACKNOWLEDGMENTS
+=======================
+
+This work was created under the auspices of the U.S. Department of
+Energy by Lawrence Livermore National Laboratory under Contract
+DE-AC52-07NA27344. Release Number: LLNL-SM-687054.
+
+SEE ALSO
+================
+
+edac(3), edac-util(8), edac-ctl(8), ldms(7), ldms_sampler_base(7)
diff --git a/rtd/man2rst/Plugin_filesingle.rst b/rtd/man2rst/Plugin_filesingle.rst
new file mode 100644
index 000000000..73cd8f547
--- /dev/null
+++ b/rtd/man2rst/Plugin_filesingle.rst
@@ -0,0 +1,129 @@
+=================
+Plugin_filesingle
+=================
+
+:Date: 15 Dec 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+==================
+
+Plugin_filesingle - man page for the LDMS filesingle plugin
+
+SYNOPSIS
+======================
+
+| Within ldmsd_controller or in a configuration file
+| config name=filesingle conf=<conf file> [timing]
+
+DESCRIPTION
+=========================
+
+The filesingle plugin provides metrics pulled from files containing a
+single numeric value or character. This supports flexible definition
+of, among others, sensor hardware, file system, and cpu metrics.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+============================================
+
+See ldms_sampler_base(7) for the common sampler options.
+
+**config**
+   | conf=<conf file> [timing]
+
+   conf=<conf file>
+      |
+      | File lines contain the source, type, and default value for each
+        metric. See CONF FILE SYNTAX below.
+
+   timing
+      |
+      | If keyword 'timing' is included in the options, extra metrics
+        measuring the time to collect every defined metric will be
+        included. This allows for the discovery of slow sensors. Each
+        timing metric will have the name of the timed metric with
+        ".time" appended. Do not use "timing=<value>"; it is ignored.
+
+COLLECTION
+========================
+
+Each metric is collected from a separate file. If this process fails
+for any reason at all, the default value is collected instead. The
+timing metrics (type S64) report the number of microseconds measured
+bracketing the open/read/close cycle of the metric's value file. The
+timing of a failed collection is -1. Each file is opened, read, and
+closed for each data sample collected.
+
+CONF FILE SYNTAX
+==============================
+
+Each line of the conf file must be empty, contain a comment, or
+contain:
+
+::
+
+   <metric name> <file name> <type> <default value>
+
+The metric and file names must not contain spaces. The metric type is
+one of: S8, S16, S32, S64, U8, U16, U32, U64, F32, D64, CHAR.
+
+Lines starting with # are comment lines. Line continuations are not
+allowed.
+
+The script ldms-sensors-config(1) generates an example metrics config
+file from the data reported by sensors(1). Metric names, types, and
+defaults generated can be tuned to user preferences.
+
+EXAMPLES
+======================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=filesingle
+   config name=filesingle conf=/etc/sysconfig/ldms.d/plugins-conf/filesingle.conf
+   start name=filesingle interval=10000000 offset=0
+
+For the contents of filesingle.conf (on a specific machine):
+
+::
+
+   power1 /sys/class/hwmon/hwmon0/device/power1_average S64 -1
+   coretemp.Physical_id_0 /sys/class/hwmon/hwmon1/temp1_input S64 -1
+   coretemp.Core_0 /sys/class/hwmon/hwmon1/temp2_input S64 -1
+   core0.cur_freq /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq S64 -1
+
+The power reading, two temperatures, and cpu frequency are collected.
+
+NOTES
+===================
+
+The values collected are the raw values from the sources; converting to
+humane units is left to data post-processors. In the specific example
+given, the raw power reading has units of microwatts, the temperatures
+have units of millidegrees Celsius, and the cpu frequency is reported
+in kiloHertz.
To determine the appropriate unit conversions for your system, compare
+the output of sensors(1) or lscpu(1) to the value found in the raw
+data files.
+
+To determine the file locations of metrics on your system consult the
+documentation for the device drivers of interest or the output of
+ldms-sensors-config(1) or
+
+"strace -e trace=open "
+
+Some metric files may only be readable by users with administrative
+privileges. Some of these may be available without privilege by
+extracting them from larger files in /proc, e.g. "cpu MHz" in
+/proc/cpuinfo.
+
+Some sensors may not update themselves (at the kernel level) faster
+than a certain frequency, even though it is possible to more
+frequently read their data files.
+
+SEE ALSO
+======================
+
+ldms-sensors-config(1), sensors(1), lscpu(1), ldms_sampler_base(7),
+proc(5), ldmsd(8), ldmsd_controller(8)
diff --git a/rtd/man2rst/Plugin_fptrans.rst b/rtd/man2rst/Plugin_fptrans.rst
new file mode 100644
index 000000000..84802c7ef
--- /dev/null
+++ b/rtd/man2rst/Plugin_fptrans.rst
@@ -0,0 +1,76 @@
+==============
+Plugin_fptrans
+==============
+
+:Date: 18 Feb 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+===============
+
+Plugin_fptrans - man page for the LDMS fptrans plugin
+
+SYNOPSIS
+===================
+
+| Within ldmsd_controller or a configuration file:
+| config name=fptrans [ <attribute>=<value> ]
+
+DESCRIPTION
+======================
+
+The fptrans plugin provides metrics that have well known values which
+can be used to test transmission and storage fidelity of single and
+double precision scalars and floating point arrays.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+=========================================
+
+The fptrans plugin uses the sampler_base base class. This man page
+covers only the configuration attributes, or those with default values,
+specific to this plugin; see ldms_sampler_base.man for the attributes
+of the base class.
+
+**config**
+   | name=<plugin_name> [schema=<schema>]
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be fptrans.
+
+   schema=<schema>
+      |
+      | Optional schema name. It is intended that the same sampler on
+        different nodes with different metrics have a different schema.
+        If not specified, it will default to \`fptrans\`.
+
+NOTES
+================
+
+The well known values used are 0, 1, and pi as determined by C macro
+M_PI.
+
+BUGS
+===============
+
+No known bugs.
+
+EXAMPLES
+===================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=fptrans
+   config name=fptrans producer=vm1_1 instance=vm1_1/fptrans
+   start name=fptrans interval=1000000
+
+SEE ALSO
+===================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)
diff --git a/rtd/man2rst/Plugin_geopm_sampler.rst b/rtd/man2rst/Plugin_geopm_sampler.rst
new file mode 100644
index 000000000..f9fec8474
--- /dev/null
+++ b/rtd/man2rst/Plugin_geopm_sampler.rst
@@ -0,0 +1,153 @@
+====================
+Plugin_geopm_sampler
+====================
+
+:Date: 06 May 2022
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+=====================
+
+Plugin_geopm - man page for the LDMS geopm plugin
+
+SYNOPSIS
+=========================
+
+| Within ldmsd_controller or a configuration file:
+| config name=ldms_geopm_sampler geopm_request_path=<path>
+
+DESCRIPTION
+============================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file.
The geopm plugin provides access to the geopm(7)
+PlatformIO interface by configuring the request file with signal
+requests.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+===============================================
+
+The ldms_geopm_sampler plugin uses the sampler_base base class. This
+man page covers only the configuration attributes, or those with
+default values, specific to this plugin; see ldms_sampler_base.man for
+the attributes of the base class.
+
+The GEOPM LDMS sampler can be configured with the same config
+parameters as other LDMS samplers (e.g., ``name``, ``producer``,
+``component_id``). In addition to these parameters, the sampler must be
+configured with the option ``geopm_request_path=<path>``.
+
+**config**
+   | name=<plugin_name> geopm_request_path=<path> [schema=<schema>]
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be ldms_geopm_sampler.
+
+   geopm_request_path=<path>
+      |
+      | This parameter points to the absolute path of the ASCII file
+        containing the list of signals that the user would like to have
+        monitored by the sampler.
+
+   The format of this file is a three column white space delimited
+   file. Each line must contain a GEOPM PlatformIO request of the form:
+
+   **<signal_name> <domain_type> <domain_index>**
+
+   The signal name must be a signal supported by GEOPM on the system.
+   To see a full list of supported signals run the geopmread(1) command
+   without any options. The domain must match one of the GEOPM domains.
+   Run the geopmread(1) command with the -d option to see a full list
+   of supported domains and the number of instances of each on the
+   system. The domain index provided must be greater or equal to zero
+   and less than the number of available domains.
+
+   schema=<schema>
+      |
+      | Optional schema name. It is intended that the same sampler on
+        different nodes with different metrics have a different schema.
+        If not specified, will default to \`ldms_geopm_sampler\`.
+
+EXAMPLES
+=========================
+
+**CONFIGURING LDMSD WITH THE SAMPLER**
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=ldms_geopm_sampler
+   config name=ldms_geopm_sampler producer=${HOSTNAME} geopm_request_path=/etc/ldms/geopm_sampler_request.config
+   start name=ldms_geopm_sampler interval=1000000
+
+Here's an example of a file containing the list of signals:
+
+::
+
+   $> cat geopm_sampler_signal.config
+   CPU_FREQUENCY_MAX board 0
+   CPU_FREQUENCY_MIN board 0
+   CPU_FREQUENCY_STEP board 0
+   CPU_FREQUENCY_STICKER board 0
+   TIME board 0
+   ENERGY_PACKAGE board 0
+   INSTRUCTIONS_RETIRED board 0
+   POWER_DRAM board 0
+   POWER_PACKAGE board 0
+   POWER_PACKAGE_LIMIT board 0
+   POWER_PACKAGE_MAX board 0
+   POWER_PACKAGE_MIN board 0
+   POWER_PACKAGE_TDP board 0
+   TEMPERATURE_CORE board 0
+   TEMPERATURE_PACKAGE board 0
+   TIMESTAMP_COUNTER board 0
+
+Note the inclusion of the *geopm_request_path* parameter passed to the
+*config* instruction. Also, note the name of the sampler
+*ldms_geopm_sampler* passed to the *name* parameter for the *load* and
+*start* instructions.
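+Before starting the daemon, each line of the request file can be
+spot-checked with geopmread(1), which takes the same signal, domain,
+and index arguments as the request format above. A quick sketch:
+
+::
+
+   geopmread                                 # list supported signal names
+   geopmread -d                              # list domains and instance counts
+   geopmread CPU_FREQUENCY_STICKER board 0   # evaluate one request line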
+**RUNNING LDMSD WITH THE SAMPLER**
+
+In order to run the GEOPM LDMS sampler, follow the same steps as you
+would for any other LDMS sampler. Make sure the ``ldmsd`` daemon is
+running on the target node to be monitored. Example below:
+
+::
+
+   ldmsd -x sock:10444 -F -c <config file> -l ${TEST_PATH}/temp/demo_ldmsd_log
+
+For observing the progress of the sampler, you may choose to add the
+option ``-v DEBUG`` above. While the ``ldmsd`` daemon is running, the
+user may choose to query for a single instantaneous sample set
+comprising recently monitored signals. This can be achieved by using
+the existing commandline tool ``ldms_ls`` available as part of the
+installation of the LDMS framework. An example is shown below:
+
+::
+
+   $> ldms_ls -h localhost -x sock -p 10444 -l -v
+
+   Schema             Instance                      Flags Msize Dsize Hsize UID  GID Perm       Update            Duration Info
+   ------------------ ----------------------------- ----- ----- ----- ----- ---- --- ---------- ----------------- -------- ------------------------------
+   ldms_geopm_sampler <hostname>/ldms_geopm_sampler CL    1352  240   0     1024 100 -r--r----- 1656431193.051578 0.000323 "updt_hint_us"="1000000:50000"
+   ------------------ ----------------------------- ----- ----- ----- ----- ---- --- ---------- ----------------- -------- ------------------------------
+   Total Sets: 1, Meta Data (kB): 1.35, Data (kB) 0.24, Memory (kB): 1.59
+
+   ========================================================================================
+
+   <hostname>/ldms_geopm_sampler: consistent, last update: Tue Jun 28 08:46:33 2022 -0700 [51578us]
+   M u64        component_id                   1
+   D u64        job_id                         0
+   D u64        app_id                         0
+   D d64        CPU_FREQUENCY_MAX_board_0      3700000000.000000
+   D d64        CPU_FREQUENCY_MIN_board_0      1000000000.000000
+   D d64        CPU_FREQUENCY_STEP_board_0     100000000.000000
+   D d64        CPU_FREQUENCY_STICKER_board_0  2100000000.000000
+   D d64        TIME_board_0                   6.899751
+   D d64        ENERGY_PACKAGE_board_0         334936.207092
+   D d64        INSTRUCTIONS_RETIRED_board_0   131016700.000000
+   D d64        POWER_DRAM_board_0             0.900889
+   D d64        POWER_PACKAGE_board_0          25.469352
+   D d64        POWER_PACKAGE_LIMIT_board_0    140.000000
+   D d64        POWER_PACKAGE_MAX_board_0      594.000000
+   D d64        POWER_PACKAGE_MIN_board_0      140.000000
+   D d64        POWER_PACKAGE_TDP_board_0      280.000000
+   D d64        TEMPERATURE_CORE_board_0       26.454545
+   D d64        TEMPERATURE_PACKAGE_board_0    28.000000
+   D d64        TIMESTAMP_COUNTER_board_0      10913748924506.000000
+
+SEE ALSO
+=========================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7),
+geopm(7), geopm_pio(7), geopmread(1), geopmwrite(1)
diff --git a/rtd/man2rst/Plugin_hello_sampler.rst b/rtd/man2rst/Plugin_hello_sampler.rst
new file mode 100644
index 000000000..24ff1ae8c
--- /dev/null
+++ b/rtd/man2rst/Plugin_hello_sampler.rst
@@ -0,0 +1,89 @@
+====================
+Plugin_hello_sampler
+====================
+
+:Date: 21 Aug 2021
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+=====================
+
+Plugin_hello_sampler - man page for the LDMS hello_sampler plugin
+
+SYNOPSIS
+=========================
+
+| Within ldmsd_controller or a configuration file:
+| config name=hello_sampler [ <attribute>=<value> ]
+
+DESCRIPTION
+============================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The hello_sampler plugin does not actually sample,
+but rather subscribes to an ldmsd_stream and writes the stream data to
+the ldmsd logfile.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+===============================================
+
+The hello_sampler plugin uses the sampler_base base class. This man
+page covers only the configuration attributes, or those with default
+values, specific to this plugin; see ldms_sampler_base.man for the
+attributes of the base class.
+
+**config**
+   | name=<plugin_name> stream=<stream>
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be hello_sampler.
+
+   stream=<stream>
+      |
+      | Name of the stream to which to subscribe.
+
+BUGS
+=====================
+
+No known bugs.
+
+EXAMPLES
+=========================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=hello_sampler
+   config name=hello_sampler producer=host1 instance=host1/hello_sampler stream=foo component_id=1
+   start name=hello_sampler interval=1000000 offset=0
+
+::
+
+   > ./hello_publisher -x sock -h localhost -p 16000 -a munge -s foo -m "foo" -t str
+   The data was successfully published.
+   The server responded with 0
+
+   > ./hello_publisher -x sock -h localhost -p 16000 -a munge -s foo -m "bar" -t str
+   The data was successfully published.
+   The server responded with 0
+
+
+   In the log file of the ldmsd:
+   > cat log.txt
+   Mon May 04 19:44:05 2020: CRITICAL : stream_type: STRING, msg: "foo", msg_len: 4, entity: (nil)
+   Mon May 04 19:44:24 2020: CRITICAL : stream_type: STRING, msg: "bar", msg_len: 4, entity: (nil)
+
+   Note that the hello_sampler plugin does not do a sample, instead it subscribes to the stream with a callback and prints out what it got off the stream.
+
+SEE ALSO
+=========================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7),
+ldmsd_stream_publish(7), Plugin_stream_csv_store(7)
diff --git a/rtd/man2rst/Plugin_ibmad_records_sampler.rst b/rtd/man2rst/Plugin_ibmad_records_sampler.rst
new file mode 100644
index 000000000..959128b2b
--- /dev/null
+++ b/rtd/man2rst/Plugin_ibmad_records_sampler.rst
@@ -0,0 +1,139 @@
+============================
+Plugin_ibmad_records_sampler
+============================
+
+:Date: 1 May 2019
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+=============================
+
+Plugin_ibmad_records_sampler - man page for the LDMS
+ibmad_records_sampler plugin
+
+SYNOPSIS
+=================================
+
+| Within ldmsd_controller or a configuration file:
+| config name=ibmad_records_sampler [ <attribute>=<value> ]
+
+DESCRIPTION
+====================================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The ibmad_records_sampler plugin provides a single
+metric set that contains a list of records. Each record contains all of
+the metrics for a single infiniband port.
+
+The schema is named "ibmad" by default.
+
+NOTE: This plugin will not currently work with virtual IB devices.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+=======================================================
+
+**config**
+   | name=<plugin_name> [schema=<schema_name>] [job_set=<metric set name>]
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be ibmad_records_sampler.
+
+   schema=<schema_name>
+      |
+      | The schema name defaults to "ibmad", but it can be renamed at
+        the user's choice.
+
+   rate=0
+      |
+      | Stop the default inclusion of rate values in the set.
+
+   job_set=<metric set name>
+      |
+      | The name of the metric set that contains the job id information
+        (default=job_id)
+
+   include=PORTLIST
+      |
+      | Ignore any devices and ports discovered that are not matched by
+        PORTLIST. See PORTLIST below. Cannot be combined with the
+        exclude option.
+
+   exclude=PORTLIST
+      |
+      | Collect all devices and ports discovered and active that are
+        not matched by PORTLIST. See PORTLIST below. Cannot be combined
+        with the include option.
+
+   refresh_interval_sec=<seconds>
+      |
+      | (Optional) The sampler caches the list of infiniband devices,
+        and that cache is refreshed at the beginning of a sample cycle
+        if the refresh interval time has been exceeded.
+        refresh_interval_sec sets the minimum number of seconds between
+        refreshes of the device cache. The default refresh interval is
+        600 seconds.
+ +PORTLIST +================================= + +Providing a port list specification will stop the automated discovery +process at every sample time from requerying devices and ports that are +not of interest, eliminating nuisance log messages from the MAD +libraries. Such messages are frequently seen on systems using +SocketDirect hardware. + +The port list is a comma-separated list of CA name and optionally +number. E.g. "mlx4_0.1,mlx4_1". A device name specified without a port +number (.N) matches all ports on that device. The maximum port number +supported for a single device is 63. Including a device or port which +does not exist or is not active in the port list has no effect on the +metric sets reported. + +BUGS +============================= + +No known bugs. + +NOTES +============================== + +The rates reported are computed from the last sample taken and the +present sample; however the last sample may not have been stored +downstream and the sample interval size may vary due to kernel wakeup +variations. Rate values are set to -1 for samples where the rate +computation is invalid. + +EXAMPLES +================================= + +Within ldmsd_controller or a configuration file: + +:: + + load name=ibmad_records_sampler + config name=ibmad_records_sampler + start name=ibmad_records_sampler interval=1000000 + +:: + + load name=ibmad_records_sampler + config name=ibmad_records_sampler include=hfi1_0.1 rate=0 + start name=ibmad_records_sampler interval=1000000 + +:: + + load name=ibmad_records_sampler + config name=ibmad_records_sampler exclude=mlx5_0.2,mlx5_0.3,mlx5_0.4, + start name=ibmad_records_sampler interval=1000000 + +SEE ALSO +================================= + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8) diff --git a/rtd/man2rst/Plugin_ibmad_sampler.rst b/rtd/man2rst/Plugin_ibmad_sampler.rst new file mode 100644 index 000000000..d4ac045c3 --- /dev/null +++ b/rtd/man2rst/Plugin_ibmad_sampler.rst @@ -0,0 +1,128 @@ +==================== +Plugin_ibmad_sampler +==================== + +:Date: 1 May 2019 + +.. contents:: + :depth: 3 +.. + +NAME +===================== + +Plugin_ibmad_sampler - man page for the LDMS ibmad_sampler plugin + +SYNOPSIS +========================= + +| Within ldmsd_controller or a configuration file: +| config name=ibmad_sampler [ = ] + +DESCRIPTION +============================ + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. The ibmad_sampler plugin provides a metric set for +each infiniband port discovered on the node. + +The schema is named "ibmad_sampler" by default. + +NOTE: This plugin will not currently work with virtual IB devices. + +CONFIGURATION ATTRIBUTE SYNTAX +=============================================== + +**config** + | name= [schema=] [job_set=] + | configuration line + + name= + | + | This MUST be ibmad_sampler. + + schema= + | + | The schema name defaults to "ibmad_sampler", but it can be + renamed at the user's choice. + + rate=0 + | + | Stop the default inclusion of rate values in the set. + + job_set= + | + | The name of the metric set that contains the job id information + (default=job_id) + + include=PORTLIST + | + | Ignore any devices and ports discovered that are not matched by + PORTLIST. See PORTLIST below. Cannot be combined with the + exclude option. + + exclude=PORTLIST + | + | Collect all devices and ports discovered and active that are not + matched by PORTLIST. See PORTLIST below. 
Cannot be combined with + the include option. + +PORTLIST +========================= + +Providing a port list specification will stop the automated discovery +process at every sample time from requerying devices and ports that are +not of interest, eliminating nuisance log messages from the MAD +libraries. Such messages are frequently seen on systems using +SocketDirect hardware. + +The port list is a comma-separated list of CA name and optionally +number. E.g. "mlx4_0.1,mlx4_1". A device name specified without a port +number (.N) matches all ports on that device. The maximum port number +supported for a single device is 63. Including a device or port which +does not exist or is not active in the port list has no effect on the +metric sets reported. + +BUGS +===================== + +No known bugs. + +NOTES +====================== + +The rates reported are computed from the last sample taken and the +present sample; however the last sample may not have been stored +downstream and the sample interval size may vary due to kernel wakeup +variations. Rate values are set to -1 for samples where the rate +computation is invalid. + +EXAMPLES +========================= + +Within ldmsd_controller or a configuration file: + +:: + + load name=ibmad_sampler + config name=ibmad_sampler + start name=ibmad_sampler interval=1000000 + +:: + + load name=ibmad_sampler + config name=ibmad_sampler include=hfi1_0.1 rate=0 + start name=ibmad_sampler interval=1000000 + +:: + + load name=ibmad_sampler + config name=ibmad_sampler exclude=mlx5_0.2,mlx5_0.3,mlx5_0.4, + start name=ibmad_sampler interval=1000000 + +SEE ALSO +========================= + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8) diff --git a/rtd/man2rst/Plugin_ibnet.rst b/rtd/man2rst/Plugin_ibnet.rst new file mode 100644 index 000000000..0bfe0c6c8 --- /dev/null +++ b/rtd/man2rst/Plugin_ibnet.rst @@ -0,0 +1,186 @@ +============ +Plugin_ibnet +============ + +:Date: 19 May 2020 + +.. contents:: + :depth: 3 +.. + +NAME +============= + +Plugin_ibnet - man page for the LDMS ibnet plugin + +SYNOPSIS +================= + +| Within ldmsd_controller or a configuration file: +| config name=ibnet [ = ] + +DESCRIPTION +==================== + +The ibnet plugin provides port info from InfiniBand equipment supporting +extended hardware counters. Each port is handled in a separate data set. +Overall timing of the data collection process is handled in another +optional data set. Plugins for the ldmsd (ldms daemon) are configured +via ldmsd_controller or a configuration file. + +CONFIGURATION ATTRIBUTE SYNTAX +======================================= + +**config** + | name= port-name= source-list= + [port-number=] [metric-conf=] + [node-name-map=] [timing=] [millis=] + [producer=] [instance=] [component_id=] + [schema=] [uid=] [gid=] [perm=] [debug] + | configuration line + + name= + | + | This MUST be ibnet. + + producer=. + | + | The producer string value for the timing set. Default is the + result of gethostname(). + + instance= + | + | The name of the timing metric set. Default is + $producer/ibnet_timing. + + source-list= + | + | Lidfile is the name of a file of LID/port specifications. See + PORT FILE for format details. + + port-name= [port-number=] + | + | Hca is the name of the local IB interface to access the network. + Num is the number of the port on the interface used to access + the network. The default is 1. + + schema= + | + | Optional schema base name. The default is ibnet. 
The name base + is suffixed to create uniquely defined schema names based on the + plugin options specified. + + component_id= + | + | Optional component identifier for the timing set. Defaults to + zero. + + metric-conf= + | + | The file listing the metric groups to collect. See METRIC GROUPS + below. + + ca_port= + | + | The port number to use, which must be active. + + millis= + | + | The number of milliseconds of the timeout on the MAD calls. + Default 0, which will use the mad library timeout of 1 second. + + timing= + | + | Disable timing (T=0), enable aggregate timing (T=1), or enable + individual port timing(T=2) or enable port offset timing(T=3). + The metric set will contain sampling process timing metrics if T + > 0. + + node-name-map= + | + | The file name nnmap, as used by ibnetdiscover and opensm, of a + mapping from IB GUIDs to short names of IB hardware items + (switch, node, etc) suitable for use in populating names of + sets. + +PORT FILE +================== + +The lid/port file format is + +:: + + lid, hexguid, nports, plist + * where hexguid is 0x...., + * nports is int, + * plist is ints nports long or * if range is 1-nports, + * if not using a name map, names will be GUID_hex. + +The portrange will be an integer expression in the style 1,5,7-9,13, +without repeats, whitespace, reversed ranges, or overlapping ranges. LID +is an integer in the range 0-65535. The same LID may be on multiple +lines so long as the ports listed for it are not repeated. + +The lid file can be generated with ldms-gen-lidfile.sh. + +METRIC GROUPS +====================== + +The metric groups file contains a list of items, one per line, naming +groups of metrics to collect. The groups are named corresponding to +groups in the infiniband-diags perfquery utility options. The +correspondence is not exact. To disable a listed metric group, delete +its name from the file or comment it out by prepending a # to the group, +e.g. '#xmtsl'. '#' followed by whitespace is not allowed. Carriage +returns are optional. + +INTERNAL METRICS +========================= + +port_query_time + | + | Time in seconds spend in the single port MAD call. + +port_query_offset + | + | Time in microseconds from start of all MAD calls in the current + update to the end of the mad call for the specific port. + +ib_query_time + | + | Time in seconds making all MAD calls in the update. + +ib_data_process_time + | + | Time in seconds decoding all MAD data in the update + +BUGS +============= + +The perfquery extended_speeds option is not supported. + +EXAMPLES +================= + +Within ldmsd_controller or a configuration file: + +:: + + load name=ibnet + config name=ibnet producer=compute1 instance=compute1/ibnet component_id=1 port-name=mlx5_0 source-list=/path/lidfile + start name=ibnet interval=1000000 + +NOTES +============== + +The exact schema name that will be generated can be determined using the +ldms_ibnet_schema_name utility. The subsets available from the fabric +depend on the hardware, firmware, and in some cases the subnet manager +versions. + +SEE ALSO +================= + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), +ldms_ibnet_schema_name(1), ldms-ibnet-sampler-gen(1). diff --git a/rtd/man2rst/Plugin_ipmireader.rst b/rtd/man2rst/Plugin_ipmireader.rst new file mode 100644 index 000000000..a7c4cd271 --- /dev/null +++ b/rtd/man2rst/Plugin_ipmireader.rst @@ -0,0 +1,134 @@ +================= +Plugin_ipmireader +================= + +:Date: 18 Feb 2019 + +.. contents:: + :depth: 3 +.. 
+
+NAME
+==================
+
+Plugin_ipmireader - man page for the LDMS ipmireader plugin
+
+SYNOPSIS
+======================
+
+| Within ldmsd_controller or a configuration file:
+| config name=ipmireader [ <attribute>=<value> ]
+
+DESCRIPTION
+=========================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The ipmireader plugin provides data from the result
+of the ipmitool sdr command. All data is reported out as floats.
+
+**This sampler is currently in pre-release development in V4.2.**
+
+CONFIGURATION ATTRIBUTE SYNTAX
+============================================
+
+The ipmireader plugin uses the sampler_base base class. This man page
+covers only the configuration attributes, or those with default values,
+specific to this plugin; see ldms_sampler_base.man for the attributes
+of the base class.
+
+**config**
+   | name=<plugin_name> [schema=<schema>] [ <attribute>=<value> ... ]
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be ipmireader.
+
+   schema=<schema>
+      |
+      | Optional schema name. It is intended that the same sampler on
+        different nodes with different metrics have a different schema.
+        If not specified, will default to \`ipmireader\`.
+
+   address=<address>
+      |
+      | address of the host to contact. -H flag in the ipmitool
+        command.
+
+   username=<username>
+      |
+      | username for the query. -U flag in the ipmitool command.
+        Defaults to 'admin'.
+
+   password=<password>
+      |
+      | password for the query. -P flag in the ipmitool command.
+        Defaults to 'password'.
+
+   sdrcache=<path>
+      |
+      | output for the sdr cache file, to improve performance.
+        Optional.
+
+   retry=<seconds>
+      |
+      | interval to retry creating set if initially fails (host down).
+        Default 600 sec.
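+For reference, the query this sampler issues corresponds to an
+ipmitool invocation of roughly the following shape (a sketch; the host
+and credentials are placeholders, and -N/-R are the timeout and retry
+settings discussed in NOTES below):
+
+::
+
+   ipmitool -N 1 -R 1 -H cn1-ipmi -U admin -P password sdr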
+
+BUGS
+==================
+
+No known bugs.
+
+NOTES
+===================
+
+- This sampler is currently in pre-release development in V4.2.
+
+- Parameters in the ipmitool call are: -N1 (timeout for LAN interface)
+  -R1 (number of retries for LAN interface). These are in order to
+  reduce the time waiting for a non-responsive node.
+
+- The ipmitool command appears to have less overhead than ipmi-sensors
+  and so is preferred over the ipmisensors sampler for single node
+  calls.
+
+- If the dump cache command fails, this is not reported. If the file
+  does not exist after a short sleep, there is a log message. Without
+  the sdr file, the sampler will continue. On one system, using the
+  cached sdr information reduces the call response time by about 0.5
+  seconds. This manifests itself in the timestamp of the call.
+
+- There is a one time occurrence of a sleep of 2 seconds (empirically
+  chosen) after the dump cache command, to enable the file to be
+  written by the time of the next data call. If it takes longer, but
+  is in place for later sample calls, it will be used then.
+
+- There is currently no call to redump the file.
+
+- There is no way to check that a dumped file is still accurate for
+  your system.
+
+- Currently all the data is reported as type float.
+
+- In case of a significant error, or if the file cannot be opened, all
+  metrics are set to the FAIL value, which currently is -8888. In case
+  of a metric error, like a missing fan and hence a reported value
+  that is not numeric, the metric is set to the ERR value, which
+  currently is -9999.
+
+EXAMPLES
+======================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=ipmireader
+   config name=ipmireader producer=vm1_1 instance=vm1_1/ipmireader address=cn1-ipmi
+   start name=ipmireader interval=1000000
+
+SEE ALSO
+======================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7),
+Plugin_ipmisensors(7)
diff --git a/rtd/man2rst/Plugin_ipmisensors.rst b/rtd/man2rst/Plugin_ipmisensors.rst
new file mode 100644
index 000000000..58d7c2a35
--- /dev/null
+++ b/rtd/man2rst/Plugin_ipmisensors.rst
@@ -0,0 +1,119 @@
+==================
+Plugin_ipmisensors
+==================
+
+:Date: 21 Mar 2019
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+===================
+
+Plugin_ipmisensors - man page for the LDMS ipmisensors plugin
+
+SYNOPSIS
+=======================
+
+| Within ldmsd_controller or a configuration file:
+| config name=ipmisensors [ <attribute>=<value> ]
+
+DESCRIPTION
+==========================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The ipmisensors plugin provides data from the
+result of the ipmi-sensors command. Specific parameters for the command
+are described below. All data is reported out as floats.
+
+**This sampler is currently in pre-release development in V4.2.**
+
+CONFIGURATION ATTRIBUTE SYNTAX
+=============================================
+
+The ipmisensors plugin uses the sampler_base base class.
This man page
+covers only the configuration attributes, or those with default values,
+specific to this plugin; see ldms_sampler_base.man for the attributes
+of the base class.
+
+**config**
+   | name=<plugin_name> [schema=<schema>] [ <attribute>=<value> ... ]
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be ipmisensors.
+
+   schema=<schema>
+      |
+      | Optional schema name. It is intended that the same sampler on
+        different nodes with different metrics have a different schema.
+        If not specified, will default to \`ipmisensors\`.
+
+   address=<address>
+      |
+      | address of the host to contact. -h flag in the ipmi-sensors
+        command.
+
+   username=<username>
+      |
+      | username for the query. -u flag in the ipmi-sensors command.
+        Defaults to 'admin'.
+
+   password=<password>
+      |
+      | password for the query. -p flag in the ipmi-sensors command.
+        Defaults to 'password'.
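+Putting the pieces together, the underlying query resembles the
+following ipmi-sensors invocation (a sketch; the host and credentials
+are placeholders, and the remaining flags are the fixed arguments
+listed in NOTES below):
+
+::
+
+   ipmi-sensors -h cn1-ipmi -u admin -p password \
+       --comma-separated-output --no-header-output \
+       --session-timeout=500 --retransmission-timeout=250 \
+       --quiet-cache --no-sensor-type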
+
+BUGS
+===================
+
+No known bugs.
+
+NOTES
+====================
+
+- This sampler is currently in pre-release development in V4.2.
+
+- The ipmi-sensors call appears to have more overhead than the
+  ipmitool command for single node queries, and so the ipmireader
+  sampler is preferred.
+
+- Specific args to the command are: --comma-separated-output
+  --no-header-output --session-timeout=500
+  --retransmission-timeout=250 --quiet-cache --no-sensor-type. Of note
+  are the timeouts. These will limit how long the call will wait (and
+  thus the duration of the sample) if a host is not responding.
+
+- The ipmi-sensors call can be called with a fan out. This would cause
+  significant parsing in the return, so it is not used here. Also the
+  return of the fan out call will wait on the return of all the
+  individual calls. Thus a non-responsive node can cause a long delay,
+  affecting all values, without a timeout.
+
+- Currently all the data is reported as type float.
+
+- In case of a significant error, or if the file cannot be opened, all
+  metrics are set to the FAIL value, which currently is -8888. In case
+  of a metric error, like a missing fan and hence a reported value
+  that is not numeric, the metric is set to the ERR value, which
+  currently is -9999.
+
+EXAMPLES
+=======================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=ipmisensors
+   config name=ipmisensors producer=vm1_1 instance=vm1_1/ipmisensors address=cn1-ipmi
+   start name=ipmisensors interval=1000000
+
+SEE ALSO
+=======================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7),
+Plugin_ipmireader(7)
diff --git a/rtd/man2rst/Plugin_jobid.rst b/rtd/man2rst/Plugin_jobid.rst
new file mode 100644
index 000000000..73cb07581
--- /dev/null
+++ b/rtd/man2rst/Plugin_jobid.rst
@@ -0,0 +1,125 @@
+============
+Plugin_jobid
+============
+
+:Date: 03 Dec 2016
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+=============
+
+Plugin_jobid - man page for the LDMS jobid plugin
+
+SYNOPSIS
+=================
+
+| Within ldmsd_controller or in a configuration file
+| config name=jobid [ <attribute>=<value> ]
+
+DESCRIPTION
+====================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The jobid plugin provides jobid info from
+/var/run/ldms.jobinfo or similar files replaced periodically by
+resource managers. When files are missing, the value 0 or equivalent
+is reported.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+=======================================
+
+**config**
+   | name=<plugin_name> producer=<pname> instance=<set_name>
+     [component_id=<compid> schema=<schema>] [with_jobid=<bool>]
+     file=<path>
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be jobid.
+
+   producer=<pname>
+      |
+      | The producer name value.
+
+   instance=<set_name>
+      |
+      | The name of the metric set.
+
+   schema=<schema>
+      |
+      | Optional schema name. It is intended that the same sampler on
+        different nodes with different metrics have a different schema.
+        If not specified, will default to \`jobid\`.
+
+   component_id=<compid>
+      |
+      | Optional component identifier. Defaults to zero.
+
+   with_jobid=<bool>
+      |
+      | Option to look up the job_id for the set, or 0 if not. The
+        job_id column will always appear, but may be populated with
+        zero.
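+The file named by file=<path> uses a simple KEY=value format, as
+produced by the prolog lines shown in EXAMPLES below. For illustration,
+a hypothetical instance might contain:
+
+::
+
+   JOBID=123456
+   UID=1001
+   USER=jdoe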
+
+BUGS
+=============
+
+No known implementation bugs. Design features you may not like: Relies
+on site-specific resource manager configuration to produce the file
+read. Does not query local or remote manager daemons. May be slow to
+sample and generate undesirable filesystem events if filepath is on a
+networked filesystem instead of a node-local RAM partition as is usual
+in clusters.
+
+NOTES
+==============
+
+The colname option from the LDMS v2 slurmjobid plugin is no longer
+supported. The sampler offset for the jobid plugin should be slightly
+less than all other plugins to ensure consistency in the job
+information reported for a given time interval across all other
+plugins. The time interval for the jobid plugin need only be
+approximately the clock granularity of the resource manager.
+
+Other samplers use the jobid plugin as the jobid data source. If the
+jobid sampler is not loaded, these samplers will report 0 jobid values.
+
+EXAMPLES
+=================
+
+::
+
+   Within ldmsd_controller or in a configuration file
+   load name=jobid
+   config name=jobid component_id=1 producer=vm1_1 instance=vm1_1/jobid
+   start name=jobid interval=1000000 offset=-100000
+
+
+   Within ldmsd_controller or in a configuration file
+   load name=jobid
+   config name=jobid component_id=1 producer=vm1_1 instance=vm1_1/jobid file=/var/run/rman/node/jobinfo
+   start name=jobid interval=1000000 offset=-100000
+
+Slurm 2.x installations can populate /var/run/ldms.jobinfo by adding
+the following lines to slurm.epilog and slurm.prolog, respectively.
+
+::
+
+   echo "JOBID=0" > /var/run/ldms.jobinfo
+
+   and
+
+   echo JOBID=$SLURM_JOBID > /var/run/ldms.jobinfo
+   echo UID=$SLURM_UID >> /var/run/ldms.jobinfo
+   echo USER=$SLURM_JOB_USER >> /var/run/ldms.jobinfo
+
+These slurm files might be found in /etc/nodestate/bin/.
+
+SEE ALSO
+=================
+
+ldms(7), ldmsd(8), ldmsd_controller(8)
diff --git a/rtd/man2rst/Plugin_kgnilnd.rst b/rtd/man2rst/Plugin_kgnilnd.rst
new file mode 100644
index 000000000..a238a8a00
--- /dev/null
+++ b/rtd/man2rst/Plugin_kgnilnd.rst
@@ -0,0 +1,72 @@
+==============
+Plugin_kgnilnd
+==============
+
+:Date: 10 Feb 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+===============
+
+Plugin_kgnilnd - man page for the LDMS kgnilnd plugin
+
+SYNOPSIS
+===================
+
+| Within ldmsd_controller or in a configuration file
+| config name=kgnilnd [ <attribute>=<value> ]
+
+DESCRIPTION
+======================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The kgnilnd plugin provides Cray specific info from
+/proc/kgnilnd.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+=========================================
+
+The kgnilnd plugin uses the sampler_base base class. This man page
+covers only the configuration attributes, or those with default values,
+specific to this plugin; see ldms_sampler_base.man for the attributes
+of the base class.
+
+**config**
+   | name=<plugin_name> [schema=<schema>]
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be kgnilnd.
+
+   schema=<schema>
+      |
+      | Optional schema name. It is intended that the same sampler on
+        different nodes with different metrics have a different schema.
+        If not specified, will default to \`kgnilnd\`.
+
+BUGS
+===============
+
+No known bugs.
+
+EXAMPLES
+===================
+
+Within ldmsd_controller or in a configuration file
+
+::
+
+   load name=kgnilnd
+   config name=kgnilnd producer=vm1_1 instance=vm1_1/kgnilnd
+   start name=kgnilnd interval=1000000
+
+SEE ALSO
+===================
+
+ldmsd(8), Plugin_cray_system_sampler_variants(7), ldms_quickstart(7),
+ldmsd_controller(8), ldms_sampler_base(7)
diff --git a/rtd/man2rst/Plugin_linux_proc_sampler.rst b/rtd/man2rst/Plugin_linux_proc_sampler.rst
new file mode 100644
index 000000000..9e2cbd280
--- /dev/null
+++ b/rtd/man2rst/Plugin_linux_proc_sampler.rst
@@ -0,0 +1,426 @@
+=========================
+Plugin_linux_proc_sampler
+=========================
+
+:Date: 15 Jul 2021
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+==========================
+
+Plugin_linux_proc_sampler - man page for the LDMS linux_proc_sampler
+plugin
+
+SYNOPSIS
+==============================
+
+| Within ldmsd_controller or a configuration file:
+| config name=linux_proc_sampler [common attributes] [stream=STREAM]
+  [metrics=METRICS] [cfg_file=FILE] [instance_prefix=PREFIX]
+  [exe_suffix=1] [argv_sep=<char>] [argv_msg=1] [argv_fmt=<1,2>]
+  [env_msg=1] [env_exclude=EFILE] [fd_msg=1] [fd_exclude=EFILE]
+
+DESCRIPTION
+=================================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The linux_proc_sampler plugin provides data from
+/proc/<pid>, creating a different set for each process identified in the
+named stream. The stream can come from the ldms-netlink-notifier daemon
+or the spank plugin slurm_notifier. The per-process data from
+/proc/self/environ and /proc/self/cmdline can optionally be published to
+streams.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+====================================================
+
+The linux_proc_sampler plugin uses the sampler_base base class. This man
+page covers only the configuration attributes, or those with default
+values, specific to this plugin; see ldms_sampler_base.man for the
+attributes of the base class.
+
+**config**
+   | name=<plugin_name> [other options]
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be linux_proc_sampler.
+
+   instance_prefix=PREFIX
+      |
+      | Prepend PREFIX to the set instance names. Typically a cluster
+        name when needed to disambiguate producer names that appear in
+        multiple clusters. (default: no prefix).
+
+   exe_suffix=1
+      |
+      | If present, set instance names are appended with the full path
+        of the executable. This is most likely useful for debugging
+        configuration of the notifier up-stream using ldms_ls. (default:
+        no such suffix)
+
+   sc_clk_tck=1
+      |
+      | If present, include sc_clk_tck in the metric set. sc_clk_tck is
+        the ticks per second from sysconf(\_SC_CLK_TCK). (default: not
+        included).
+
+   stream=STREAM
+      |
+      | The name of the \`ldmsd_stream\` to listen to for SLURM job
+        events. (default: slurm).
+
+   argv_sep=<char>
+      |
+      | Replace the nul separators within the cmdline string with
+        <char>. Special specifiers \\b,\\n,\\t,\\v,\\r,\\f are also
+        supported.
+
+   syscalls=FILE
+      |
+      | File mapping syscall integers to symbolic names. Not needed
+        unless syscall_name is included in the metrics. See FILES for
+        details.
+
+   metrics=METRICS
+      |
+      | The comma-separated list of metrics to monitor. The default is
+        (empty), which is equivalent to monitoring ALL metrics.
+
+   cfg_file=CFILE
+      |
+      | The alternative configuration file in JSON format.
The file is expected to have an object that contains the following
+        attributes: { "stream": "STREAM_NAME", "syscalls" : "/file",
+        "metrics": [ comma-separated-quoted-strings ] }. If the
+        \`cfg_file\` is given, all other sampler-specific options given
+        on the key=value line are ignored.
+
+   argv_msg=1
+      |
+      | Publish the argv items to a stream named <SCHEMA>_argv, where
+        if the schema is not specified, the default SCHEMA is
+        linux_proc_sampler. (Default: argv_msg=0; no publication of
+        argv). E.g. a downstream daemon will need to subscribe to
+        linux_proc_sampler_argv to receive the published messages and
+        store them.
+
+   argv_fmt=<1,2>
+      |
+      | Publish the argv items formatted as (1) a json list of strings
+        ['argv0', 'argv1'] or (2) a json list of key/value tuples, e.g.
+        [ {"k":0, "v":"argv[0]"}, {"k":1, "v":"argv[1]"}].
+
+   env_msg=1
+      |
+      | Publish the environment items to a stream named <SCHEMA>_env,
+        where if the schema is not specified, the default SCHEMA is
+        linux_proc_sampler. (Default: env_msg=0; no publication of the
+        environment). Environment data is published as a list in the
+        style of argv_fmt=2. E.g. a downstream daemon will need to
+        subscribe to linux_proc_sampler_env to receive the published
+        messages and store them.
+
+   env_exclude=ELIST
+      |
+      | Exclude the environment items named with regular expressions in
+        ELIST. On the configuration key=value line, ELIST must be the
+        name of a file containing a list of regular expressions, one per
+        line. An environment variable that matches any of the listed
+        regular expressions will be excluded. When used in the cfg_file,
+        the env_exclude value may be either the string name of the
+        regular expression file or a JSON array of expression strings as
+        shown in EXAMPLES.
+
+   fd_exclude=ELIST
+      |
+      | Exclude the files named with regular expressions in ELIST. On
+        the configuration key=value line, ELIST must be the name of a
+        file containing a list of regular expressions, one per line. A
+        file that matches any of the listed regular expressions will be
+        excluded. When used in the cfg_file, the fd_exclude value may be
+        either the string name of the regular expression file or a JSON
+        array of expression strings as shown in EXAMPLES.
+
+   fd_msg=N
+      |
+      | Publish new /proc/pid/fd scan data to the <SCHEMA>_files stream
+        every N-th sample, where if the schema is not specified, the
+        default SCHEMA is linux_proc_sampler. (Default: fd_msg=0; no
+        publication of the file details). A downstream daemon will need
+        to subscribe to linux_proc_sampler_files to receive the
+        published messages and store them. Files that are not opened
+        long enough to be caught in a scan of fds will be missed. Files
+        will be reported as 'opened' the first time seen and as 'closed'
+        when they are no longer seen. A file both no longer seen and no
+        longer existing will be reported as 'deleted'. Only regular
+        files (not sockets, etc.) are reported, and additionally files
+        matching the fd_exclude expressions are ignored. Use a larger N
+        to reduce the scan overhead at the cost of missing short-access
+        files. If a close-reopen of the same file occurs between scans,
+        no corresponding events are generated.
+
+   published_pid_dir=<dir>
+      |
+      | Name of the directory where netlink-notifier or other notifier
+        pids of interest may be found. This directory is scanned at
+        sampler startup only, so that pids which were the subject of
+        events published before the sampler started can be tracked. If
+        not specified, the default directory is
+        /var/run/ldms-netlink-tracked.
Absence of this directory is not + a sampler configuration error, as ldmsd may start before the + notifier process. When starting, the sampler will clean up any + stale pid references found in this directory. Any pid not + appearing in this directory is not being tracked. + +INPUT STREAM FORMAT +========================================= + +The named ldmsd stream should deliver messages with a JSON format which +includes the following. Messages which do not contain event, data, +job_id, and some form of PID will be ignored. Extra fields will be +ignored. + +:: + + { "event" = "$e", + "data" : { + "job_id" : INT, + "task_pid" : INT, + "os_pid" : INT, + "parent_pid" : INT, + "is_thread" : INT, + "exe" : STRING, + "start" : STRING, + "start_tick" : STRING + } + } + +where $e is one of task_init_priv or task_exit. The data fields other +than job_id are all optional, but at least one of os_pid and task_pid +must contain the PID of a process to be monitored. If present and > 0, +task_pid should be the value taken from SLURM_TASK_PID or an equivalent +value from another resource management environment. The value of start, +if provided, should be approximately the epoch time ("%lu.%06lu") when +the PID to be monitored started. + +OUTPUT STREAM FORMAT +========================================== + +The json formatted output for argv and environment values includes a +common header: + +:: + + { + "producerName":"localhost1", + "component_id":1, + "pid":8991, + "job_id":0, + "timestamp":"1663086686.947600", + "task_rank":-1, + "parent":1, + "is_thread":0, + "exe":"/usr/sbin/ldmsd", + "data":[LIST] + +where LIST is formatted as described for argv_fmt option. + +EXAMPLES +============================== + +Within ldmsd_controller or a configuration file: + +:: + + load name=linux_proc_sampler + config name=linux_proc_sampler producer=vm1_1 instance=vm1_1/linux_proc_sampler metrics=stat_comm,stat_pid,stat_cutime + start name=linux_proc_sampler interval=1000000 + +An example metrics configuration file is: + +:: + + { + "stream": "slurm", + "instance_prefix" : "cluster2", + "syscalls": "/etc/sysconfig/ldms.d/plugins-conf/syscalls.map", + "env_msg": 1, + "argv_msg": 1, + "fd_msg" : 1, + "fd_exclude": [ + "/dev/", + "/run/", + "/var/", + "/etc/", + "/sys/", + "/tmp/", + "/proc/", + "/ram/tmp/", + "/usr/lib" + ], + "env_exclude": [ + "COLORTERM", + "DBU.*", + "DESKTOP_SESSION", + "DISPLAY", + "GDM.*", + "GNO.*", + "XDG.*", + "LS_COLORS", + "SESSION_MANAGER", + "SSH.*", + "XAU.*" + ], + "metrics": [ + "stat_pid", + "stat_state", + "stat_rss", + "stat_utime", + "stat_stime", + "stat_cutime", + "stat_cstime", + "stat_num_threads", + "stat_comm", + "n_open_files", + "io_read_b", + "io_write_b", + "status_vmdata", + "status_rssfile", + "status_vmswap", + "status_hugetlbpages", + "status_voluntary_ctxt_switches", + "status_nonvoluntary_ctxt_switches", + "syscall_name" + ] + } + +Generating syscalls.map: + +:: + + # ldms-gen-syscalls-map > /etc/sysconfig/ldms.d/plugins-conf/syscalls.map + +Obtaining the currently supported optional metrics list: + +:: + + ldms-plugins.sh linux_proc_sampler + +FILES +=========================== + +Data is obtained from (depending on configuration) the following files +in /proc/[PID]/: + +:: + + cmdline + exe + statm + stat + status + fd + io + oom_score + oom_score_adj + root + syscall + timerslack_ns + wchan + +The system call integer:name mapping varies with kernel and is therefore +read from an input file of the format: + +:: + + # comments + 0 read + ... 
+
+where each line is an <integer> <name> pair. This file can be created
+from the output of ldms-gen-syscalls-map. System call names must be
+less than 64 characters. Unmapped system calls will be given names of
+the form SYS_<number>.
+
+The env_msg option can have its output filtered by json or a text file,
+e.g.:
+
+::
+
+   # env var name regular expressions (all OR-d together)
+   COLORTERM
+   DBU.*
+   DESKTOP_SESSION
+   DISPLAY
+   GDM.*
+   GNO.*
+   XDG.*
+   LS_COLORS
+   SESSION_MANAGER
+   SSH.*
+   XAU.*
+
+The fd_msg option can have its output filtered by json or a text file,
+e.g.:
+
+::
+
+   /dev/
+   /run/
+   /var/
+   /etc/
+   /sys/
+   /tmp/
+   /proc/
+   /ram/tmp/
+   /usr/lib64/
+   /usr/lib/
+
+The files defined with published_pid_dir appear in (for example)
+
+::
+
+   /var/run/ldms-netlink-tracked/[0-9]*
+
+and each contains the JSON message sent by the publisher. Publishers,
+not ldmsd, populate this directory to allow asynchronous startup.
+
+NOTES
+===========================
+
+The value strings given to the options sc_clk_tck and exe_suffix are
+ignored; the presence of the option is sufficient to enable the
+respective features.
+
+Some of the optionally collected data might be security sensitive.
+
+The publication of environment and cmdline (argv) stream data is done
+once at the start of metric collection for the process. The message will
+not be reemitted unless the sampler is restarted. Also, changes to the
+environment and argv lists made within a running process are NOT
+reflected in the /proc data maintained by the Linux kernel. The
+environment and cmdline values may contain non-JSON characters; these
+will be escaped in the published strings.
+
+The publication of file information via fd_msg may be effectively made
+one-shot-per-process by setting fd_msg=2147483647. This will cause
+late-loaded plugin library dependencies to be missed, however.
+
+The status_uid and status_gid values can alternatively be collected as
+"status_real_user", "status_eff_user", "status_sav_user",
+"status_fs_user", "status_real_group", "status_eff_group",
+"status_sav_group", "status_fs_group". These string values are most
+efficiently collected if both the string value and the numeric values
+are collected.
+
+SEE ALSO
+==============================
+
+syscalls(2), ldmsd(8), ldms_quickstart(7), ldmsd_controller(8),
+ldms_sampler_base(7), proc(5), sysconf(3), environ(3).
diff --git a/rtd/man2rst/Plugin_lnet_stats.rst b/rtd/man2rst/Plugin_lnet_stats.rst
new file mode 100644
index 000000000..949907086
--- /dev/null
+++ b/rtd/man2rst/Plugin_lnet_stats.rst
@@ -0,0 +1,90 @@
+=================
+Plugin_lnet_stats
+=================
+
+:Date: 18 Feb 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+==================
+
+Plugin_lnet_stats - man page for the LDMS lnet_stats plugin
+
+SYNOPSIS
+======================
+
+| Within ldmsctl
+| ldmsctl> config name=lnet_stats [ <attr>=<value> ]
+
+DESCRIPTION
+=========================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The lnet_stats plugin provides LNET network traffic
+info from /proc/sys/lnet/stats or equivalent.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+============================================
+
+The lnet_stats plugin uses the sampler_base base class. This man page
+covers only the configuration attributes, or those with default values,
+specific to this plugin; see ldms_sampler_base.man for the
+attributes of the base class.
+ +**config** + | name= [schema= file=] + | ldmsctl configuration line. + + name= + | + | This MUST be lnet_stats. + + schema= + | + | Optional schema name. It is intended that the same sampler on + different nodes with different metrics have a different schema. + If not specified, will default to \`lnet_stats\`. + + file= + | + | Optional full path name of stats file to use. If not supplied, + the default search path described in NOTES is searched. + Typically, this option is only used in test environments which + may not have a real Lustre installation or in order to test + transient disappearance of the file. + +NOTES +=================== + +The default search path followed for LNET stats is: +/sys/kernel/debug/lnet/stats:/proc/sys/lnet/stats. Which file will +exist, if either, depends on the Lustre version and how many volumes are +currently mounted. Be aware that /sys/kernel/debug normally is only +readable by privileged users. + +The stats file disappears when all mounts are unmounted or not yet +mounted. While it is missing, the data set is not updated. + +This assumes the file search path as described above, instead of looking +it up from the Lustre runtime libraries. This avoids compile time +dependence on Lustre which may be upgraded independently of LDMS. This +is not considered a bug. + +EXAMPLES +====================== + +:: + + Within ldmsd_controller or a configuration file: + load name=lnet_stats + config name=lnet_stats producer=vm1_1 instance=vm1_1/lnet_stats component_id=10 + start name=lnet_stats interval=1000000 + +SEE ALSO +====================== + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7) diff --git a/rtd/man2rst/Plugin_loadavg.rst b/rtd/man2rst/Plugin_loadavg.rst new file mode 100644 index 000000000..f56d5ce85 --- /dev/null +++ b/rtd/man2rst/Plugin_loadavg.rst @@ -0,0 +1,90 @@ +============== +Plugin_loadavg +============== + +:Date: 7 Apr 2020 + +.. contents:: + :depth: 3 +.. + +NAME +=============== + +Plugin_loadavg - man page for the LDMS loadavg plugin + +SYNOPSIS +=================== + +| Within ldmsd_controller +| config name=loadavg [ = ] + +DESCRIPTION +====================== + +The loadavg plugin provides OS information from /proc/loadavg + +CONFIGURATION ATTRIBUTE SYNTAX +========================================= + +This plugin uses the sampler_base base class. This man page covers only +the configuration attributes, or those with default values, specific to +the this plugin; see ldms_sampler_base.man for the attributes of the +base class. + +**config** + name= [schema=] [metrics=] [force_integer] + + name= + | + | This MUST be loadavg. + + force_integer + | + | If present, this flag forces load metrics to be stored as + integers of 100*value provided in the proc file. + + schema= + | + | Optional schema name. If schema is not specified, it will be + computed. The default name is loadavg if the metrics option is + not supplied. The default name when metrics is specified is + loadavgXXXXXX, where each X corresponds to whether or not that + metric is included. When force_integer is configured, the + loadavg prefix becomes loadavgi. + + metrics= + | + | comma separated list of metrics to include. If not given, all + are included. The complete list is load1min, load5min, + load15min, runnable, scheduling_entities, newest_pid. + +DATA +=============== + +This reports metrics from /proc/loadavg, which has the format: load1min +load5min load15min runnable/scheduling_entities newest_pid. 
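+
+For reference, a typical /proc/loadavg reads like the following
+(illustrative values only):
+
+::
+
+   0.31 0.27 0.24 1/532 12345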
+
+The load numbers are multiplied by 100 and cast to unsigned integers as
+they are collected, rather than being collected as real numbers.
+
+EXAMPLES
+===================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=loadavg
+   config name=loadavg producer=vm1_1 component_id=1 instance=vm1_1/loadavg
+   start name=loadavg interval=1000000
+
+NOTES
+================
+
+See proc(5) for the definitions of the metrics.
+
+SEE ALSO
+===================
+
+proc(5), ldmsd(8), ldms_sampler_base(7), ldmsd_controller(8)
diff --git a/rtd/man2rst/Plugin_lustre2_client.rst b/rtd/man2rst/Plugin_lustre2_client.rst
new file mode 100644
index 000000000..36707f4d0
--- /dev/null
+++ b/rtd/man2rst/Plugin_lustre2_client.rst
@@ -0,0 +1,100 @@
+=====================
+Plugin_lustre2_client
+=====================
+
+:Date: 26 Oct 2017
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+======================
+
+Plugin_lustre2_client - man page for the LDMS lustre2_client plugin
+
+SYNOPSIS
+==========================
+
+| Within ldmsd_controller or a configuration file:
+| ldmsctl> config name=lustre2_client [ <attr>=<value> ]
+
+DESCRIPTION
+=============================
+
+The lustre2_client plugin provides Lustre metric information.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+================================================
+
+This plugin uses the sampler_base base class. This man page covers only
+the configuration attributes, or those with default values, specific to
+this plugin; see **ldms_sampler_base**\ (7) for the attributes of
+the base class.
+
+**config** **name**\ =\ *<plugin_name>* **osc**\ =\ *<csv>*
+**mdc**\ =\ *<csv>* **llite**\ =\ *<csv>* **osc_path**\ =\ *<path>*
+**mdc_path**\ =\ *<path>* **llite_path**\ =\ *<path>*
+
+Descriptions:
+
+   **name**\ =\ *<plugin_name>*
+      This MUST be lustre2_client.
+
+   **<sampler_base options>**
+      Please see **ldms_sampler_base**\ (7) for sampler_base options.
+
+   **osc**\ =\ *<csv>*
+      CSV list of OSCs.
+
+   **mdc**\ =\ *<csv>*
+      CSV list of MDCs.
+
+   **llite**\ =\ *<csv>*
+      CSV list of LLITEs.
+
+   **osc_path**\ =\ *<path>*
+      A user custom path to osc.
+
+   **mdc_path**\ =\ *<path>*
+      A user custom path to mdc.
+
+   **llite_path**\ =\ *<path>*
+      A user custom path to llite.
+
+NOTES
+=======================
+
+For osc, mdc and llite: if not specified, NONE of the OSCs/MDCs/LLITEs
+will be added. If {osc,mdc,llite} is set to \*, all of the available
+{OSCs,MDCs,LLITEs} at the time will be added.
+
+The names that make up the list of oscs, mdcs and llites do not have to
+include the uid part. For example, 'lustre-ffff8803245d4000' is the
+actual file in /proc/fs/lustre/llite/, but you can just say
+llite=lustre to include this component into the set.
+
+osc_path, mdc_path, llite_path are optional full path names of stats
+files if not in the default location. The default locations are
+/sys/kernel/debug/lustre/{osc, mdc, llite} and /proc/fs/lustre/{osc,
+mdc, llite}, depending on the Lustre version. Be aware that
+/sys/kernel/debug is only readable by privileged users.
+
+BUGS
+======================
+
+None known.
+ +EXAMPLES +========================== + +:: + + load name=lustre2_client + config name=lustre2_client producer=compute1 component_id=1 instance=compute1/lustre2_client llites=* + ldmsctl> start name=lustre2_client interval=1000000 + ldmsctl> quit + +SEE ALSO +========================== + +**ldms_sampler_base**\ (7), **ldmsd**\ (8), **ldms_quickstart**\ (7), +**ldmsd_controller**\ (8) diff --git a/rtd/man2rst/Plugin_lustre_client.rst b/rtd/man2rst/Plugin_lustre_client.rst new file mode 100644 index 000000000..341c4ea55 --- /dev/null +++ b/rtd/man2rst/Plugin_lustre_client.rst @@ -0,0 +1,92 @@ +==================== +Plugin_lustre_client +==================== + +:Date: 1 May 2019 + +.. contents:: + :depth: 3 +.. + +NAME +===================== + +Plugin_lustre_client - man page for the LDMS lustre_client plugin + +SYNOPSIS +========================= + +| Within ldmsd_controller or a configuration file: +| config name=lustre_client [ = ] + +DESCRIPTION +============================ + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. The lustre_client plugin provide a metric set for +each of the lustre client mounts found on a node. The schema is named +"lustre_client". The data for the metric sets is generally found in +/proc/fs/lustre/llite/\*/stats. + +This plugin currently employs zero configuration. The producer name is +set to the hostname by default, and the metric set instance names are +derived from the llite instance name. Any user-supplied configuration +values not documented here will be ignored. + +This plugin should work with at least Lustre versions 2.8, 2.10, and +2.12. + +CONFIGURATION ATTRIBUTE SYNTAX +=============================================== + +**config** + | name= [job_set=] [producer=] + [component_id=] + | configuration line + + name= + | + | This MUST be lustre_client. + + job_set= + | + | The name of the metric set that contains the job id information + (default=job_id) + + producer= + | + | The default used for producer (if not provided) is the result of + gethostname(). The set instance names will be + $producer/$llite_name. + + component_id= + | + | Optional (defaults to 0) number of the host where the sampler is + running. All sets on a host will have the same value. + + perm= + | + | Set the access permissions for the metric sets. (default 440). + +NOTES +====================== + +Improperly spelled option names are not trapped as configuration errors. + +EXAMPLES +========================= + +Within ldmsd_controller or a configuration file: + +:: + + load name=lustre_client + config name=lustre_client + start name=lustre_client interval=1000000 + +SEE ALSO +========================= + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7), +gethostname(2) diff --git a/rtd/man2rst/Plugin_lustre_mdc.rst b/rtd/man2rst/Plugin_lustre_mdc.rst new file mode 100644 index 000000000..7a783a443 --- /dev/null +++ b/rtd/man2rst/Plugin_lustre_mdc.rst @@ -0,0 +1,160 @@ +================= +Plugin_lustre_mdc +================= + +:Date: 1 May 2019 + +.. contents:: + :depth: 3 +.. 
+ +NAME +================== + +Plugin_lustre_mdc - man page for the LDMS lustre_mdc plugin + +SYNOPSIS +====================== + +| Within ldmsd_controller or a configuration file: +| config name=lustre_mdc + +DESCRIPTION +========================= + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. + +The lustre_mdc plugin provides schema lustre_mdc for daemons with read +access to the lustre files in /proc/fs/lustre/mdc/\*/md_stats and +/sys/kernel/debug/lustre/mdc/\*/stats. The metric sets will have +instance names combining the producer name and the mdc name. + +This plugin will work with Lustre versions 2.12 and others which share +these file locations and formats. + +CONFIGURATION ATTRIBUTE SYNTAX +============================================ + +**config** + | name= [producer=] [component_id=] + | configuration line + + name= + | + | This MUST be lustre_mdc. + + producer= + | + | The default used for producer (if not provided) is the result of + gethostname(). The set instance names will be + / + + component_id= + | + | Optional (defaults to 0) number of the host where the sampler is + running. All sets on a host will have the same value. + + job_set= + | + | Optional (defaults to "job_info"). Typically should be set to + /jobid or /job_info depending on choice of + job sampling plugin. + + mdc_timing=0 + | + | Optionally exclude timing data from + /sys/kernel/debug/lustre/mdc/\*/stats. If given, the sampler may + be run by unprivileged users. If /sys/kernel/debug/ cannot be + opened by the user, it is a configuration error unless + mdc_timing=0 is given. + + auto_reset=0 + | + | Turn off the default behavior of resetting the counters when an + overflow condition is detected. Reset is implemented by writing + 0 to the corresponding /proc or /sys file. + +SCHEMA +==================== + +The default schema name is lustre_mdc_ops_timing with all the data +described in DATA REPORTED below included. If mdc_timing=0 is given, +only the operation counts from md_stats are reported and the default +schema name changes to lustre_mdc_ops. + +DATA REPORTED +=========================== + +fs_name: The lustre file system name, e.g. xscratch. mdc: The mdc target +that goes with the metrics, e.g. xscratch-MDT0000. last_reset: The time +of the last reset performed by this sampler for any of its metric sets. + +Operation counts from /proc/fs/lustre/mdc/\*/md_stats. See also kernel +source lustre/lustre/obdclass/lprocfs_status.c and +lustre/lustre/include/obd_class.h: mps_stats[]: "close", "create", +"enqueue", "getattr", "intent_lock", "link", "rename", "setattr", +"fsync", "read_page", "unlink", "setxattr", "getxattr", +"intent_getattr_async", "revalidate_lock", + +Client operation timing statistics (all but .count are in microseconds) +for the following list of fields in +/sys/kernel/debug/lustre/mdc/\*/stats: "req_waittime", "mds_getattr", +"mds_getattr_lock", "mds_close", "mds_readpage", "mds_connect", +"mds_get_root", "mds_statfs", "ldlm_cancel", "obd_ping", "seq_query", +"fld_query" + +and statistics: "\__count" the number of events observed, "\__min" the +minimum event duration observed, "\__max" the maximum duration observed, +"\__sum" the sum of all durations observed, "\__sumsqs" the sum of +squares of all durations observed + +NOTES +=================== + +The counters and file locations supported by this plugin are those +present in Lustre 2.12. The fields labeled [reqs] are omitted. 
Data names not listed here are simply ignored.
+
+The minimum sample interval recommended for this sampler is 5-10
+seconds, as the data volume may be substantial and resolving shorter
+bursts of metadata activity is generally unnecessary.
+
+The average and sample standard deviation can be computed from sum and
+sumsqs, but once these counters roll over to negative values on a high
+up-time client, they may be less useful. The counters can be manually
+reset with bash:
+
+::
+
+   for i in /proc/fs/lustre/mdc/*/md_stats /sys/kernel/debug/lustre/mdc/*/stats; do
+      echo 0 > $i;
+   done
+
+The lustre utility equivalent of this plugin is to inspect the output
+of:
+
+::
+
+   lctl get_param -R mdc.*.stats
+   lctl get_param -R mdc.*.md_stats
+
+Specifying instance=xxx as an option will be ignored.
+
+BUGS
+==================
+
+No known bugs.
+
+EXAMPLES
+======================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=lustre_mdc
+   config name=lustre_mdc
+   start name=lustre_mdc interval=1000000
+
+SEE ALSO
+======================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7),
+lctl(8).
diff --git a/rtd/man2rst/Plugin_lustre_mdt.rst b/rtd/man2rst/Plugin_lustre_mdt.rst
new file mode 100644
index 000000000..6c1355d51
--- /dev/null
+++ b/rtd/man2rst/Plugin_lustre_mdt.rst
@@ -0,0 +1,91 @@
+=================
+Plugin_lustre_mdt
+=================
+
+:Date: 1 May 2019
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+==================
+
+Plugin_lustre_mdt - man page for the LDMS lustre_mdt plugin
+
+SYNOPSIS
+======================
+
+| Within ldmsd_controller or a configuration file:
+| config name=lustre_mdt
+
+DESCRIPTION
+=========================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file.
+
+The lustre_mdt plugin provides metric sets for two different schemas:
+lustre_mdt and lustre_mdt_job_stats.
+
+The metric sets using schema lustre_mdt will have a producer name set to
+the hostname, and the instance name set to the mdt name. The data for
+these metric sets comes from a combination of the data in
+/proc/fs/lustre/mdt/\*/stats and a few other single-value files in
+/proc/fs/lustre/mdt/\*/.
+
+The metric sets using schema lustre_mdt_job_stats will have a producer
+name set to the hostname, and the instance name will be set to a
+combination of the mdt name and the job_id string. The data for these
+metric sets comes from /proc/fs/lustre/mdt/\*/job_stats.
+
+This plugin currently employs zero configuration. Any user-supplied
+configuration values will be ignored. Future versions may add
+configuration options.
+
+This plugin should work with at least Lustre versions 2.8, 2.10, and
+2.12.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+============================================
+
+**config**
+   | name=<plugin_name> [producer=<name>] [component_id=<u64>]
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be lustre_mdt.
+
+   producer=<name>
+      |
+      | The default used for producer (if not provided) is the result of
+        gethostname(). The set instance names will be
+        $producer/$mdt_name.
+
+   component_id=<u64>
+      |
+      | Optional (defaults to 0) number of the host where the sampler is
+        running. All sets on a host will have the same value.
+
+BUGS
+==================
+
+No known bugs.
+ +EXAMPLES +====================== + +Within ldmsd_controller or a configuration file: + +:: + + load name=lustre_mdt + config name=lustre_mdt + start name=lustre_mdt interval=1000000 + +SEE ALSO +====================== + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7) diff --git a/rtd/man2rst/Plugin_lustre_ost.rst b/rtd/man2rst/Plugin_lustre_ost.rst new file mode 100644 index 000000000..3cd09b493 --- /dev/null +++ b/rtd/man2rst/Plugin_lustre_ost.rst @@ -0,0 +1,91 @@ +================= +Plugin_lustre_ost +================= + +:Date: 1 May 2019 + +.. contents:: + :depth: 3 +.. + +NAME +================== + +Plugin_lustre_ost - man page for the LDMS lustre_ost plugin + +SYNOPSIS +====================== + +| Within ldmsd_controller or a configuration file: +| config name=lustre_ost + +DESCRIPTION +========================= + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. + +The lustre_ost plugin provides metric sets for two different schemas: +lustre_ost and lustre_ost_job_stats. + +The metric sets using schema lustre_ost will have a producer name set to +the hostname, and the instance name set to the ost name. The data for +these metrics sets come from a combination of the data in +/proc/fs/lustre/ost/\*/stats and a few other single-value files in +/proc/fs/lustre/ost/\*/. + +The metric sets using schema lustre_ost_job_stats will have a producer +name set to the hostname, and the instance name will be set to a +combination of the ost name and the job_id string. The data for these +metrics sets come from /proc/fs/lustre/ost/\*/job_stats. + +This plugin currently employs zero configuration. Any user-supplied +configuration values will be ignored. Future versions may add +configuration options. + +This plugin should work with at least Lustre versions 2.8, 2.10, and +2.12. + +CONFIGURATION ATTRIBUTE SYNTAX +============================================ + +**config** + | name= [producer=] [component_id=] + | configuration line + + name= + | + | This MUST be lustre_ost. + + producer= + | + | The default used for producer (if not provided) is the result of + gethostname(). The set instance names will be + $producer/$ost_name. + + component_id= + | + | Optional (defaults to 0) number of the host where the sampler is + running. All sets on a host will have the same value. + +BUGS +================== + +No known bugs. + +EXAMPLES +====================== + +Within ldmsd_controller or a configuration file: + +:: + + load name=lustre_ost + config name=lustre_ost + start name=lustre_ost interval=1000000 + +SEE ALSO +====================== + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7) diff --git a/rtd/man2rst/Plugin_meminfo.rst b/rtd/man2rst/Plugin_meminfo.rst new file mode 100644 index 000000000..1cf06fd6d --- /dev/null +++ b/rtd/man2rst/Plugin_meminfo.rst @@ -0,0 +1,71 @@ +============== +Plugin_meminfo +============== + +:Date: 04 Feb 2018 + +.. contents:: + :depth: 3 +.. + +NAME +=============== + +Plugin_meminfo - man page for the LDMS meminfo plugin + +SYNOPSIS +=================== + +| Within ldmsd_controller or a configuration file: +| config name=meminfo [ = ] + +DESCRIPTION +====================== + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. The meminfo plugin provides memory info from +/proc/meminfo. 
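+
+For reference, the first few lines of a typical /proc/meminfo look like
+the following (values vary by system):
+
+::
+
+   MemTotal:       65842344 kB
+   MemFree:        41318232 kB
+   MemAvailable:   58461120 kB
+   Buffers:          511904 kB
+   Cached:         15224888 kB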
+ +CONFIGURATION ATTRIBUTE SYNTAX +========================================= + +The meminfo plugin uses the sampler_base base class. This man page +covers only the configuration attributes, or those with default values, +specific to the this plugin; see ldms_sampler_base.man for the +attributes of the base class. + +**config** + | name= [schema=] + | configuration line + + name= + | + | This MUST be meminfo. + + schema= + | + | Optional schema name. It is intended that the same sampler on + different nodes with different metrics have a different schema. + If not specified, will default to \`meminfo\`. + +BUGS +=============== + +No known bugs. + +EXAMPLES +=================== + +Within ldmsd_controller or a configuration file: + +:: + + load name=meminfo + config name=meminfo producer=vm1_1 instance=vm1_1/meminfo + start name=meminfo interval=1000000 + +SEE ALSO +=================== + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7) diff --git a/rtd/man2rst/Plugin_msr_interlagos.rst b/rtd/man2rst/Plugin_msr_interlagos.rst new file mode 100644 index 000000000..d596b0171 --- /dev/null +++ b/rtd/man2rst/Plugin_msr_interlagos.rst @@ -0,0 +1,425 @@ +===================== +Plugin_msr_interlagos +===================== + +:Date: 04 Jan 2018 + +.. contents:: + :depth: 3 +.. + +NAME +====================== + +Plugin_msr - man page for the LDMS msr interlagos plugin + +SYNOPSIS +========================== + +| Within ldmsd_controller script or a configuration script: +| load name=msr_interlagos +| config name=msr_interlagos action= [ = ] +| add name=msr_interlagos [ = ] + +DESCRIPTION +============================= + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or by use of a +configuration file provided as an argument to the "-c" flag when +starting ldmsd. In the case of the configuration file the commands are +the same as those used via the ldmsd_controller interface. The +msr_interlagos plugin provides msr counter information for the AMD +Family 15h Models 00h-0Fh Processors (Interlagos) only. + +This is a developmental version of the sampler. It may change at any +time. + +The sampler will allow you to select from an identified set of counters. +These are only correctly defined for the AMD Interlagos processor. The +counter addresses, what events are potentially being counted, the event +names, the counter types (core, uncore), etc. are defined in a +configuration file. An example of this file can be found at: +/util/configs/msr/interlagos/bw_msr_configs. + +The actual counters desired can be defined/modified at/during run time +using the defined event names subject to the constraint that each +counter can only count a single event name at a time. If a second name +mapping into an already selected counter is selected, the selection will +fail. The event names must be identified via the action=add directive +for each desired event name. When all desired event names have been +added, the directive action=finalize is used to instantiate the event +name to counter mappings. + +The metric names are reported as generic names in the output set since +their actual identities may be changed dynamically. For any given +counter the first value (e.g., CTR) is the uint64 representation of +the counter configuration used in the counter setup. 
The subsequent values (e.g., Ctr<N>_n or Ctr<N>_c) are the values read
+from the counters (one per numa node, or one per core, with optional
+additional zero values if maxcore is specified; see more below).
+
+To build the msr_interlagos sampler, build with the following flag:
+**--enable-msr_interlagos**
+
+The ldmsd_controller interface includes functions for manipulating the
+sampling state and counter identities as described below.
+
+EXTERNAL MODIFICATION OF COUNTERS AND EXTERNAL INTERACTIONS
+=============================================================================
+
+Note that a user, with appropriate privilege, can change the identity of
+the event being collected via an external methodology such as wrmsr.
+Because of this, the msr_interlagos plugin rechecks the event identity
+of each counter before sampling; however, this is not atomic, so there
+is a slight possibility of a race condition where the user may change
+the counter between the check and the read. If the check fails, zero
+values are reported for all metrics for that particular counter,
+including the control register(s), and the metric name is a zero length
+string. This continues until the identity is reset, either by external
+methods or by issuing the action=rewrite directive.
+
+If a user job changes the counters, it is intended that interaction with
+the Resource Manager can invoke the rewrite command for the counters
+once the user job has exited. A script is supplied that can be called
+from the epilog to perform this event rewrite. The script blocks on the
+rewrite in order to avoid a race condition with the next job setting the
+counters before the rewrite is completed. There is a maximum time limit
+on the blocking call in the script. The script return code indicates
+success or failure. Note that options that require the LDMS daemon to
+check for a flag set by the scheduler are subject to race conditions.
+
+COUNTER CONFIGURATION FILE
+============================================
+
+**!!!WARNING!!!** This plugin only works for Interlagos. Using this
+sampler on other architectures, or misconfiguration of the configuration
+file, may result in unforeseen results with possible damage to the
+system, as the control register addresses will not map into the same
+functionality. **!!!WARNING!!!**
+
+Fields in the MSR sampler configuration file are: Name, Write_addr,
+Event, Umask, Read_addr, os_user, core_ena, core_sel, special_flag,
+ctr_type. Please use or modify the example configuration file provided
+in /util/configs/msr/interlagos/bw_msr_configs.
+
+Valid options for special_flag are MSR_DEFAULT and UNCORE_PER_NUMA.
+MSR_DEFAULT indicates that the associated register will collect the same
+event across all entities (core or numa domain). UNCORE_PER_NUMA is only
+valid for uncore counters, for which the unit mask can be used to
+specify the target numa domain for which events are being counted. A
+unit mask of "0x0" indicates events will be counted for only the numa
+domain in which the counter resides. A unit mask of "0xF" indicates
+events will be counted for only numa domains in which the counter does
+not reside. This enables understanding cache affinity and the level of
+IO crossing numa boundaries. Valid options for ctr_type are CTR_NUMCORE
+and CTR_UNCORE. These distinguish core and uncore counters.
+
+Lines starting with a # mark are comments and are skipped.
+ +:: + + ##### Core counters ########## + TLB_DM, 0xc0010200, 0x046, 0x07, 0xc0010201, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE + TOT_CYC, 0xc0010202, 0x076, 0x00, 0xc0010203, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE + L2_DCM, 0xc0010202, 0x043, 0x00, 0xc0010203, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE + L1_DCM, 0xc0010204, 0x041, 0x01, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE + L1_DCA, 0xc0010204, 0x040, 0x00, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE + #LS_DISP, 0xc0010204, 0x029, 0x01, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE + #LS_DISP, 0xc0010204, 0x029, 0x02, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE + #LS_DISP, 0xc0010204, 0x029, 0x04, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE + LS_DISP, 0xc0010204, 0x029, 0x07, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE + RETIRED_FLOPS, 0xc0010206, 0x003, 0xFF, 0xc0010207, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE + DP_OPS, 0xc0010206, 0x003, 0xF0, 0xc0010207, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE + VEC_INS, 0xc0010208, 0x0CB, 0x04, 0xc0010209, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE + TOT_INS, 0xc001020A, 0x0C0, 0x00, 0xc001020B, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE + ##### Uncore counters ########## + L3_CACHE_MISSES, 0xc0010240, 0x4E1, 0xF7, 0xc0010241, 0x0, 0x1, 0x0, MSR_DEFAULT, CTR_UNCORE + RW_DRAM_EXT, 0xc0010242, 0x1E0, 0xF, 0xc0010243, 0x0, 0x1, 0x0, UNCORE_PER_NUMA, CTR_UNCORE + IO_DRAM_INT, 0xc0010242, 0x1E1, 0x0, 0xc0010243, 0x0, 0x1, 0x0, UNCORE_PER_NUMA, CTR_UNCORE + DCT_PREFETCH, 0xc0010242, 0x1F0, 0x64, 0xc0010243, 0x0, 0x1, 0x0, MSR_DEFAULT, CTR_UNCORE + DCT_RD_TOT, 0xc0010244, 0x1F0, 0x62, 0xc0010245, 0x0, 0x1, 0x0, MSR_DEFAULT, CTR_UNCORE + RW_DRAM_INT, 0xc0010246, 0x1E0, 0x0, 0xc0010247, 0x0, 0x1, 0x0, UNCORE_PER_NUMA, CTR_UNCORE + IO_DRAM_EXT, 0xc0010246, 0x1E1, 0xF, 0xc0010247, 0x0, 0x1, 0x0, UNCORE_PER_NUMA, CTR_UNCORE + DCT_WRT, 0xc0010246, 0x1F0, 0x19, 0xc0010247, 0x0, 0x1, 0x0, MSR_DEFAULT, CTR_UNCORE + # + # Note that for the following, CTR_NUMCORE pairs are: + # [0] Control: 0xc0010200 Data: 0xc0010201 + # [1] Control: 0xc0010202 Data: 0xc0010203 + # [2] Control: 0xc0010204 Data: 0xc0010205 + # [3] Control: 0xc0010206 Data: 0xc0010207 + # [4] Control: 0xc0010208 Data: 0xc0010209 + # [5] Control: 0xc001020A Data: 0xc001020B + # + # And CTR_UNCORE pairs are: + # [0] Control: 0xc0010240 Data: 0xc0010241 + # [1] Control: 0xc0010242 Data: 0xc0010243 + # [2] Control: 0xc0010244 Data: 0xc0010245 + # [3] Control: 0xc0010246 Data: 0xc0010247 + # + # The first column below indicates the counters available for a particular + # feature. For example [2:0] indicates that the core counters (CTR_NUMCORE) + # 0, 1, and 2, as indicated above, are available to count TLB_DM. + # + # NOTE: For the UNCORE_PER_NUMA case, use 0x0 to exclude external numa access + # and 0xF to exclude local numa access and only count external access. 
+   ##### Core counters ##########
+   #[2:0] TLB_DM, 0xc0010200, 0x046, 0x07, 0xc0010201, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
+   #[2:0] TOT_CYC, 0xc0010202, 0x076, 0x00, 0xc0010203, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
+   #[2:0] L2_DCM, 0xc0010202, 0x043, 0x00, 0xc0010203, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
+   #[5:0] L1_DCM, 0xc0010204, 0x041, 0x01, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
+   #[5:0] L1_DCA, 0xc0010204, 0x040, 0x00, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
+   #[5:0] LS_DISP, 0xc0010204, 0x029, 0x01, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
+   #[5:0] LS_DISP, 0xc0010204, 0x029, 0x02, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
+   #[5:0] LS_DISP, 0xc0010204, 0x029, 0x04, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
+   #[5:0] LS_DISP, 0xc0010204, 0x029, 0x07, 0xc0010205, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
+   #[3] RETIRED_FLOPS, 0xc0010206, 0x003, 0xFF, 0xc0010207, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
+   #[3] DP_OPS, 0xc0010206, 0x003, 0xF0, 0xc0010207, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
+   #[5:0] VEC_INS, 0xc0010208, 0x0CB, 0x04, 0xc0010209, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
+   #[5:0] TOT_INS, 0xc001020A, 0x0C0, 0x00, 0xc001020B, 0x3, 0x0, 0x0, MSR_DEFAULT, CTR_NUMCORE
+   ##### Uncore counters ##########
+   #[3:0] L3_CACHE_MISSES, 0xc0010240, 0x4E1, 0xF7, 0xc0010241, 0x0, 0x1, 0x0, MSR_DEFAULT, CTR_UNCORE
+   #[3:0] RW_DRAM_EXT, 0xc0010242, 0x1E0, 0xF, 0xc0010243, 0x0, 0x1, 0x0, UNCORE_PER_NUMA, CTR_UNCORE
+   #[3:0] IO_DRAM_INT, 0xc0010242, 0x1E1, 0x0, 0xc0010243, 0x0, 0x1, 0x0, UNCORE_PER_NUMA, CTR_UNCORE
+   #[3:0] DCT_PREFETCH, 0xc0010242, 0x1F0, 0x64, 0xc0010243, 0x0, 0x1, 0x0, MSR_DEFAULT, CTR_UNCORE
+   #[3:0] DCT_RD_TOT, 0xc0010244, 0x1F0, 0x62, 0xc0010245, 0x0, 0x1, 0x0, MSR_DEFAULT, CTR_UNCORE
+   #[3:0] RW_DRAM_INT, 0xc0010246, 0x1E0, 0x0, 0xc0010247, 0x0, 0x1, 0x0, UNCORE_PER_NUMA, CTR_UNCORE
+   #[3:0] IO_DRAM_EXT, 0xc0010246, 0x1E1, 0xF, 0xc0010247, 0x0, 0x1, 0x0, UNCORE_PER_NUMA, CTR_UNCORE
+   #[3:0] DCT_WRT, 0xc0010246, 0x1F0, 0x19, 0xc0010247, 0x0, 0x1, 0x0, MSR_DEFAULT, CTR_UNCORE
+
+OUTPUT FORMAT
+===============================
+
+Example output format from the "ldms_ls" command is shown below. Since
+the counters can be added in any order and be changed dynamically, the
+names are generic (e.g., Ctr0_n), with CtrN_name being the string version
+of the name and CtrN_wctl being the write control register (event code
+and unit mask for the msr variable assigned to that counter).
+
+This is followed by a vector of the values. If there is only 1 value in
+the vector, then the name is CtrN. If there is a value per numa domain,
+then the name is CtrN_n. If there is a value per core, then the name is
+CtrN_c.
+
+If the write control register is the same for all values in the vector,
+it is only written once and called CtrN_wctl. If the write control
+register is different for the values in the vector, as it would be for
+the per numa domain values, then the write control register variable is
+a vector of length > 1 and is named CtrN_wctl_n. Zeros in the
+CtrN_wctl_n indicate that the "maxcore" value specified in the
+configuration of the sampler was greater than the actual number of cores
+and hence those wctl and variable data values will be 0.
+ +Example output is below: + +:: + + nid00010/msr_interlagos: consistent, last update: Sun Oct 30 16:34:16 2016 [4398us] + M u64 component_id 10 + D u64 job_id 0 + D char[] Ctr0_name "L3_CACHE_MISSES" + D u64[] Ctr0_wctl 85903603681 + D u64[] Ctr0_n 8761095,660101,0,0 + D char[] Ctr1_name "DCT_RD_TOT" + D u64[] Ctr1_wctl 73018663664 + D u64[] Ctr1_n 16748451,1103973,0,0 + D char[] Ctr2_name "RW_DRAM_EXT" + D u64[] Ctr2_wctl_n 73018642144,73018641888,0,0 + D u64[] Ctr2_n 4901448,7120727,0,0 + D char[] Ctr3_name "RW_DRAM_INT" + D u64[] Ctr3_wctl_n 73018638816,73018639072,0,0 + D u64[] Ctr3_n 74099900,3773483,0,0 + D char[] Ctr4_name "TOT_CYC" + D u64[] Ctr4_wctl 4391030 + D u64[] Ctr4_c 775759456,2595008788,234822206,155962379,51951208,53210798,82771568,52716295,85501768,50656894,175839012,619930959,179902397,110558187,334344071,353769784,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + D char[] Ctr5_name "TOT_INS" + D u64[] Ctr5_wctl 4391104 + D u64[] Ctr5_c 211085929,410194651,45686350,11096207,4489395,4565853,13261794,3626609,15062986,3753527,3802413,194511990,55444449,7321398,39989531,36190191,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + D char[] Ctr6_name "L1_DCM" + D u64[] Ctr6_wctl 4391233 + D u64[] Ctr6_c 5101215,22654419,1078523,247674,101807,99840,403194,75661,403958,81801,106359,2316889,663984,186842,944343,921712,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + D char[] Ctr7_name "RETIRED_FLOPS" + D u64[] Ctr7_wctl 4456195 + D u64[] Ctr7_c 122,197,408,57,3,0,2,0,0,0,2,131,272,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + D char[] Ctr8_name "VEC_INS" + D u64[] Ctr8_wctl 4392139 + D u64[] Ctr8_c 13185,32428971,9960,8153,65,0,6517,0,2863,0,280,497910,88393,624,59806,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + D char[] Ctr9_name "TLB_DM" + D u64[] Ctr9_wctl 4392774 + D u64[] Ctr9_c 1312,131553,1080,698,154,2,546,3,266,59,125,678,901,196,6254,155,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + +LDMSD_CONTROLLER CONFIGURATION COMMANDS ORDER +=============================================================== + +Configuration commands are intended to be issued in the following order: + +- load + +- config action=initialize + +- config action=add (one or more) + +- config action=finalize (one or more) + +- start + +The following config commands can be issued anytime after the start in +any order + +- config action=halt + +- config action=continue + +- config action=reassign + +- config action=rewrite + +LDMSD_CONTROLLER CONFIGURATION ATTRIBUTE SYNTAX +================================================================= + +The msr_interlagos plugin uses the sampler_base base class. This man +page covers only the configuration attributes, or those with default +values, specific to the this plugin; see ldms_sampler_base.man for the +attributes of the base class. + +**config** + | name= action= [ = ... ] + | configuration line + + name= + | + | This MUST be msr_interlagos + + action= + | + | Options are initialize, add, finalize, halt, continue, reassign, + rewrite, and ls: + + **initialize** + | corespernuma= conffile= [maxcore= + schema= ] + | initialize the plugin. sampler_base configuration arguments + should be specified at this point. + + corespernuma= + | + | Cores per numa node. Used to determine which and how many + cores are used in setting counters that report per numa node. + + maxcore= + | + | Maxcores that will be reported for all core counters and will + also be used in counters that report per numa node. Must be + >= actual number of cores. Any additional values will be + reported with 0 values. Optional. 
Defaults to using the + actual number of cores. + + schema= + | + | Schema name. Optional. Defaults to msr_interlagos. + + **add** + | metricname= + | add a counter metric to the set. The metric set will be built in + the order the metrics are added + + metricname= + | + | The name of counter e.g., L3_CACHE_MISSES. Options are listed + in a separate section of this man page. + + **finalize** + | + | creates the set after all the adds. No metrics may be added + after this point. + + **halt** + | metricname= + | halts collection for this counter. Zero values will be returned + for all metrics for this counter. + + metricname= + | + | The name of counter e.g., L3_CACHE_MISSES. metricname=all + halts all. + + **continue** + | metricname= + | continues collection for this counter after a halt. + + metricname= + | + | The name of counter e.g., L3_CACHE_MISSES. metricname=all + continues all. + + **rewrite** + | metricname= + | rewrites the counter variable. Used in case the counter variable + has been changed for this address external to ldms. + + metricname= + | + | The name of counter e.g., L3_CACHE_MISSES. metricname=all + rewrites all counters. + + **reassign** + | oldmetricname= newmetricname= + | replaces a metric in the metric set with a new one. It must be + the same size (e.g., numcores vs single value) as the previous + counter. + + oldmetricname= + | + | The name of counter to be replaced e.g., TOT_CYC + + newmetricname= + | + | The name of counter that the previous variable will be + replaced with e.g., TOT_INS + + **ls** + | + | writes info about the intended counters to the log file. + +BUGS +====================== + +The sampler is not robust to errors in the configuration file (i.e., +there is no error checking with respect to registers being written to or +the contents being written). An error could result in unexpected +operation including damage to the host. + +NOTES +======================= + +- This is a developmental version of the sampler. It may change at any + time. + +- The format of the configuration file and the fields has changed since + the v2 release. + +- This plugin only works for Interlagos. Using this sampler on other + architectures may result in badness as the addresses will not be + correct. + +EXAMPLES +========================== + +Within ldmsd_controller or a configuration file: + +| config name=msr_interlagos action=initialize producer=nid00010 + instance=nid00010 component_id=10 corespernuma=8 + conffile=/XXX/msr_conf.txt +| config name=msr_interlagos action=add metricname=L3_CACHE_MISSES +| config name=msr_interlagos action=add metricname=TOT_CYC +| config name=msr_interlagos action=finalize +| config name=msr_interlagos action=reassign oldmetricname=TOT_CYC + newmetricname=TOT_INS +| config name=msr_interlagos action=halt metricname=TOT_CYC + +SEE ALSO +========================== + +ldmsd(7), ldms_quickstart(7), ldms_sampler_base(7), +Plugin_store_function_csv(7), ldmsd_controller(8) diff --git a/rtd/man2rst/Plugin_opa2.rst b/rtd/man2rst/Plugin_opa2.rst new file mode 100644 index 000000000..f8139f351 --- /dev/null +++ b/rtd/man2rst/Plugin_opa2.rst @@ -0,0 +1,90 @@ +=========== +Plugin_opa2 +=========== + +:Date: 5 Feb 2018 + +.. contents:: + :depth: 3 +.. 
+
+NAME
+============
+
+Plugin_opa2 - man page for the LDMS opa2 OmniPath network plugin
+
+SYNOPSIS
+================
+
+| Within ldmsd_controller or a configuration file:
+| load name=opa2
+| config name=opa2 [ <attr>=<value> ]
+
+DESCRIPTION
+===================
+
+The opa2 plugin provides local port counters from OmniPath hardware. A
+separate data set is created for each port. All sets use the same
+schema.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+======================================
+
+**config**
+   | name=<plugin_name> producer=<pname> instance=<set_name>
+     [schema=<sname>] [component_id=<compid>] [ports=<portlist>]
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be opa2.
+
+   producer=<pname>
+      |
+      | The producer string value.
+
+   instance=<set_name>
+      |
+      | The set_name supplied is ignored, and the name
+        $producer/$CA/$port is used.
+
+   schema=<sname>
+      |
+      | Optional schema name. Default opa2. The same schema is used for
+        all sets.
+
+   component_id=<compid>
+      |
+      | Optional component identifier. Defaults to zero.
+
+   ports=<portlist>
+      |
+      | Port list is a comma separated list of ca_name.portnum or a
+        '\*'. The default is '\*', which collects a set for every host
+        fabric interface port.
+
+BUGS
+============
+
+None known.
+
+EXAMPLES
+================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=opa2
+   config name=opa2 producer=compute1 instance=compute1/opa2 component_id=1
+   start name=opa2 interval=1000000
+
+NOTES
+=============
+
+This sampler will be expanded in the future to capture additional
+metrics.
+
+SEE ALSO
+================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8)
diff --git a/rtd/man2rst/Plugin_papi.rst b/rtd/man2rst/Plugin_papi.rst
new file mode 100644
index 000000000..d4e5cfa54
--- /dev/null
+++ b/rtd/man2rst/Plugin_papi.rst
@@ -0,0 +1,112 @@
+===========
+Plugin_papi
+===========
+
+:Date: 09 May 2016
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+============
+
+Plugin_papi - man page for the LDMS papi sampler plugin.
+
+SYNOPSIS
+================
+
+| Within ldmsctl
+| ldmsctl> config name=spapi [ <attr>=<value> ]
+
+DESCRIPTION
+===================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsctl. The papi sampler plugin
+runs on the nodes and provides data about the occurrence of
+micro-architectural events, using the PAPI library to access hardware
+performance counters.
+
+ENVIRONMENT
+===================
+
+You will need to build LDMS with --enable-papi. The PAPI library must be
+available through the plugin library path.
+
+LDMSCTL CONFIGURATION ATTRIBUTE SYNTAX
+==============================================
+
+**config**
+   | name=<plugin_name> events=<csv> pid=<pid> producer=<pname>
+     instance=<set_name> [schema=<sname>]
+     [component_id=<compid> with_jobid=<bool>]
+   | ldmsctl configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be spapi.
+
+   producer=<pname>
+      |
+      | The producer string value.
+
+   instance=<set_name>
+      |
+      | The name of the metric set.
+
+   schema=<sname>
+      |
+      | Optional schema name. It is intended that the same sampler on
+        different nodes with different metrics have a different schema.
+
+   component_id=<compid>
+      |
+      | Optional component identifier. Defaults to zero.
+
+   with_jobid=<bool>
+      |
+      | Option to collect the job id with the set, or 0 if not.
+
+   events=<csv>
+      |
+      | Comma separated list of events. Available events can be
+        determined using the papi_avail command if PAPI is installed on
+        the system.
+
+   pid=<pid>
+      |
+      | The PID of the process being monitored.
+
+NOTES
+=============
+
+In order to check if an event is available on the system you can run
+papi_avail.
+
+BUGS
+============
+
+No known bugs.
+
+EXAMPLES
+================
+
+The following is a short example that measures 4 events.
+ |
+ | Total CPU cycles
+ | Total CPU instructions
+ | Total branch instructions
+ | Mispredicted branch instructions
+
+$ldmsctl -S $LDMSD_SOCKPATH
+
+| ldmsctl> load name=spapi
+| ldmsctl> config name=spapi producer=$PRODUCER_NAME
+  instance=$INSTANCE_NAME pid=$PID
+  events=PAPI_TOT_INS,PAPI_TOT_CYC,PAPI_BR_INS,PAPI_BR_MSP
+| ldmsctl> start name=spapi interval=$INTERVAL_VALUE
+| ldmsctl> quit
+
+SEE ALSO
+================
+
+papi_avail(1), ldmsd(7), ldms_quickstart(7)
diff --git a/rtd/man2rst/Plugin_papi_sampler.rst b/rtd/man2rst/Plugin_papi_sampler.rst
new file mode 100644
index 000000000..40ed5ec72
--- /dev/null
+++ b/rtd/man2rst/Plugin_papi_sampler.rst
@@ -0,0 +1,129 @@
+===================
+Plugin_papi_sampler
+===================
+
+:Date: 30 Sep 2019
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+====================
+
+Plugin_papi_sampler - man page for the LDMSD papi_sampler plugin
+
+SYNOPSIS
+========================
+
+Within ldmsd_controller or a configuration file: **config**
+**name=papi_sampler** **producer=**\ *PRODUCER*
+**instance=**\ *INSTANCE* [ **component_id=**\ *COMP_ID* ] [
+**stream=**\ *STREAM* ] [ **job_expiry=**\ *EXPIRY_SEC* ]
+
+DESCRIPTION
+===========================
+
+**papi_sampler** monitors PAPI events of the processes of Slurm jobs.
+
+The job script must define the **SUBSCRIBER_DATA** environment variable
+as a JSON object that has at least a **"papi_sampler"** attribute as
+follows:
+
+   ::
+
+      SUBSCRIBER_DATA='{"papi_sampler":{"file":"/PATH/TO/PAPI.JSON"}}'
+
+where the **"file"** attribute inside **"papi_sampler"** points to a
+JSON-formatted text file containing a user-defined schema name and the
+PAPI events of interest, e.g.
+
+   ::
+
+      {
+        "schema": "my_papi",
+        "events": [
+          "PAPI_TOT_INS",
+          "PAPI_L1_DCM"
+        ]
+      }
+
+**papi_sampler** relies on the **slurm_notifier** SPANK plugin to notify
+it about the starting/stopping of jobs on the node over ldmsd_stream.
+Please consult **Plugin_slurm_notifier(7)** for more information on how
+to deploy and configure it. The value of SUBSCRIBER_DATA from the job
+script is carried over to **papi_sampler** when the job starts, and an
+LDMS set will be created according to the PAPI JSON file pointed to by
+the SUBSCRIBER_DATA. In the case of multi-tenancy (multiple jobs running
+on a node), each job has its own set. The set is deleted *job_expiry*
+seconds after the job exits.
+
+CONFIG OPTIONS
+==============================
+
+**name=papi_sampler**
+   This MUST be papi_sampler (the name of the plugin).
+
+**producer=**\ *PRODUCER*
+   The name of the data producer (e.g. hostname).
+
+**instance=**\ *INSTANCE*
+   This is mandatory because **papi_sampler** extends **sampler_base**
+   and the option is required by the **sampler_base** config. However,
+   the value is ignored and can be anything. The actual name of the
+   **papi_sampler** instance is *PRODUCER*/*SCHEMA*/*JOB_ID*.
+
+**component_id=**\ *COMPONENT_ID*
+   An integer identifying the component (default: *0*).
+
+**stream=**\ *STREAM*
+   The name of the stream that the **slurm_notifier** SPANK plugin uses
+   to notify the job events. This attribute is optional with the default
+   being *slurm*.
+
+**job_expiry=**\ *EXPIRY_SEC*
+   The number of seconds to retain the set after the job has exited. The
+   default value is *60*.
+
+BUGS
+====================
+
+No known bugs.
+ +EXAMPLES +======================== + +Plugin configuration example: + + :: + + load name=papi_sampler + config name=papi_sampler producer=node0 instance=NA component_id=2 job_expiry=10 + start name=papi_sampler interval=1000000 offset=0 + +Job script example: + + :: + + #!/bin/bash + export SUBSCRIBER_DATA='{"papi_sampler":{"file":"/tmp/papi.json"}}' + srun bash -c 'for X in {1..60}; do echo $X; sleep 1; done' + +PAPI JSON example: + + :: + + { + "schema": "my_papi", + "events": [ + "PAPI_TOT_INS", + "PAPI_L1_DCM" + ] + } + +SEE ALSO +======================== + +**Plugin_slurm_notifier**\ (7), **Plugin_syspapi_sampler**\ (7), +**ldmsd**\ (8), **ldms_quickstart**\ (7), **ldmsd_controller**\ (8), +**ldms_sampler_base**\ (7). diff --git a/rtd/man2rst/Plugin_perfevent.rst b/rtd/man2rst/Plugin_perfevent.rst new file mode 100644 index 000000000..8670122c9 --- /dev/null +++ b/rtd/man2rst/Plugin_perfevent.rst @@ -0,0 +1,355 @@ +================ +Plugin_perfevent +================ + +:Date: 18 Feb 2018 + +.. contents:: + :depth: 3 +.. + +NAME +================= + +Plugin_perfevent - man page for the LDMS perfevent sampler plugin. + +SYNOPSIS +===================== + +| Within ldmsctl +| ldmsctl> config name=perfevent [ = ] + +DESCRIPTION +======================== + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. The perfevent sampler plugin runs on the nodes and +provides data about the the occurrence of micro-architectural events +using linux perfevent subsystem by accessing hardware performance +counters. + +ENVIRONMENT +======================== + +You will need to build LDMS with --enable-perfevent. Perfevent subsystem +is available since Linux 2.6.31. + +CONFIGURATION ATTRIBUTE SYNTAX +=========================================== + +The perfevent plugin uses the sampler_base base class. This man page +covers only the configuration attributes, or those with default values, +specific to the this plugin. See ldms_sampler_base.man for the +attributes of the base class; those attributes are specified as part of +the 'init' action arguments. + +**config** + | name= action [schema=] + | configuration line + + name= + | + | This MUST be perfevent. + + schema= + | + | Optional schema name. It is intended that the same sampler on + different nodes with different metrics have a different schema. + + action=init + | + | Perform initialization + + action=del metricname= + | + | Deletes the specified event. + + action=ls + | + | List the currently configured events. + + action=add metricname= pid= cpu= type= id= + | + | Adds a metric to the list of configured events. + | metricname + | The metric name for the event + | pid + | The PID for the process being monitored. The counter will follow + the process to whichever CPU/core is in use. Note that 'pid' and + 'cpu' are mutually exclusive. + | cpu + | Count this event on the specified CPU. This will accumulate + events across all PID that land on the specified CPU/core. Note + that 'pid' and 'cpu' are mutually exclusive. + | type + | The event type. + | id + | The event id. + + The pid and cpu arguments allow specifying which process and CPU to monitor: + | + | pid == 0 and cpu == -1 + | This measures the calling process/thread on any CPU. + | pid == 0 and cpu >= 0 + | This measures the calling process/thread only when running on + the specified CPU. + | pid > 0 and cpu == -1 + | This measures the specified process/thread on any CPU. 
+ | pid > 0 and cpu >= 0 + | This measures the specified process/thread only when running on + the specified CPU. + | pid == -1 and cpu >= 0 + | This measures all processes/threads on the specified CPU. This + requires CAP_SYS_ADMIN capability or a + /proc/sys/kernel/perf_event_paranoid value of less than 1. + | pid == -1 and cpu == -1 + | This setting is invalid and will return an error. + + For more information visit: http://man7.org/linux/man-pages/man2/perf_event_open.2.html + + **type** + | + | This field specifies the overall event type. It has one of the + following values: + | PERF_TYPE_HARDWARE + | This indicates one of the "generalized" hardware events provided + by the kernel. See the id field definition for more details. + | PERF_TYPE_SOFTWARE + | This indicates one of the software-defined events provided by + the kernel (even if no hardware support is available). + | PERF_TYPE_TRACEPOINT + | This indicates a tracepoint provided by the kernel tracepoint + infrastructure. + | PERF_TYPE_HW_CACHE + | This indicates a hardware cache event. This has a special + encoding, described in the id field definition. + | PERF_TYPE_RAW + | This indicates a "raw" implementation-specific event in the id + field. + | PERF_TYPE_BREAKPOINT (since Linux 2.6.33) + | This indicates a hardware breakpoint as provided by the CPU. + Breakpoints can be read/write accesses to an address as well as + execution of an instruction address. + + **id** + | + | This specifies which event you want, in conjunction with the + type field. + | There are various ways to set the id field that are dependent on + the value of the previously described type field. + | What follows are various possible settings for id separated out + by type. + | If type is PERF_TYPE_HARDWARE, we are measuring one of the + generalized hardware CPU events. Not all of these are available + on all platforms. Set id to one of the following: + | PERF_COUNT_HW_CPU_CYCLES + | Total cycles. Be wary of what happens during CPU frequency + scaling. + | PERF_COUNT_HW_INSTRUCTIONS + | Retired instructions. Be careful, these can be affected by + various issues, most notably hardware interrupt counts. + | PERF_COUNT_HW_CACHE_REFERENCES + | Cache accesses. Usually this indicates Last Level Cache accesses + but this may vary depending on your CPU. This may include + prefetches and coherency messages; again this depends on the + design of your CPU. + | PERF_COUNT_HW_CACHE_MISSES + | Cache misses. Usually this indicates Last Level Cache misses; + this is intended to be used in conjunction with the + | PERF_COUNT_HW_CACHE_REFERENCES + | event to calculate cache miss rates. + | PERF_COUNT_HW_BRANCH_INSTRUCTIONS + | Retired branch instructions. Prior to Linux 2.6.35, this used + the wrong event on AMD processors. + | PERF_COUNT_HW_BRANCH_MISSES + | Mispredicted branch instructions. + | PERF_COUNT_HW_BUS_CYCLES + | Bus cycles, which can be different from total cycles. + | PERF_COUNT_HW_STALLED_CYCLES_FRONTEND (since Linux 3.0) + | Stalled cycles during issue. + | PERF_COUNT_HW_STALLED_CYCLES_BACKEND (since Linux 3.0) + | Stalled cycles during retirement. + + | PERF_COUNT_HW_REF_CPU_CYCLES (since Linux 3.3) + | Total cycles; not affected by CPU frequency scaling. + | If type is PERF_TYPE_SOFTWARE, we are measuring software events + provided by the kernel. Set config to one of the following: + | PERF_COUNT_SW_CPU_CLOCK + | This reports the CPU clock, a high-resolution per-CPU timer. 
+ | PERF_COUNT_SW_TASK_CLOCK + | This reports a clock count specific to the task that is running. + | PERF_COUNT_SW_PAGE_FAULTS + | This reports the number of page faults. + | PERF_COUNT_SW_CONTEXT_SWITCHES + | This counts context switches. Until Linux 2.6.34, these were all + reported as user-space events, after that they are reported as + happening in the kernel. + | PERF_COUNT_SW_CPU_MIGRATIONS + | This reports the number of times the process has migrated to a new + CPU. + | PERF_COUNT_SW_PAGE_FAULTS_MIN + | This counts the number of minor page faults. These did not require + disk I/O to handle. + | PERF_COUNT_SW_PAGE_FAULTS_MAJ + | This counts the number of major page faults. These required disk + I/O to handle. + | PERF_COUNT_SW_ALIGNMENT_FAULTS (since Linux 2.6.33) + | This counts the number of alignment faults. These happen when + unaligned memory accesses happen; the kernel can handle these but + it reduces performance. This happens only on some architectures + (never on x86). + | PERF_COUNT_SW_EMULATION_FAULTS (since Linux 2.6.33) + | This counts the number of emulation faults. The kernel sometimes + traps on unimplemented instructions and emulates them for user + space. This can negatively impact performance. + | PERF_COUNT_SW_DUMMY (since Linux 3.12) + | This is a placeholder event that counts nothing. Informational + sample record types such as mmap or comm must be associated with an + active event. This dummy event allows gathering such records + without requiring a counting event. + | If type is PERF_TYPE_TRACEPOINT, then we are measuring kernel + tracepoints. The value to use in id can be obtained from under + debugfs tracing/events/\*/\*/id if ftrace is enabled in the kernel. + | If type is PERF_TYPE_HW_CACHE, then we are measuring a hardware CPU + cache event. To calculate the appropriate id value use the + following equation: + | (perf_hw_cache_id) \| (perf_hw_cache_op_id << 8) \| + (perf_hw_cache_op_result_id << 16) + | where perf_hw_cache_id is one of: + | PERF_COUNT_HW_CACHE_L1D + | for measuring Level 1 Data Cache + | PERF_COUNT_HW_CACHE_L1I + | for measuring Level 1 Instruction Cache + | PERF_COUNT_HW_CACHE_LL + | for measuring Last-Level Cache + | PERF_COUNT_HW_CACHE_DTLB + | for measuring the Data TLB + | PERF_COUNT_HW_CACHE_ITLB + | for measuring the Instruction TLB + | PERF_COUNT_HW_CACHE_BPU + | for measuring the branch prediction unit + | PERF_COUNT_HW_CACHE_NODE (since Linux 3.1) + | for measuring local memory accesses + | and perf_hw_cache_op_id is one of + | PERF_COUNT_HW_CACHE_OP_READ + | for read accesses + | PERF_COUNT_HW_CACHE_OP_WRITE + | for write accesses + | PERF_COUNT_HW_CACHE_OP_PREFETCH + | for prefetch accesses and perf_hw_cache_op_result_id is one of + | PERF_COUNT_HW_CACHE_RESULT_ACCESS + | to measure accesses + | PERF_COUNT_HW_CACHE_RESULT_MISS + | to measure misses + | If type is PERF_TYPE_RAW, then a custom "raw" id value is needed. + Most CPUs support events that are not covered by the "generalized" + events. These are implementation defined; see your CPU manual (for + example the Intel Volume 3B documentation or the AMD BIOS and + Kernel Developer Guide). The libpfm4 library can be used to + translate from the name in the architectural manuals to the raw hex + value perf_event_open() expects in this field. + +NOTES +================== + +The official way of knowing if perf_event_open() support is enabled is +checking for the existence of the file +/proc/sys/kernel/perf_event_paranoid. 
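+
+As a quick illustration of the PERF_TYPE_HW_CACHE id encoding described
+above, the following short Python sketch (not part of the plugin; the
+constants are taken from the kernel header excerpt below) computes the
+id for counting L1 data cache read misses:
+
+::
+
+   # Values from linux/perf_event.h (see the enum excerpt below).
+   PERF_TYPE_HW_CACHE = 3                # "type" to pass to action=add
+   PERF_COUNT_HW_CACHE_L1D = 0           # perf_hw_cache_id
+   PERF_COUNT_HW_CACHE_OP_READ = 0       # perf_hw_cache_op_id
+   PERF_COUNT_HW_CACHE_RESULT_MISS = 1   # perf_hw_cache_op_result_id
+
+   # id = (perf_hw_cache_id) | (perf_hw_cache_op_id << 8) |
+   #      (perf_hw_cache_op_result_id << 16)
+   event_id = (PERF_COUNT_HW_CACHE_L1D
+               | (PERF_COUNT_HW_CACHE_OP_READ << 8)
+               | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))
+
+   print(event_id)  # 65536 -> use as id= with type=3 in action=add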
+
+The enum values for type and id are defined in the kernel headers. Here
+are the values in version 3.9 (retrieved from
+http://lxr.cpsc.ucalgary.ca/lxr/linux+v3.9/include/uapi/linux/perf_event.h#L28):
+
+::
+
+   enum perf_type_id {
+           PERF_TYPE_HARDWARE   = 0,
+           PERF_TYPE_SOFTWARE   = 1,
+           PERF_TYPE_TRACEPOINT = 2,
+           PERF_TYPE_HW_CACHE   = 3,
+           PERF_TYPE_RAW        = 4,
+           PERF_TYPE_BREAKPOINT = 5,
+
+           PERF_TYPE_MAX,                  /* non-ABI */
+   };
+
+   enum perf_hw_id {
+           /*
+            * Common hardware events, generalized by the kernel:
+            */
+           PERF_COUNT_HW_CPU_CYCLES              = 0,
+           PERF_COUNT_HW_INSTRUCTIONS            = 1,
+           PERF_COUNT_HW_CACHE_REFERENCES        = 2,
+           PERF_COUNT_HW_CACHE_MISSES            = 3,
+           PERF_COUNT_HW_BRANCH_INSTRUCTIONS     = 4,
+           PERF_COUNT_HW_BRANCH_MISSES           = 5,
+           PERF_COUNT_HW_BUS_CYCLES              = 6,
+           PERF_COUNT_HW_STALLED_CYCLES_FRONTEND = 7,
+           PERF_COUNT_HW_STALLED_CYCLES_BACKEND  = 8,
+           PERF_COUNT_HW_REF_CPU_CYCLES          = 9,
+
+           PERF_COUNT_HW_MAX,              /* non-ABI */
+   };
+
+   /*
+    * Generalized hardware cache events:
+    *
+    *       { L1-D, L1-I, LLC, ITLB, DTLB, BPU, NODE } x
+    *       { read, write, prefetch } x
+    *       { accesses, misses }
+    */
+   enum perf_hw_cache_id {
+           PERF_COUNT_HW_CACHE_L1D  = 0,
+           PERF_COUNT_HW_CACHE_L1I  = 1,
+           PERF_COUNT_HW_CACHE_LL   = 2,
+           PERF_COUNT_HW_CACHE_DTLB = 3,
+           PERF_COUNT_HW_CACHE_ITLB = 4,
+           PERF_COUNT_HW_CACHE_BPU  = 5,
+           PERF_COUNT_HW_CACHE_NODE = 6,
+
+           PERF_COUNT_HW_CACHE_MAX,        /* non-ABI */
+   };
+
+   enum perf_hw_cache_op_id {
+           PERF_COUNT_HW_CACHE_OP_READ     = 0,
+           PERF_COUNT_HW_CACHE_OP_WRITE    = 1,
+           PERF_COUNT_HW_CACHE_OP_PREFETCH = 2,
+
+           PERF_COUNT_HW_CACHE_OP_MAX,     /* non-ABI */
+   };
+
+   enum perf_hw_cache_op_result_id {
+           PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0,
+           PERF_COUNT_HW_CACHE_RESULT_MISS   = 1,
+
+           PERF_COUNT_HW_CACHE_RESULT_MAX,  /* non-ABI */
+   };
+
+   /*
+    * Special "software" events provided by the kernel, even if the
+    * hardware does not support performance events. These events measure
+    * various physical and sw events of the kernel (and allow the
+    * profiling of them as well):
+    */
+   enum perf_sw_ids {
+           PERF_COUNT_SW_CPU_CLOCK        = 0,
+           PERF_COUNT_SW_TASK_CLOCK       = 1,
+           PERF_COUNT_SW_PAGE_FAULTS      = 2,
+           PERF_COUNT_SW_CONTEXT_SWITCHES = 3,
+           PERF_COUNT_SW_CPU_MIGRATIONS   = 4,
+           PERF_COUNT_SW_PAGE_FAULTS_MIN  = 5,
+           PERF_COUNT_SW_PAGE_FAULTS_MAJ  = 6,
+           PERF_COUNT_SW_ALIGNMENT_FAULTS = 7,
+           PERF_COUNT_SW_EMULATION_FAULTS = 8,
+
+           PERF_COUNT_SW_MAX,              /* non-ABI */
+   };
+
+BUGS
+=================
+
+No known bugs.
+
+EXAMPLES
+=====================
+
+The following is a short example that measures 4 events.
+ |
+ | Total CPU cycles
+ | Total CPU instructions
+ | Total branch instructions
+ | Mispredicted branch instructions
+
+| If we set the value of PID=1234 and CPU_NUM is -1, this measures the
+  process with pid=1234 on any CPU. If the CPU_NUM is 1, this measures
+  the process with pid=1234 only on CPU 1.
+| If we set the value of PID=-1 and CPU_NUM is 1, this measures all
+  processes/threads on CPU number 1. This requires the CAP_SYS_ADMIN
+  capability or a /proc/sys/kernel/perf_event_paranoid value of less
+  than 1.
+ +$ldmsctl -S $LDMSD_SOCKPATH + +| ldmsctl> load name=perfevent +| ldmsctl> config name=perfevent action=add + metricname="PERF_COUNT_HW_CPU_CYCLES" pid=$PID cpu=$CPU_NUM type=0 + id=0 +| ldmsctl> config name=perfevent action=add + metricname="PERF_COUNT_HW_INSTRUCTIONS" pid=$PID cpu=$CPU_NUM type=0 + id=1 +| ldmsctl> config name=perfevent action=add + metricname="PERF_COUNT_HW_BRANCH_INSTRUCTIONS" pid=$PID cpu=$CPU_NUM + type=0 id=4 +| ldmsctl> config name=perfevent action=add + metricname="PERF_COUNT_HW_BRANCH_MISSES" pid=$PID cpu=$CPU_NUM type=0 + id=5 +| ldmsctl> config name=perfevent action=init instance=$INSTANCE_NAME + producer=$PRODUCER_NAME +| ldmsctl> start name=perfevent interval=$INTERVAL_VALUE +| ldmsctl> quit + +SEE ALSO +===================== + +PERF_EVENT_OPEN(2), ldmsd(7), ldms_quickstart(7), ldms_sampler_base(7) diff --git a/rtd/man2rst/Plugin_procdiskstats.rst b/rtd/man2rst/Plugin_procdiskstats.rst new file mode 100644 index 000000000..97185aad6 --- /dev/null +++ b/rtd/man2rst/Plugin_procdiskstats.rst @@ -0,0 +1,81 @@ +==================== +Plugin_procdiskstats +==================== + +:Date: 18 Feb 2018 + +.. contents:: + :depth: 3 +.. + +NAME +===================== + +Plugin_procdiskstats - man page for the LDMS procdiskstats plugin + +SYNOPSIS +========================= + +| Within ldmsd_controller or a configuration file: +| config name=procdiskstats [ = ] + +DESCRIPTION +============================ + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. The procdiskstats plugin provides disk info. + +WARNING: This sampler is unsupported. + +CONFIGURATION ATTRIBUTE SYNTAX +=============================================== + +The procdiskstats plugin uses the sampler_base base class. This man page +covers only the configuration attributes, or those with default values, +specific to the this plugin; see ldms_sampler_base.man for the +attributes of the base class. + +**config** + | name= [schema=] device= + | configuration line + + name= + | + | This MUST be procdiskstats. + + schema= + | + | Optional schema name. It is intended that the same sampler on + different nodes with different metrics have a different schema. + If not specified, will default to \`procdiskstats\`. + + device= + | + | Comma separated list of devices + +BUGS +===================== + +No known bugs. + +NOTES +====================== + +- This sampler is unsupported. + +EXAMPLES +========================= + +Within ldmsd_controller or a configuration file: + +:: + + load name=procdiskstats + config name=procdiskstats producer=vm1_1 instance=vm1_1/procdiskstats component_id=1 + start name=procdiskstats interval=1000000 + +SEE ALSO +========================= + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7) diff --git a/rtd/man2rst/Plugin_procinterrupts.rst b/rtd/man2rst/Plugin_procinterrupts.rst new file mode 100644 index 000000000..d8aa92f2e --- /dev/null +++ b/rtd/man2rst/Plugin_procinterrupts.rst @@ -0,0 +1,71 @@ +===================== +Plugin_procinterrupts +===================== + +:Date: 10 Feb 2018 + +.. contents:: + :depth: 3 +.. 
+ +NAME +====================== + +Plugin_interrupts - man page for the LDMS interrupts plugin + +SYNOPSIS +========================== + +| Within ldmsd_controller or a configuration file: +| config name=interrupts [ = ] + +DESCRIPTION +============================= + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. The interrupts plugin provides info from +/proc/interrupts. The metric name will be irq.#CPU_NUMBER. + +CONFIGURATION ATTRIBUTE SYNTAX +================================================ + +The procinterrupts plugin uses the sampler_base base class. This man +page covers only the configuration attributes, or those with default +values, specific to the this plugin; see ldms_sampler_base.man for the +attributes of the base class. + +**config** + | name= [schema=] + | configuration line + + name= + | + | This MUST be procinterrupts. + + schema= + | + | Optional schema name. It is intended that the same sampler on + different nodes with different metrics have a different schema. + If not specified, will default to \`procinterrupts\`. + +BUGS +====================== + +No known bugs. + +EXAMPLES +========================== + +Within ldmsd_controller or a configuration file: + +:: + + load name=procinterrupts + config name=procinterrupts producer=1 instance=vm1_1/procinterrupts + start name=procinterrupts interval=1000000 + +SEE ALSO +========================== + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7) diff --git a/rtd/man2rst/Plugin_procnet.rst b/rtd/man2rst/Plugin_procnet.rst new file mode 100644 index 000000000..eb9b758d0 --- /dev/null +++ b/rtd/man2rst/Plugin_procnet.rst @@ -0,0 +1,76 @@ +============== +Plugin_procnet +============== + +:Date: 9 Apr 2021 + +.. contents:: + :depth: 3 +.. + +NAME +=============== + +Plugin_procnet - man page for the LDMS procnet plugin + +SYNOPSIS +=================== + +| Within ldmsd_controller or a configuration file: +| config name=procnet [common attributes] [exclude_ports=] + +DESCRIPTION +====================== + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. The procnet plugin provides network info from +/proc/net/dev, creating a different set for each device, reporting only +active devices, and reporting an active device only when counters +change. + +CONFIGURATION ATTRIBUTE SYNTAX +========================================= + +The procnet plugin uses the sampler_base base class. This man page +covers only the configuration attributes, or those with default values, +specific to the this plugin; see ldms_sampler_base.man for the +attributes of the base class. + +**config** + | name= exclude_ports= + | configuration line + + name= + | + | This MUST be procnet. + + exclude_ports= + | + | Comma separated list of ports to exclude. + + schema= + | + | Optional schema name. If not specified, will default to + \`procnet\`. + +BUGS +=============== + +Interfaces reported and exclude_ports lists are each limited to 20. 
+ +EXAMPLES +=================== + +Within ldmsd_controller or a configuration file: + +:: + + load name=procnet + config name=procnet producer=vm1_1 instance=vm1_1/procnet exclude_ports=lo + start name=procnet interval=1000000 + +SEE ALSO +=================== + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7) diff --git a/rtd/man2rst/Plugin_procnetdev.rst b/rtd/man2rst/Plugin_procnetdev.rst new file mode 100644 index 000000000..b0a95b81c --- /dev/null +++ b/rtd/man2rst/Plugin_procnetdev.rst @@ -0,0 +1,77 @@ +================= +Plugin_procnetdev +================= + +:Date: 10 Dec 2018 + +.. contents:: + :depth: 3 +.. + +NAME +================== + +Plugin_procnetdev - man page for the LDMS procnetdev plugin + +SYNOPSIS +====================== + +| Within ldmsd_controller or a configuration file: +| config name=procnetdev [ = ] + +DESCRIPTION +========================= + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. The procnetdev plugin provides network info from +/proc/net/dev. + +CONFIGURATION ATTRIBUTE SYNTAX +============================================ + +The procnetdev plugin uses the sampler_base base class. This man page +covers only the configuration attributes, or those with default values, +specific to the this plugin; see ldms_sampler_base.man for the +attributes of the base class. + +**config** + | name= ifaces= + | configuration line + + name= + | + | This MUST be procnetdev. + + ifaces= + | + | CSV list of ifaces. Order matters. Non-existent ifaces will be + included and default to 0-value data. + + schema= + | + | Optional schema name. It is intended that the same sampler on + different nodes with different metrics or ifaces have a + different schema. If not specified, will default to + \`procnetdev\`. + +BUGS +================== + +Interfaces list is limited to 20. + +EXAMPLES +====================== + +Within ldmsd_controller or a configuration file: + +:: + + load name=procnetdev + config name=procnetdev producer=vm1_1 instance=vm1_1/procnetdev iface=eth0,eth1 + start name=procnetdev interval=1000000 + +SEE ALSO +====================== + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7) diff --git a/rtd/man2rst/Plugin_procnetdev2.rst b/rtd/man2rst/Plugin_procnetdev2.rst new file mode 100644 index 000000000..37f39f743 --- /dev/null +++ b/rtd/man2rst/Plugin_procnetdev2.rst @@ -0,0 +1,79 @@ +================== +Plugin_procnetdev2 +================== + +:Date: 07 Jan 2022 + +.. contents:: + :depth: 3 +.. + +NAME +=================== + +Plugin_procnetdev2 - man page for the LDMS procnetdev2 plugin + +SYNOPSIS +======================= + +| Within ldmsd_controller or a configuration file: +| config name=procnetdev2 [ = ] + +DESCRIPTION +========================== + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. The procnetdev2 plugin uses LDMS_V_LIST and +LDMS_V_RECORD to provide network info from /proc/net/dev. + +CONFIGURATION ATTRIBUTE SYNTAX +============================================= + +The procnetdev2 plugin uses the sampler_base base class. This man page +covers only the configuration attributes, or those with default values, +specific to the this plugin; see ldms_sampler_base.man for the +attributes of the base class. 
+ +**config** + | name= [ifaces=] + | configuration line + + name= + | + | This MUST be procnetdev2. + + ifaces= + | + | (Optional) A CSV list of interfaces to sample. If not specified, + all available interfaces in /proc/net/dev will be reported. It + is OK to specify non-existing interfaces in the ifaces list. + + schema= + | + | Optional schema name. It is intended that the same sampler on + different nodes with different metrics or ifaces have a + different schema. If not specified, will default to + \`procnetdev\`. + +BUGS +=================== + +The maximum number of interfaces is limited to 32. + +EXAMPLES +======================= + +Within ldmsd_controller or a configuration file: + +:: + + load name=procnetdev + config name=procnetdev producer=vm1_1 instance=vm1_1/procnetdev2 ifaces=eth0,eth1 + start name=procnetdev interval=1000000 offset=0 + +SEE ALSO +======================= + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7), +Plugin_procnetdev(7) diff --git a/rtd/man2rst/Plugin_procnfs.rst b/rtd/man2rst/Plugin_procnfs.rst new file mode 100644 index 000000000..49bc21ea3 --- /dev/null +++ b/rtd/man2rst/Plugin_procnfs.rst @@ -0,0 +1,70 @@ +============== +Plugin_procnfs +============== + +:Date: 10 Feb 2018 + +.. contents:: + :depth: 3 +.. + +NAME +=============== + +Plugin_procnfs - man page for the LDMS procnfs plugin + +SYNOPSIS +=================== + +| Within ldmsd_controller or a configuration file: +| config name=procnfs [ = ] + +DESCRIPTION +====================== + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. The procnfs plugin provides info from +/proc/net/rpc/nfs + +CONFIGURATION ATTRIBUTE SYNTAX +========================================= + +The procnfs plugin uses the sampler_base base class. This man page +covers only the configuration attributes, or those with default values, +specific to the this plugin; see ldms_sampler_base.man for the +attributes of the base class. + +**config** + | name= [schema=] + | configuration line + + name= + | + | This MUST be procnfs. + + schema= + | + | Optional schema name. It is intended that the same sampler on + different nodes with different metrics have a different schema. + If not specified, will default to \`procnfs\`. + +BUGS +=============== + +No known bugs. + +EXAMPLES +=================== + +:: + + Within ldmsd_controller or a configuration file: + load name=procnfs + config name=procnfs producer=vm1_1 instance=vm1_1/procnfs + start name=procnfs interval=1000000 + +SEE ALSO +=================== + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7) diff --git a/rtd/man2rst/Plugin_procstat.rst b/rtd/man2rst/Plugin_procstat.rst new file mode 100644 index 000000000..1c49164e7 --- /dev/null +++ b/rtd/man2rst/Plugin_procstat.rst @@ -0,0 +1,85 @@ +=============== +Plugin_procstat +=============== + +:Date: 03 Dec 2016 + +.. contents:: + :depth: 3 +.. + +NAME +================ + +Plugin_procstat - man page for the LDMS procstat plugin + +SYNOPSIS +==================== + +| Within ldmsd_controller or in a configuration file +| config name=procstat [ = ] + +DESCRIPTION +======================= + +The procstat plugin provides cpu utilization info from /proc/stat, +allowing for hyperthreading and downed core variability. 
As
+hyperthreading might be variable and user selectable depending on
+system configuration, the maximum number of cores potentially appearing
+should be set in the plugin options with the maxcpu parameter. Cores not
+actually appearing will be reported as 0 values.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+==========================================
+
+See ldms_sampler_base(7) for the common sampler options.
+
+**config**
+   | maxcpu=<N>
+   | configuration line
+
+   maxcpu=<N>
+      |
+      | Values are 0 to N, where 0 logs only totalized data and N
+        reserves slots for N cores. If fewer than N cores are found,
+        0-values are reported. If more than N cores are found, they are
+        ignored with an INFO note in the log. Default is the number of
+        cores found locally when the sampler is started. If monitored
+        machines may have cores disabled or variable hyperthreading
+        status, set maxcpu to the most cores that will be reported
+        anywhere in the cluster.
+
+   sc_clk_tck=1
+      |
+      | Enable optional reporting of sysconf(\_SC_CLK_TCK), the
+        scheduler ticks-per-second defined at kernel build time as
+        CONFIG_HZ, collected from sysconf(3). Typically HPC systems use
+        100, while 250, 300, 1000 may also occur.
+
+DATA
+================
+
+This reports both the interrupt count and the time spent processing
+interrupts. For detailed interrupt data by type, consider
+Plugin_procinterrupts(7).
+
+BUGS
+================
+
+Reporting all interrupts by name is not implemented.
+
+EXAMPLES
+====================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=procstat
+   config name=procstat producer=vm1_1 component_id=1 maxcpu=4 instance=vm1_1/procstat with_jobid=0
+   start name=procstat interval=1000000 offset=0
+
+SEE ALSO
+====================
+
+ldms_sampler_base(7), Plugin_procinterrupts(7), Kernel source
+fs/proc/stat.c and proc(5), ldmsd(8), ldmsd_controller(8)
diff --git a/rtd/man2rst/Plugin_procstat2.rst b/rtd/man2rst/Plugin_procstat2.rst
new file mode 100644
index 000000000..88f6896a4
--- /dev/null
+++ b/rtd/man2rst/Plugin_procstat2.rst
@@ -0,0 +1,76 @@
+================
+Plugin_procstat2
+================
+
+:Date: 14 Jan 2022
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+=================
+
+Plugin_procstat2 - man page for the LDMS procstat2 plugin
+
+SYNOPSIS
+=====================
+
+| Within ldmsd_controller or a configuration file:
+| config name=procstat2 [ <attr>=<value> ]
+
+DESCRIPTION
+========================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The procstat2 plugin provides data from /proc/stat.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+===========================================
+
+The procstat2 plugin uses the sampler_base base class. This man page
+covers only the configuration attributes, or those with default values,
+specific to this plugin; see ldms_sampler_base(7) for the attributes
+of the base class.
+
+**config**
+   | name=<plugin_name> [schema=<schema>]
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be procstat2.
+
+   schema=<schema>
+      |
+      | Optional schema name. It is intended that the same sampler on
+        different nodes with different metrics have a different schema.
+        If not specified, will default to \`procstat2\`.
+
+   intr_max=<n>
+      |
+      | (Optional) The maximum number of interrupt numbers supported in
+        intr_list. If not specified, intr_max will be the current number
+        of interrupts in the intr list.
+
+BUGS
+=================
+
+No known bugs.
+ +EXAMPLES +===================== + +Within ldmsd_controller or a configuration file: + +:: + + load name=procstat2 + config name=procstat2 producer=vm1_1 instance=vm1_1/procstat2 + start name=procstat2 interval=1000000 + +SEE ALSO +===================== + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7) diff --git a/rtd/man2rst/Plugin_rapl.rst b/rtd/man2rst/Plugin_rapl.rst new file mode 100644 index 000000000..8705cc98d --- /dev/null +++ b/rtd/man2rst/Plugin_rapl.rst @@ -0,0 +1,80 @@ +=========== +Plugin_rapl +=========== + +:Date: 18 Feb 2018 + +.. contents:: + :depth: 3 +.. + +NAME +============ + +Plugin_rapl - man page for the LDMS rapl plugin + +SYNOPSIS +================ + +| Within ldmsd_controller or a configuration file: +| config name=rapl [ = ] + +DESCRIPTION +=================== + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. The rapl plugin provides energy sampling using RAPL +via the PAPI interface for sandybridge. + +WARNING: This sampler is unsupported. + +CONFIGURATION ATTRIBUTE SYNTAX +====================================== + +The rapl plugin uses the sampler_base base class. This man page covers +only the configuration attributes, or those with default values, +specific to the this plugin; see ldms_sampler_base.man for the +attributes of the base class. + +**config** + | name= [schema=] + | configuration line + + name= + | + | This MUST be rapl. + + schema= + | + | Optional schema name. It is intended that the same sampler on + different nodes with different metrics have a different schema. + If not specified, will default to \`rapl\`. + +BUGS +============ + +No known bugs. + +NOTES +============= + +- WARNING: This is for sandybridge only. + +- This sampler is unsupported. + +EXAMPLES +================ + +Within ldmsd_controller or a configuration file: + +:: + + load name=rapl + config name=rapl producer=vm1_1 instance=vm1_1/rapl component_id=1 + start name=rapl interval=1000000 + +SEE ALSO +================ + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7) diff --git a/rtd/man2rst/Plugin_rdc_sampler.rst b/rtd/man2rst/Plugin_rdc_sampler.rst new file mode 100644 index 000000000..8a25f553f --- /dev/null +++ b/rtd/man2rst/Plugin_rdc_sampler.rst @@ -0,0 +1,120 @@ +================== +Plugin_rdc_sampler +================== + +:Date: 1 Apr 2021 + +.. contents:: + :depth: 3 +.. + +NAME +=================== + +Plugin_rdc_sampler - man page for the LDMS rdc_sampler plugin + +SYNOPSIS +======================= + +| Within ldmsd_controller or a configuration file: +| config name=rdc_sampler [ = ] + +DESCRIPTION +========================== + +The rdc_sampler plugin provides AMD gpu device data. Data sets may be +wide or per-device. Plugins for the ldmsd (ldms daemon) are configured +via ldmsd_controller or a configuration file. + +CONFIGURATION ATTRIBUTE SYNTAX +============================================= + +**config** + | name= [producer=] [instance=] + [component_id=] [schema=] [uid=] + [gid=] [perm=] [metrics=LIST] + [update_freq=MICROSEC] [max_keep_age=SEC] [max_keep_samples=N] + | configuration line + + name= + | + | This MUST be rdc_sampler. + + producer=. + | + | The producer string value for the timing set. + + instance= + | + | The set instance names will be suffixed by device number + (gpu%d). + + schema= + | + | Optional schema base name. The default is rdc_sampler. 
The name + base is suffixed to create uniquely defined schema names based + on the plugin options specified. + + component_id= + | + | Optional component identifier for the timing set. Defaults to + zero. + + metrics=LIST + | + | The list of values to be collected as named in rdc_field_t from + rdc/rdc.h. + + update_freq=MICROSEC + | + | An argument passed to rdc_field_watch. + + max_keep_age=SEC + | + | An argument passed to rdc_field_watch. + + max_keep_samples=N + | + | An argument passed to rdc_field_watch. + + warmup=K + | + | Delay K cycles update_freq long before attempting to read data + from the gpu. + +EXAMPLES +======================= + +Within ldmsd_controller or a configuration file: + +:: + + load name=rdc_sampler + config name=rdc_sampler component_id=1 + start name=rdc_sampler interval=1000000 + +NOTES +==================== + +The exact schema name that will be generated can be determined using the +ldms_rdc_schema_name utility. The data available may depend on the +specific GPUs and their configuration. + +The rdc libraries loaded by the plugin may emit inconsequential error +messages to stdout. Two such begin with " ERROR +RdcLibraryLoader.cc" " ERROR RdcMetricFetcherImpl.cc" The +latter suggests you may have requested metrics unsupported by your +hardware. + +BUGS +=================== + +At ldmsd exit, there is a race between sampler termination and the rdc +library thread cleanup. This may lead to an exception being thrown in +the library code that terminates ldmsd with a C++ exception message. + +SEE ALSO +======================= + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), +ldms_rdc_schema_name(1) diff --git a/rtd/man2rst/Plugin_sampler_atasmart.rst b/rtd/man2rst/Plugin_sampler_atasmart.rst new file mode 100644 index 000000000..d0eb73577 --- /dev/null +++ b/rtd/man2rst/Plugin_sampler_atasmart.rst @@ -0,0 +1,87 @@ +======================= +Plugin_sampler_atasmart +======================= + +:Date: 18 Feb 2018 + +.. contents:: + :depth: 3 +.. + +NAME +======================== + +Plugin_sampler_atasmart - man page for the LDMS sampler_atasmart plugin + +SYNOPSIS +============================ + +| Within ldmsd_controller or a configuration file: +| config name=sampler_atasmart [ = ] + +DESCRIPTION +=============================== + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. The sampler_atasmart plugin provides disk info via +sampler_atasmart. + +WARNING: This sampler is unsupported. + +ENVIRONMENT +=============================== + +To build this sampler, the tasmart library must be loaded. + +CONFIGURATION ATTRIBUTE SYNTAX +================================================== + +The sampler_atasmart plugin uses the sampler_base base class. This man +page covers only the configuration attributes, or those with default +values, specific to the this plugin; see ldms_sampler_base.man for the +attributes of the base class. + +**config** + | name= [schema=] disks= + | configuration line + + name= + | + | This MUST be sampler_atasmart. + + schema= + | + | Optional schema name. It is intended that the same sampler on + different nodes with different metrics have a different schema. + If not specified, will default to \`sampler_atasmart\`. + + disks + | + | A comma-separated list of disk names (e.g., /dev/sda,/dev/sda1) + +BUGS +======================== + +No known bugs. + +NOTES +========================= + +- This sampler is unsupported. 
+
+EXAMPLES
+============================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=sampler_atasmart
+   config name=sampler_atasmart producer=vm1_1 instance=vm1_1/sampler_atasmart component_id=1
+   start name=sampler_atasmart interval=1000000
+
+SEE ALSO
+============================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)
diff --git a/rtd/man2rst/Plugin_shm_sampler.rst b/rtd/man2rst/Plugin_shm_sampler.rst
new file mode 100644
index 000000000..df34722ea
--- /dev/null
+++ b/rtd/man2rst/Plugin_shm_sampler.rst
@@ -0,0 +1,71 @@
+==================
+Plugin_shm_sampler
+==================
+
+:Date: 5 March 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+===================
+
+Plugin_shm_sampler - man page for the LDMS shm_sampler plugin
+
+SYNOPSIS
+=======================
+
+| Within ldmsd_controller or a configuration file:
+| load name=shm_sampler
+| config name=shm_sampler [ <attr>=<value> ]
+
+DESCRIPTION
+==========================
+
+The shm_sampler plugin is a sampler plug-in module within LDMS that can
+read from a dynamic number of shm files. These files are tracked by a
+central index file in shared memory. The main usage of this sampler is
+to stream application performance data.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+=============================================
+
+**config**
+   | name=<plugin_name> producer=<producer> instance=<instance>
+     [shm_index=<index_name>][shm_boxmax=<N>][shm_array_max=<N>][shm_metric_max=<N>]
+     [shm_set_timeout=<secs>][component_id=<comp_id>] [schema=<schema>]
+     [job_set=<set_name> job_id=<metric_name> app_id=<metric_name>
+     job_start=<metric_name> job_end=<metric_name>]
+   | configuration line
+
+   producer=<producer>
+      |
+      | A unique name for the host providing the data.
+
+   instance=<instance>
+      |
+      | A unique name for the metric set.
+
+   shm_index=<index_name>
+      |
+      | A unique name for the shared memory index file.
+
+   shm_boxmax=<N>
+      |
+      | Maximum number of entries in the shared memory index file.
+
+   shm_array_max=<N>
+      |
+      | Maximum number of elements in array metrics.
+
+   shm_metric_max=<N>
+      |
+      | Maximum number of metrics.
+
+   shm_set_timeout=<secs>
+      |
+      | No read/write timeout in seconds.
+
+   component_id=<comp_id>
+      |
+      | A unique number for the component being monitored. Defaults to
+        zero.
+
+   schema=<schema>
+      |
+      | The name of the metric set schema. Defaults to the sampler name.
+
+   job_set=<set_name>
+      |
+      | The instance name of the set containing the job data. Default is
+        'job_info'.
+
+   job_id=<metric_name>
+      |
+      | The name of the metric containing the Job Id. Default is
+        'job_id'.
+
+   app_id=<metric_name>
+      |
+      | The name of the metric containing the Application Id. Default is
+        'app_id'.
+
+   job_start=<metric_name>
+      |
+      | The name of the metric containing the Job start time. Default is
+        'job_start'.
+
+   job_end=<metric_name>
+      |
+      | The name of the metric containing the Job end time. Default is
+        'job_end'.
+
+BUGS
+===================
+
+None known.
+
+EXAMPLES
+=======================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=shm_sampler
+   config name=shm_sampler producer=samplerd instance=samplerd/shm_sampler shm_index=/ldms_shm_mpi_index shm_boxmax=4 component_id=23
+   start name=shm_sampler interval=1000000 offset=0
+
+SEE ALSO
+=======================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8)
diff --git a/rtd/man2rst/Plugin_slingshot_info.rst b/rtd/man2rst/Plugin_slingshot_info.rst
new file mode 100644
index 000000000..804b73ae5
--- /dev/null
+++ b/rtd/man2rst/Plugin_slingshot_info.rst
@@ -0,0 +1,71 @@
+=====================
+Plugin_slingshot_info
+=====================
+
+:Date: 1 May 2022
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+======================
+
+Plugin_slingshot_info - man page for the LDMS slingshot_info plugin
+
+SYNOPSIS
+==========================
+
+| Within ldmsd_controller or a configuration file:
+| config name=slingshot_info [ <attr>=<value> ]
+
+DESCRIPTION
+=============================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The slingshot_info plugin provides a single metric
+set that contains a list of records. Each record contains all of the
+informational fields for a single slingshot NIC.
+
+The slingshot_info sampler plugin provides a fairly small set of general
+information about each slingshot NIC, including FRU description, serial
+number, etc. Likely users will want to sample this plugin relatively
+infrequently. For detailed slingshot NIC counter data, see the
+slingshot_metrics sampler plugin.
+
+The schema is named "slingshot_info" by default.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+================================================
+
+The slingshot_info plugin uses the sampler_base base class. This man
+page covers only the configuration attributes, or those with default
+values, specific to this plugin; see ldms_sampler_base.man for the
+attributes of the base class.
+
+**config**
+   | name=<plugin_name>
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be slingshot_info.
+
+EXAMPLES
+==========================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=slingshot_info
+   config name=slingshot_info producer=host1 instance=host1/slingshot_info
+   start name=slingshot_info interval=1000000 offset=0
+
+SEE ALSO
+==========================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7),
+Plugin_slingshot_metrics(7)
diff --git a/rtd/man2rst/Plugin_slingshot_metrics.rst b/rtd/man2rst/Plugin_slingshot_metrics.rst
new file mode 100644
index 000000000..9bd5c2577
--- /dev/null
+++ b/rtd/man2rst/Plugin_slingshot_metrics.rst
@@ -0,0 +1,106 @@
+========================
+Plugin_slingshot_metrics
+========================
+
+:Date: 1 May 2022
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+=========================
+
+Plugin_slingshot_metrics - man page for the LDMS slingshot_metrics
+plugin
+
+SYNOPSIS
+=============================
+
+| Within ldmsd_controller or a configuration file:
+| config name=slingshot_metrics [ <attr>=<value> ]
+
+DESCRIPTION
+================================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The slingshot_metrics plugin provides a single
+metric set that contains a list of records. Each record contains all of
+the metrics for a single slingshot NIC.
+
+The slingshot_metrics sampler plugin provides detailed counter metrics
+for each slingshot NIC.
+
+The schema is named "slingshot_metrics" by default.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+===================================================
+
+The slingshot_metrics plugin uses the sampler_base base class. This man
+page covers only the configuration attributes, or those with default
+values, specific to this plugin; see ldms_sampler_base.man for the
+attributes of the base class.
+
+**config**
+   | name=<plugin_name> [counters=<names>] [counters_file=<path>]
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be slingshot_metrics.
+
+   counters=<names>
+      |
+      | (Optional) A CSV list of slingshot counter names. See Section
+        COUNTER NAMES for details. If neither this option nor
+        counters_file are specified, a default set of counters will be
+        used.
+
+   counters_file=<path>
+      |
+      | (Optional) A path to a file that contains a list of counter
+        names, one per line. See Section COUNTER NAMES for details. A
+        line is considered a comment if the first character on the line
+        is a "#". If neither this option nor counters are specified, a
+        default set of counters will be used.
+
+   refresh_interval_sec=<seconds>
+      |
+      | (Optional) The sampler caches the list of slingshot devices, and
+        that cache is refreshed at the beginning of a sample cycle if
+        the refresh interval time has been exceeded.
+        refresh_interval_sec sets the minimum number of seconds between
+        refreshes of the device cache. The default refresh interval is
+        600 seconds.
+
+COUNTER NAMES
+==================================
+
+The names of the counters can be found in the slingshot/cassini header
+file cassini_cntr_def.h in the array c1_cntr_defs (specifically the
+strings in the "name" field of said array entries).
+
+In addition to the individual counter names, this plugin allows
+specifying entire groups of counters by using the counter name pattern
+"group:<group name>", for instance, "group:hni". The available groups
+are: ext, pi_ipd, mb, cq, lpe, hni, ext2. These groups correspond to
+the enum c_cntr_group in the cassini_cntr_def.h file. Additionally, one
+may use "group:all", which simply includes all available counters.
+
+EXAMPLES
+=============================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=slingshot_metrics
+   config name=slingshot_metrics producer=host1 instance=host1/slingshot_metrics counters=ixe_rx_tcp_pkt,group:hni refresh_interval_sec=3600
+   start name=slingshot_metrics interval=1000000 offset=0
+
+SEE ALSO
+=============================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)
diff --git a/rtd/man2rst/Plugin_slurm_notifier.rst b/rtd/man2rst/Plugin_slurm_notifier.rst
new file mode 100644
index 000000000..cb0c80a84
--- /dev/null
+++ b/rtd/man2rst/Plugin_slurm_notifier.rst
@@ -0,0 +1,78 @@
+=====================
+Plugin_slurm_notifier
+=====================
+
+:Date: 30 Sep 2019
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+======================
+
+Plugin_slurm_notifier - man page for the SPANK slurm_notifier plugin
+
+SYNOPSIS
+==========================
+
+Within plugstack.conf: **required**
+*OVIS_PREFIX*/*LIBDIR*/ovis-ldms/libslurm_notifier.so
+**stream=**\ *STREAM_NAME* **timeout=**\ *TIMEOUT_SEC* **[user_debug]**
+**client=**\ *XPRT*\ **:**\ *HOST*\ **:**\ *PORT*\ **:**\ *AUTH* ...
+
+DESCRIPTION
+=============================
+
+**slurm_notifier** is a SPANK plugin that notifies **ldmsd** about job
+events (e.g. job start, job termination) and related information (e.g.
+job_id, task_id, task process ID). The notification is done over the
+**ldmsd_stream** publish mechanism. See SUBSCRIBERS below for plugins
+known to consume the spank plugin messages.
+
+**stream=**\ *STREAM_NAME* specifies the name of the publishing stream.
+The default value is *slurm*.
+
+**timeout=**\ *TIMEOUT_SEC* is the number of seconds determining the
+time-out of the LDMS connections (default *5*).
+
+**user_debug**, if present, enables sending certain plugin management
+debugging messages to the user's slurm output. (default: disabled --
+slurm_debug2() receives the messages instead).
+
+**client=**\ *XPRT*\ **:**\ *HOST*\ **:**\ *PORT*\ **:**\ *AUTH*
+specifies the **ldmsd** to which **slurm_notifier** publishes the data.
+The *XPRT* specifies the type of the transport, which includes **sock**,
+**rdma**, **ugni**, and **fabric**. The *HOST* is the hostname or the IP
+address on which **ldmsd** resides. The *PORT* is the listening port of
+the **ldmsd**. The *AUTH* is the LDMS authentication method that the
+**ldmsd** uses, which is **munge** or **none**. The **client** option
+can be repeated to specify multiple **ldmsd**'s.
+ +SUBSCRIBERS +============================= + +The following plugins are known to process slurm_notifier messages: + +:: + + slurm_sampler (collects slurm job & task data) + slurm_sampler2 (collects slurm job & task data) + papi_sampler (collects PAPI data from tasks identified) + linux_proc_sampler (collects /proc data from tasks identified) + +EXAMPLES +========================== + +/etc/slurm/plugstack.conf: + + :: + + required /opt/ovis/lib64/ovis-ldms/libslurm_notifier.so stream=slurm timeout=5 client=sock:localhost:10000:munge client=sock:node0:10000:munge + +SEE ALSO +========================== + +**spank**\ (8), **Plugin_slurm_sampler**\ (7), +**Plugin_papi_sampler**\ (7), **Plugin_linux_proc_sampler**\ (7), +**ldmsd**\ (8), **ldms_quickstart**\ (7), diff --git a/rtd/man2rst/Plugin_slurm_sampler.rst b/rtd/man2rst/Plugin_slurm_sampler.rst new file mode 100644 index 000000000..06054ed1b --- /dev/null +++ b/rtd/man2rst/Plugin_slurm_sampler.rst @@ -0,0 +1,93 @@ +==================== +Plugin_slurm_sampler +==================== + +:Date: 30 Sep 2019 + +.. contents:: + :depth: 3 +.. + +NAME +===================== + +Plugin_slurm_sampler - man page for the LDMSD slurm_sampler plugin + +SYNOPSIS +========================= + +Within ldmsd_controller or a configuration file: **config** +**name=slurm_sampler** **producer=**\ *PRODUCER* +**instance=**\ *INSTANCE* [ **component_id=\ COMP_ID** ] [ +**stream=\ STREAM** ] [ **job_count=\ MAX_JOBS** ] [ +**task_count=\ MAX_TASKS** ] + +DESCRIPTION +============================ + +**slurm_sampler** is a sampler plugin that collects the information of +the Slurm jobs running on the node. It subscribes to the specified +**stream** to which the **slurm_notifier** SPANK plugin (see +**Plugin_slurm_notifier**\ (7)) publish Slurm job events (default +stream: *slurm*). The sampler supports multi-tenant jobs. + +The **job_count** option is the number of slots in the LDMS set +allocated for concurrent jobs. If the number of concurrent jobs on the +node is greater than **job_count**, the new job will occupy the slot of +the oldest job. If **job_count** is not specified, the default value is +*8*. + +The **task_count** is the maximum number of tasks per job on the node. +If not specified, it is *CPU_COUNT*. In the event of the sampler failed +to obtain *CPU_COUNT*, the default value is *64*. + +CONFIG OPTIONS +=============================== + +**name=slurm_sampler** + This MUST be slurm_sampler (the name of the plugin). + +**producer=**\ *PRODUCER* + The name of the data producer (e.g. hostname). + +**instance=**\ *INSTANCE* + The name of the set produced by this plugin. This option is required. + +**component_id=**\ *COMPONENT_ID* + An integer identifying the component (default: *0*). + +**stream=**\ *STREAM* + The name of the LDMSD stream to get the job event data. + +**job_count=**\ *MAX_JOBS* + The number of slots to hold job information. If all slots are + occupied at the time the new job arrived, the oldest slot is reused. + The default value is *8*. + +**task_count=**\ *MAX_TASKS* + The number of slots for tasks information per job. If not specified, + the sampler will try to obtain system CPU_COUNT and use it as + task_count. If it failed, the default value is *64*. + +BUGS +===================== + +No known bugs. 
+
+EXAMPLES
+=========================
+
+Plugin configuration example:
+
+   ::
+
+      load name=slurm_sampler
+      config name=slurm_sampler producer=${HOSTNAME} instance=${HOSTNAME}/slurm \
+             component_id=2 stream=slurm job_count=8 task_count=8
+      start name=slurm_sampler interval=1000000 offset=0
+
+SEE ALSO
+=========================
+
+**ldmsd**\ (8), **ldms_quickstart**\ (7), **ldmsd_controller**\ (8),
+**ldms_sampler_base**\ (7).
diff --git a/rtd/man2rst/Plugin_store_app.rst b/rtd/man2rst/Plugin_store_app.rst
new file mode 100644
index 000000000..237233afa
--- /dev/null
+++ b/rtd/man2rst/Plugin_store_app.rst
@@ -0,0 +1,119 @@
+================
+Plugin_store_app
+================
+
+:Date: 30 Sep 2019
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+==================
+
+ldmsd_store_app - LDMSD store_app storage plugin
+
+SYNOPSIS
+======================
+
+**load** **name**\ =\ **store_app**
+
+**config** **name**\ =\ **store_app** **path**\ =\ *STORE_ROOT_PATH* [
+**perm**\ =\ *OCTAL_PERM* ]
+
+**strgp_add** **name**\ =\ *STRGP_NAME* **plugin**\ =\ **store_app**
+**container**\ =\ *CONTAINER_NAME* **schema**\ =\ *LDMS_SCHEMA*
+
+**strgp_prdcr_add** **name**\ =\ *STRGP_NAME*
+**regex**\ =\ *PRDCR_REGEX*
+
+DESCRIPTION
+=========================
+
+**``store_app``** is an LDMSD storage plugin for storing data from the
+sets produced by the **``app_sampler``** LDMSD sampler plugin.
+**``store_app``** uses **``SOS``** as its database back-end. The
+**``path``** option points to the directory containing the **``SOS``**
+containers for this plugin (one container per **``strgp``**). If the
+container does not exist, it will be created with the permission given
+by the **``perm``** option (default: 0660). The container contains
+multiple schemas, each of which associates with a metric from the sets
+from **``app_sampler``** (e.g. **``stat_utime``**). Schemas in the
+container have the following attributes:
+
+- **``timestamp``** : the data sampling timestamp.
+
+- **``component_id``**: the component ID producing the data.
+
+- **``job_id``**: the Slurm job ID.
+
+- **``app_id``**: the application ID.
+
+- **``rank``**: the Slurm task rank.
+
+- **METRIC_NAME**: the metric value (the name of this attribute is the
+  metric name of the metric).
+
+- **``comp_time``**: (indexed) the join of **``component_id``** and
+  **``timestamp``**.
+
+- **``time_job``**: (indexed) the join of **``timestamp``** and
+  **``job_id``**.
+
+- **``job_rank_time``**: (indexed) the join of **``job_id``**,
+  **``rank``**, and **``timestamp``**.
+
+- **``job_time_rank``**: (indexed) the join of **``job_id``**,
+  **``timestamp``**, and **``rank``**.
+
+CONFIG OPTIONS
+============================
+
+name
+   The name of the plugin instance to configure.
+
+path
+   The path to the directory that contains SOS containers (one container
+   per strgp).
+
+perm
+   The octal mode (e.g. 0777) that is used in SOS container creation.
+   The default is **0660**.
+
+EXAMPLES
+======================
+
+   ::
+
+      # in ldmsd config file
+      load name=store_app
+      config name=store_app path=/sos perm=0600
+      strgp_add name=app_strgp plugin=store_app container=app schema=app_sampler
+      # NOTE: the schema in strgp is the LDMS set schema, not to be
+      # confused with the one-schema-per-metric arrangement in our SOS
+      # container.
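+      # strgp_prdcr_add below attaches every producer whose name matches
+      # the regular expression to this storage policy; strgp_start then
+      # activates the policy.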
+ strgp_prdcr_add name=app_strgp regex=.* + strgp_start name=app_strgp + +The following is an example on how to retrieve the data using Python: + + :: + + from sosdb import Sos + cont = Sos.Container() + cont.open('/sos/app') + sch = cont.schema_by_name('status_vmsize') + attr = sch.attr_by_name('time_job') # attr to iterate over must be indexed + itr = attr.attr_iter() + b = itr.begin() + while b == True: + obj = itr.item() + print(obj['status_vmsize']) # object attribute access by name + print(obj[5]) # equivalent to above + print(obj[:]) # get everything at once + b = itr.next() + +SEE ALSO +===================== + +**Plugin_app_sampler**\ (7), **ldmsd**\ (8), **ldms_quickstart**\ (7), +**ldmsd_controller**\ (8), diff --git a/rtd/man2rst/Plugin_store_csv.rst b/rtd/man2rst/Plugin_store_csv.rst new file mode 100644 index 000000000..a0aad9e78 --- /dev/null +++ b/rtd/man2rst/Plugin_store_csv.rst @@ -0,0 +1,487 @@ +================ +Plugin_store_csv +================ + +:Date: 26 Nov 2018 + +.. contents:: + :depth: 3 +.. + +NAME +================= + +Plugin_store_csv - man page for the LDMS store_csv plugin + +SYNOPSIS +===================== + +| Within ldmsd_controller script or a configuration file: +| load name=store_csv +| config name=store_csv [ = ] +| config name=store_csv [opt_file=filename] [ = ] +| config name=store_csv [container=c schema=s] [ = ] +| strgp_add name= plugin=store_csv container= schema= + [decomposition=] + +DESCRIPTION +======================== + +With LDMS (Lightweight Distributed Metric Service), store plugins for +the ldmsd (ldms daemon) are configured via the ldmsd_controller or a +configuration file. The store_csv plugin is a CSV store. + +STORE_CSV CONFIGURATION SOURCES +============================================ + +Default configuration options can be defined on the config line or in +the store_csv line of the options file. Options for the specific +instance matching 'container=c schema=s" can be given in the file +indicated by opt_file=filename when configuring the defaults (see +section OPTIONS FILE below) or can be scripted. + +The configuration parameters rolltype, rollover, and rollagain are +applied to all metric sets alike from the values given on the command +line or in the "store_csv" line of the options file. All other options +can be specified per instance. + +The config defaults (a config line without container or schema defined) +can be specified once in scripting or the opt_file. They are used for +any container/schema pair not explicitly configured. + +The config values for a specific container/schema pair can be specified +once in scripting or in the opt_file. Any attribute not specifically +defined will take on the value configured in the default config line or +opt_file. + +STORE_CSV CONFIGURATION ATTRIBUTE SYNTAX +===================================================== + +**config** + | name= path= [ altheader=<0/!0> + typeheader= time_format=<0/1> ietfcsv=<0/1> + buffer=<0/1/N> buffertype=<3/4> rolltype= + rollover= rollempty=<0/1> userdata=<0/!0>] + [rename_template= [rename_uid= + [rename_gid=]] + [create_uid=] [create_gid=] [opt_file=filename] [ietfcsv=<0/1>] + [typeheader=<0/1/2>] [array_expand=] + [array_sep=] [array_lquote=] + [array_rquote=] + | ldmsd_controller configuration line + + name= + | + | This MUST be store_csv. + + opt_file= + | + | The options for the plugin and specific instances will be read + from the named file. See OPTIONS FILE. 
+
+   path=
+      |
+      | This option is required; the config line or the options file
+        must supply a default value. The output files will be put into a
+        directory whose root is specified by the path argument. This
+        directory must exist; the subdirectories and files will be
+        created. The full path to the output files will be
+        //. Container and schema are set when
+        the strgp is added. If you choose a rollover option, then the
+        filename will also be postpended by "." followed by the epoch
+        time, e.g., XXX/meminfo_ctr/meminfo-123456789.
+
+   altheader=<0/!0>
+      |
+      | Distinguishes whether or not to write the header to a separate
+        file from the data file. 0 = same file. Any non-zero is a
+        separate file. Default is the same file. If a separate file is
+        used, then if the data file is called "meminfo", the additional
+        header file will be called "meminfo.HEADER". If you choose a
+        rollover option, the header file name will be postpended with
+        the epochtime, similar to the data file, and a new one will be
+        written at each rollover. Default is altheader=0.
+
+   typeheader=
+      |
+      | Controls the presence and format of a .KIND file. The kind CSV
+        file gives type information on each metric (or metric array).
+        For example, if the metric file is named meminfo, the kind file
+        is named meminfo.KIND, and if the metric file is named
+        meminfo.15111111, the kind file is named meminfo.KIND.15111111.
+        The typeheader parameter is 0 (no kind file), 1 (LDMS kinds with
+        arrays flattened out into scalars), or 2 (LDMS kinds with
+        arrays). The typeheader format supporting arrays uses the
+        notation [] for extraction of lengths by scripting tools.
+        The default typeheader is 0.
+
+   time_format=<0/1>
+      Controls the format of the initial time fields in each line of the
+      CSV files.
+
+      A value of 0 means the classic LDMS format where the first field
+      (Time) is ., and the second field
+      (Time_usec) is repeated.
+
+      A value of 1 chooses an alternate format where the first field
+      (Time_msec) is , and the second field
+      (Time_usec) is just the additional microseconds since the epoch in
+      excess of the milliseconds since the epoch. In other words, there
+      is no overlap of the values in the first and second fields, which
+      is in contrast to the repetition employed by format 0.
+
+   ietfcsv=<0/1>
+      |
+      | Turns on (1) or off (0) use of IETF 4180 quoting for header
+        column names.
+
+   userdata=<0/!0>
+      |
+      | Distinguishes whether or not to write each metric's user data
+        along with each data value. 0 = no write. Any non-zero means to
+        write the values. Default is to not write.
+
+   buffer=<0/1/N>
+      |
+      | Distinguishes whether or not to buffer the data for the
+        writeout. 0 = does not buffer. 1 enables buffering with the
+        system determining the flush. N will flush after approximately N
+        kB of data (> 4) or N lines -- buffertype determines which of
+        these it is. Default is system controlled buffering (1).
+
+   buffertype=<3/4>
+      |
+      | If buffer=N then buffertype determines whether the buffer
+        parameter refers to kB of writeout or number of lines. The
+        values are the same as in rolltype, so only 3 and 4 are
+        applicable.
+
+   rolltype=
+      |
+      | By default, the store does not roll over and the data is written
+        to a continuously open filehandle. Rolltype and rollover are
+        used in conjunction to enable the store to manage rollover,
+        including flushing before rollover. The header will be rewritten
+        when a roll occurs. Valid options are:
+
+      1
+         |
+         | wake approximately every rollover seconds and roll.
Rollover + is suppressed if no data at all has been written and + rollempty=0. + + 2 + | + | wake daily at rollover seconds after midnight (>=0) and roll. + Rollover is suppressed if no data at all has been written and + rollempty=0. + + 3 + | + | roll after approximately rollover records are written. + + 4 + roll after approximately rollover bytes are written. + + 5 + | + | wake at rollover seconds after midnight (>=0) and roll, then + repeat every rollagain (> rollover) seconds during the day. + For example "rollagain=3600 rollover=0 rolltype=5" rolls + files hourly. Rollover is suppressed if no data at all has + been written and rollempty=0. + + rollover= + | + | Rollover value controls the frequency of rollover (e.g., number + of bytes, number of records, time interval, seconds after + midnight). Note that these values are estimates. + + rollempty=0 + | + | Turn off rollover of empty files. Default value is 1 (create + extra empty files). + + create_perm= + | + | Only octal (e.g.0744) specifications are allowed. If unspecified + or 0 is given, then no change is made. The default permission is + 0600 for data files. The mode specified can include execute bits + which will apply to intermediate directories created but not + data files. For example 0755 will yield 0755 for new directories + and 0644 for data files. + + create_uid= + | + | Specify a new user id for data files. If unspecified, no change + in user ownership is made. Changes in ownership of the files do + not affect intermediate directories. + + create_gid= + | + | Specify a new group id for data files. If unspecified, no change + in group ownership is made. + + rename_template= + | + | This option relocates closed CSV files, typically to a + subdirectory, for processing by other tools that watch + directories. The metapath template is applied to define a new + name after file closure. The rename is limited to locations on + the same mount point, per the C rename(2) call. Substitutions + (%) in the provided template are performed as described in + METAPATH SUBSTITUTIONS below. Errors in template specification + will cause the rename to be skipped. As part of the renaming + process, the mode and ownership of the file may also be adjusted + by specifying rename_perm, rename_uid, and rename_gid. Missing + intermediate directories will be created if possible. To enable + greater flexibility than the renaming just described (e.g. + crossing file systems), an external program must monitor the + output directory and handle completed files. + + rename_perm= + | + | Only octal (e.g.0744) specifications are allowed. If unspecified + or 0 is given, then no change is made. The permissions are + changed before the rename and even if the rename fails. This + option is applied only if rename_template is applied. + + rename_uid= + | + | Specify a new user id for the file. If unspecified, no change in + user ownership is made. Changes in ownership of the files do not + affect intermediate directories that might be created following + the template. This option is applied only if rename_template is + applied. + + rename_gid= + | + | Specify a new group id for the file. If unspecified, no change + in group ownership is made. This option is applied only if + rename_template is applied. + + expand_array= + | + | The default is false. Each array element is stored in a column. + True means that all elements are stored in a single column. + + array_sep= + | + | Specify a character to separate array elements. 
If expand_array
+        is true, the value is ignored.
+
+   array_lquote=
+      |
+      | Specify the left-quote character if expand_array is true. If
+        expand_array is false, the value is ignored.
+
+   array_rquote=
+      |
+      | Specify the right-quote character if expand_array is true. If
+        expand_array is false, the value is ignored.
+
+OPTIONS FILE
+=========================
+
+The plug-in options file or repeated scripted config calls replace the
+LDMS v3 'action' keyword for defining instance specific settings.
+
+The options file recognizes lines starting with # as comments.
+Continuation lines are allowed (end lines with a \\ to continue them).
+Comment lines are continued if ended with a \\. See EXAMPLES below.
+
+When an option is needed for a plugin instance, the content of the
+options file is searched beginning with the options line holding
+"container=$c schema=$s". If the matching container/schema is not found
+in the options file, or the option is not defined among the options on
+that line of the file, then the option value from the ldmsd script
+'config' command line is used. If the option is not set on the command
+line, the defaults are taken from the line of the options file
+containing the keyword 'store_csv'. If the option is found in none of
+these places, the compiled default is applied.
+
+STRGP_ADD ATTRIBUTE SYNTAX
+=======================================
+
+The strgp_add sets the policies being added. This line determines the
+output files via identification of the container and schema.
+
+**strgp_add**
+   | plugin=store_csv name= schema=
+     container= [decomposition=]
+   | ldmsd_controller strgp_add line
+
+   plugin=
+      |
+      | This MUST be store_csv.
+
+   name=
+      |
+      | The policy name for this strgp.
+
+   container=
+      |
+      | The container and the schema determine where the output files
+        will be written (see path above). They also are used to match
+        any specific config lines.
+
+   schema=
+      |
+      | The container and the schema determine where the output files
+        will be written (see path above). You can have multiples of the
+        same sampler, but with different schema (which means they will
+        have different metrics) and they will be stored in different
+        containers (and therefore files).
+
+   decomposition=
+      |
+      | Optionally use set-to-row decomposition with the specified
+        configuration file in JSON format. See more about decomposition
+        in ldmsd_decomposition(7).
+
+STORE COLUMN ORDERING
+==================================
+
+This store generates output columns in a sequence influenced by the
+sampler data registration. Specifically, the column ordering is
+
+   Time, Time_usec, ProducerName, \*
+
+where each is either
+
+   .userdata, .value
+
+or, if userdata has been opted not to be included, just:
+
+
+
+The column sequence of is the order in which the
+metrics are added into the metric set by the sampler (or the order they
+are specified by the user).
+
+Note that the sampler's number and order of metric additions may vary
+with the kind and number of hardware features enabled on a host at
+runtime or with the kernel version. Because of this potential for
+variation, downstream tools consuming the CSV files should always
+determine column names or the column number of a specific metric by
+parsing the header line or .HEADER file.
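+
+As an illustration of that advice, the following minimal Python sketch
+locates a metric's column by parsing the header line rather than
+assuming a fixed position. The file path and metric name are
+hypothetical examples, not defaults of this plugin:
+
+::
+
+   import csv
+
+   def column_index(path, metric):
+       # store_csv writes the column names on the first line; with
+       # altheader=!0 the same line lives in the .HEADER file instead.
+       with open(path, newline='') as f:
+           header = next(csv.reader(f))
+       header[0] = header[0].lstrip('#')  # strip any leading comment marker
+       return header.index(metric)
+
+   print(column_index('/XXX/storedir/meminfo_ctr/meminfo', 'Active'))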
+ +METAPATH SUBSTITUTION +================================== + +The following % escape sequence replacements are performed on the +rename_template value for file renamings: + +%P + | + | plugin name + +%C + | + | container name + +%S + | + | schema name + +%T + | + | file type (DATA, HEADER, KIND, UNITS, CNAMES, PYNAMES) + +%B + | + | basename(closed-file-name) + +%D + | + | dirname(closed-file-name) + +%{ENV_VAR_NAME} + | + | getenv(ENV_VAR_NAME). The use of undefined or empty environment + vars yields an empty substitution, not an error. Characters in the + environment variable are restricted to: 'A-Za-z0-9%@()+-\_./:='; + other characters present will prevent the rename. + +%s + | + | timestamp suffix, if it exists. + +NOTES +================== + +- Please note the argument changes from v2 and v3. The notification of + file events has be removed, being redundant with renaming closed + files into a spool directory. + +- The 'sequence' option has been removed. The 'action' option has been + replaced; see "OPTIONS FILE" above. + +- In the opt_file passed by name to store_csv, including the line + prefix "config name=store_csv" is redundant and is disallowed. The + opt_file syntax is plugin specific and is not an ldmsd configuration + script. Scripts written in the store_csv opt_file syntax cannot be + used directly with the ldmsd include statement. + +BUGS +================= + +None known. + +IMPERFECT FEATURES +=============================== + +The rename and create options do not accept symbolic permissions, uid, +or gid. There is no metapath substitution for file creation. + +EXAMPLES +===================== + +Within ldmsd_controller or in a ldmsd command script file + +:: + + load name=store_csv + config name=store_csv opt_file=/etc/sysconfig/ldms.d/store-plugins/store_csv.conf + strgp_add name=csv_mem_policy plugin=store_csv container=loadavg_store schema=loadavg + +Or with interactive modifications to override file properties: + +:: + + load name=store_csv + config name=store_csv altheader=1 rolltype=2 rollover=0 path=/mprojects/ovis/ClusterData/${LDMSCLUSTER} create_gid=1000000039 create_perm=640 rename_template=%D/archive-spool/%{HOSTNAME}/%B rename_perm=444 + +And in the options file for store_csv +(/etc/sysconfig/ldms.d/store-plugins/store_csv.conf by convention) + +:: + + # defaults for csv, unless overridden on ldmsd script config line. + store_csv altheader=1 path=/XXX/storedir rolltype=2 rollover=0 + # tailored setting for loadavg instance + container=loadavg_store schema=loadavg altheader=0 path=/XXX/loaddir \ + create_gid=1000000039 create_perm=640 \ + rename_template=%D/archive-spool/%{HOSTNAME}/%B \ + rename_perm=444 + +Updating from v3: + +If in version 3 "config name=store_csv action=custom container=cstore +schema=meminfo" was used for a specific csv instance, then put the +additional options for that store instance in the store_csv options file +on a line: + +container=cstore schema=meminfo \* + +or use them interactively or in a script as: + +config name=store_csv container=cstore schema=meminfo \* + +after the store_csv defaults have been set. + +SEE ALSO +===================== + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), +ldmsd_decomposition(7) diff --git a/rtd/man2rst/Plugin_store_flatfile.rst b/rtd/man2rst/Plugin_store_flatfile.rst new file mode 100644 index 000000000..71b96fe0b --- /dev/null +++ b/rtd/man2rst/Plugin_store_flatfile.rst @@ -0,0 +1,107 @@ +===================== +Plugin_store_flatfile +===================== + +:Date: 24 Oct 2019 + +.. 
contents::
+   :depth: 3
+..
+
+NAME
+======================
+
+Plugin_store_flatfile - man page for the LDMS store_flatfile plugin
+
+SYNOPSIS
+==========================
+
+| Within ldmsd_controller script or a configuration file:
+| load name=store_flatfile
+| config name=store_flatfile path=datadir
+| strgp_add plugin=store_flatfile [ = ]
+
+DESCRIPTION
+=============================
+
+The flatfile store generates one file per metric with time, producer,
+component id, and value columns separated by spaces. The file name is
+$datadir/$container/$schema/$metric_name.
+
+STRGP_ADD ATTRIBUTE SYNTAX
+============================================
+
+The strgp_add sets the policies being added. This line determines the
+output files via identification of the container and schema.
+
+**strgp_add**
+   | plugin=store_flatfile name= schema=
+     container=
+   | ldmsd_controller strgp_add line
+
+   plugin=
+      |
+      | This MUST be store_flatfile.
+
+   name=
+      |
+      | The policy name for this strgp.
+
+   container=
+      |
+      | The container and the schema determine where the output files
+        will be written (see path above). They also are used to match
+        any action=custom configuration.
+
+   schema=
+      |
+      | The container and the schema determine where the output files
+        will be written (see path above).
+
+NOTES
+=======================
+
+- As of LDMS Version 4.3.3 there is a change in behavior. Previously
+  there was a choice of whether the value associated with a metric was
+  its udata rather than the component_id. In the code the variable name
+  used was 'comp_id', even though it wasn't necessarily input as such
+  in the sampler. This code now explicitly gets the component_id by
+  name.
+
+- We expect to develop additional options controlling output files and
+  output file format.
+
+- There is no option to quote string values, handle rollover, or handle
+  buffering.
+
+- There is a maximum of 20 concurrent flatfile stores.
+
+BUGS
+======================
+
+- Numeric array metrics are not presently supported.
+
+EXAMPLES
+==========================
+
+Within ldmsd_controller or in a configuration file
+
+::
+
+   load name=store_flatfile
+   config name=store_flatfile path=/XXX/datadir
+
+   # log only Active from the meminfo sampler
+   strgp_add name=store_flatfile_meminfo plugin=store_flatfile schema=meminfo container=flat
+   strgp_prdcr_add name=store_flatfile_meminfo regex=localhost1
+   strgp_metric_add name=store_flatfile_meminfo metric=Active
+   strgp_start name=store_flatfile_meminfo regex=localhost1
+
+   # log all from vmstat
+   strgp_add name=store_flatfile_vmstat plugin=store_flatfile schema=vmstat container=flat
+   strgp_prdcr_add name=store_flatfile_vmstat regex=localhost1
+   strgp_start name=store_flatfile_vmstat regex=localhost1
+
+SEE ALSO
+==========================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8)
diff --git a/rtd/man2rst/Plugin_store_kafka.rst b/rtd/man2rst/Plugin_store_kafka.rst
new file mode 100644
index 000000000..08c58605c
--- /dev/null
+++ b/rtd/man2rst/Plugin_store_kafka.rst
@@ -0,0 +1,86 @@
+==================
+Plugin_store_kafka
+==================
+
+:Date: 2 Jun 2022
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+===================
+
+Plugin_store_kafka - man page for the LDMS store_kafka plugin
+
+SYNOPSIS
+=======================
+
+| Within ldmsd_controller script:
+| ldmsd_controller> load name=store_kafka
+| ldmsd_controller> config name=store_kafka
+  [path=]
+| ldmsd_controller> strgp_add name= plugin=store_kafka
+  container= decomposition=
+
+DESCRIPTION
+==========================
+
+**store_kafka** uses librdkafka to send rows from the decomposition to
+the Kafka servers (specified by the strgp's *container* parameter) in
+JSON format. The row JSON objects have the following format: {
+"column_name": COLUMN_VALUE, ... }.
+
+PLUGIN CONFIGURATION
+===================================
+
+**config** **name=**\ *store_kafka* [ **path=\ KAFKA_CONFIG_JSON_FILE**
+]
+
+Configuration Options:
+
+   **name=**\ *store_kafka*
+      |
+      | The name of the plugin. This must be **store_kafka**.
+
+   **path=**\ *KAFKA_CONFIG_JSON_FILE*
+      The optional KAFKA_CONFIG_JSON_FILE contains a dictionary whose
+      KEYS are Kafka configuration properties and whose VALUES are their
+      corresponding values. **store_kafka** usually does not require
+      this option. The properties in the KAFKA_CONFIG_JSON_FILE are
+      applied to all Kafka connections from store_kafka. Please see the
+      `librdkafka CONFIGURATION
+      page `__
+      for a list of supported properties.
+
+STRGP CONFIGURATION
+==================================
+
+**strgp_add** **name=**\ *NAME* **plugin=**\ store_kafka
+**container=**\ *KAFKA_SERVER_LIST*
+**decomposition=**\ *DECOMP_CONFIG_JSON_FILE*
+
+strgp options:
+
+   **name=**\ *NAME*
+      |
+      | The name of the strgp.
+
+   **plugin=**\ store_kafka
+      |
+      | The plugin must be store_kafka.
+
+   **container=**\ *KAFKA_SERVER_LIST*
+      |
+      | A comma-separated list of Kafka servers (host[:port]). For
+        example: container=localhost,br1.kf:9898.
+
+   **decomposition=**\ *DECOMP_CONFIG_JSON_FILE*
+      |
+      | Set-to-row decomposition configuration file (JSON format). See
+        more about decomposition in **ldmsd_decomposition**\ (7).
+
+SEE ALSO
+=======================
+
+ldmsd_decomposition(7)
diff --git a/rtd/man2rst/Plugin_store_papi.rst b/rtd/man2rst/Plugin_store_papi.rst
new file mode 100644
index 000000000..02f5e99c4
--- /dev/null
+++ b/rtd/man2rst/Plugin_store_papi.rst
@@ -0,0 +1,114 @@
+=================
+Plugin_store_papi
+=================
+
+:Date: 30 Sep 2019
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+==================
+
+Plugin_store_papi - man page for the LDMSD store_papi plugin
+
+SYNOPSIS
+======================
+
+Within ldmsd_controller or a configuration file:
+
+**load** **name=store_papi**
+
+**config** **name=store_papi** **path=**\ *STORE_ROOT_PATH*
+
+**strgp_add** **name=**\ *STRGP_NAME* **plugin=store_papi**
+**container=**\ *CONTAINER* **schema=**\ *SCHEMA*
+
+**strgp_prdcr_add** **name=**\ *STRGP_NAME* **regex=**\ *PRDCR_REGEX*
+
+DESCRIPTION
+=========================
+
+**store_papi** is an LDMSD storage plugin for storing data from
+**papi_sampler** specifically, as it expects a collection of PAPI event
+metrics after a certain job metric (task_ranks) that only
+**papi_sampler** produces. **store_papi** stores data in a SOS container
+(specified by the **strgp** **container** option). Unlike **store_sos**
+(see **Plugin_store_sos**\ (7)), where an entire LDMS snapshot results
+in an SOS data entry, **store_papi** splits the PAPI events in the set
+into their own schemas and data points. 
For example, if we have PAPI_TOT_INS
+and PAPI_TOT_CYC as PAPI events in the **papi_sampler** set, we will
+have PAPI_TOT_INS and PAPI_TOT_CYC schemas in the SOS container storing
+the respective PAPI events. This allows storing flexible, user-defined
+schemas at run-time by user jobs (LDMS schemas of sets from
+**papi_sampler** are defined at run-time by user jobs). Please note that
+the schema name defined by the user job must match the **strgp**'s
+schema in order for the data to be stored.
+
+CONFIG OPTIONS
+============================
+
+**name=store_papi**
+   This MUST be store_papi (the name of the plugin).
+
+**path=**\ *STORE_ROOT_PATH*
+   The path to the root of the store. The SOS container for each schema
+   specified by the storage policy (**strgp**) will be placed in the
+   *STORE_ROOT_PATH* directory.
+
+STORAGE POLICY
+============================
+
+An LDMSD storage plugin is like a storage driver that provides only the
+storing mechanism. A storage policy (**strgp**) is the glue binding data
+sets from various producers to a container of a storage plugin.
+
+The **strgp_add** command defines a new storage policy, identified by
+**name**. The **plugin** attribute tells the storage policy which
+storage plugin to work with. The **schema** attribute identifies the
+LDMS schema the data set of which is consumed by the storage policy. The
+**container** attribute identifies a container inside the storage plugin
+that will store data.
+
+**strgp_prdcr_add** is a command to specify producers that feed data to
+the storage policy.
+
+BUGS
+==================
+
+No known bugs.
+
+EXAMPLES
+======================
+
+Plugin configuration example:
+
+   ::
+
+      load name=store_papi
+      config name=store_papi path=/var/store
+      strgp_add name=papi_strgp plugin=store_papi container=papi schema=papi
+      strgp_prdcr_add name=papi_strgp regex=.*
+
+The following job script and PAPI JSON config combination is an example
+of submitting a PAPI-enabled job that will end up in the storage of the
+configuration above.
+
+Job script example:
+
+   ::
+
+      #!/bin/bash
+      export SUBSCRIBER_DATA='{"papi_sampler":{"file":"/tmp/papi.json"}}'
+      srun bash -c 'for X in {1..60}; do echo $X; sleep 1; done'
+
+PAPI JSON example (/tmp/papi.json):
+
+   ::
+
+      {
+        "schema": "papi",
+        "events": [
+          "PAPI_TOT_INS",
+          "PAPI_L1_DCM"
+        ]
+      }
+
+SEE ALSO
+======================
+
+**Plugin_papi_sampler**\ (7), **ldmsd**\ (8), **ldms_quickstart**\ (7),
+**ldmsd_controller**\ (8), **ldms_sampler_base**\ (7).
diff --git a/rtd/man2rst/Plugin_store_rabbitkw.rst b/rtd/man2rst/Plugin_store_rabbitkw.rst
new file mode 100644
index 000000000..b14499155
--- /dev/null
+++ b/rtd/man2rst/Plugin_store_rabbitkw.rst
@@ -0,0 +1,230 @@
+=====================
+Plugin_store_rabbitkw
+=====================
+
+:Date: 10 Jun 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+======================
+
+Plugin_store_rabbitkw - man page for the LDMS store_rabbitkw plugin
+
+SYNOPSIS
+==========================
+
+| Within ldmsd_controller or in a configuration file
+| load name=store_rabbitkw
+| config name=store_rabbitkw [ = ]
+| strgp_add name=store_rabbitkw [ = ]
+
+DESCRIPTION
+=============================
+
+The store_rabbitkw plugin is a rabbitmq producer. Actual storage of the
+data must be arranged separately by configuring some other amqp client.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+================================================
+
+The configuration parameters routing_key, host, port, exchange, vhost,
+user, and pwfile are shared across all metric sets.
+
+**config**
+   | name= exchange= routing_key= host=
+     port= vhost= user= pwfile=
+     [extraprops= logmsg= useserver=<y/n> heartbeat=
+     timeout= retry=]
+   | These parameters are:
+
+   name=
+      |
+      | This MUST be store_rabbitkw.
+
+   routing_key
+      |
+      | The routing key shared by all metric sets is .
+
+   host=
+      |
+      | The rabbitmq server host. The default is localhost.
+
+   port=
+      |
+      | The server port on the nearest rabbitmq host. The default is
+        5672.
+
+   exchange=
+      |
+      | The amqp exchange to publish with is . The default is
+        amq.topic. This must preexist; the plugin will not cause its
+        creation.
+
+   vhost=
+      |
+      | The virtual host to be used is . The default is "/".
+
+   user=
+      |
+      | The amqp username is . The default is "guest".
+
+   pwfile=
+      |
+      | The file contains the amqp user password in the format
+        'secretword=password'. The default password "guest" is assumed
+        if no file is specified.
+
+   retry=
+      |
+      | If the amqp connection fails due to a network or server issue,
+        retry every seconds. The default is 60.
+
+   heartbeat=
+      |
+      | Heartbeat interval used to detect failed connections.
+
+   timeout=
+      |
+      | Timeout to use for connections, in milliseconds. The default is
+        1000.
+
+   extraprops=
+      |
+      | Turn on (y) or off (n) the use of extra properties with all
+        messages. If AMQP-based filtering is not planned, 'n' will
+        reduce message sizes slightly.
+
+   logmsg=
+      |
+      | Enable (y) or disable (n, the default) logging of all message
+        metric content at the DEBUG level. This is a debugging option.
+
+   useserver=
+      |
+      | Enable (y, the default) or disable (n) calls to the amqp server;
+        this is a debugging option.
+
+STORE ATTRIBUTE SYNTAX
+========================================
+
+**store**
+   | name= schema= container=
+
+   name=
+      |
+      | This MUST be store_rabbitkw.
+
+   schema=
+      |
+      | The name of the metric group, independent of the host name. The
+        schema will be used as a header in messages if extraprops is y.
+
+   container=
+      |
+      | The container will be used as a header in messages if extraprops
+        is y.
+
+AMQ event contents
+====================================
+
+This store generates rabbitmq events containing the data from LDMS set
+instances. All events are on the single queue that is configured.
+
+The properties follow the AMQP standard, with LDMS specific
+interpretations:
+
+   timestamp
+      |
+      | The sample collection time in MICROSECONDS UTC. Divide by
+        1,000,000 to get seconds UTC.
+
+   app_id
+      |
+      | The app_id is LDMS.
+
+Optional AMQ event contents
+=============================================
+
+These fields and headers are present if extraprops=y is configured.
+
+content_type
+   |
+   | <"text/plain"> for all.
+
+reply_to
+   |
+   | The metric set instance name.
+
+container
+   |
+   | The container configuration name.
+
+schema
+   |
+   | The schema configuration name.
+
+PAYLOAD FORMAT
+================================
+
+Payloads are ASCII formatted, tab separated "label=val" lists.
+
+Scalar metric values are formatted in obvious C ways to ensure full
+precision is retained. Each is a tab-separated triplet 'metric=$name
+type=$scalar_type value=$value'. Before the metric values on each line
+are the keys and values: timestamp_us, producer, container, schema.
+
+Array values are formatted as semicolon separated lists: Each metric
+appears as a tab-separated quartet 'metric=$name type=$scalar_type
+length=$array_length value=$value'.
+
+CHAR_ARRAY values are formatted as strings. Note these are terminated at
+the first nul character.
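+
+To illustrate this layout, the following Python sketch parses one
+payload line into its leading keys and its metric triplets. The sample
+line and its values are fabricated for the example, not taken from a
+real capture:
+
+::
+
+   line = ('timestamp_us=1447449560003480\tproducer=node1\t'
+           'container=kw\tschema=meminfo\t'
+           'metric=Active\ttype=u64\tvalue=100200')
+   meta, metrics = {}, []
+   for key, _, val in (tok.partition('=') for tok in line.split('\t')):
+       if key == 'metric':        # each 'metric=' starts a new triplet
+           metrics.append({key: val})
+       elif metrics:              # type=, value=, length= of that metric
+           metrics[-1][key] = val
+       else:                      # timestamp_us, producer, container, schema
+           meta[key] = val
+   print(meta['producer'], metrics)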
+ +NOTES +======================= + +The semantics of LDMS messages are not an extremely close match to +network mail and news messages targeted by AMQP. The interpretations on +message properties used here may be subject to change in future +releases. + +The authentication to AMQP server uses the SASL plaintext method. In HPC +environments this is normally secure. Additional options enabling +encryption are likely to appear in future work at a cost in CPU. +Normally, an amqp server federation member should be hosted on or very +near the LDMS aggregator host. + +Presently each payload contains a single line (with tab separators). +Future versions may capture multiple set instances per message, where +each set is separated by newlines from the others. + +The behavior of this AMQP client when faced with AMQP server +disappearance is to retry connection later and to ignore any metric data +seen while disconnected. + +BUGS +====================== + +String data containing tab characters are not compatible with this data +encoding. This may be fixed when a satisfactory alternate representation +is agreed for these special characters. + +EXAMPLES +========================== + +See the LDMS test script rabbitkw + +ADMIN HINTS +============================= + +On Linux, this requires an amqp service (typically +rabbitmq-server.service) running in the network. That service may +require epmd.service. + +SEE ALSO +========================== + +ldmsd(8), rabbitmq-server(1), ldmsd_controller(8), store_rabbitv3(7) diff --git a/rtd/man2rst/Plugin_store_rabbitv3.rst b/rtd/man2rst/Plugin_store_rabbitv3.rst new file mode 100644 index 000000000..e1c59cc35 --- /dev/null +++ b/rtd/man2rst/Plugin_store_rabbitv3.rst @@ -0,0 +1,221 @@ +===================== +Plugin_store_rabbitv3 +===================== + +:Date: 03 Dec 2016 + +.. contents:: + :depth: 3 +.. + +NAME +====================== + +Plugin_store_rabbitv3 - man page for the LDMS store_rabbitv3 plugin + +SYNOPSIS +========================== + +| Within ldmsd_controller or in a configuration file +| load name=store_rabbitv3 +| config name=store_rabbitv3 [ = ] +| strgp_add name=store_rabbitv3 [ = ] + +DESCRIPTION +============================= + +The store_rabbitv3 plugin is a rabbitmq producer. Actual storage of data +must be arranged separately by configuring some other amqp client. + +CONFIGURATION ATTRIBUTE SYNTAX +================================================ + +The configuration parameters root, host, port, exchange, vhost, user, +and pwfile are shared across all metric sets. + +**config** + | name= root= host= port= + exchange= vhost= user= pwfile= + extraprops= metainterval= + | These parameters are: + + name= + | + | This MUST be store_rabbitv3. + + root= + | + | The routing key prefix shared by all metric sets will be . + + host= + | + | The rabbitmq server host. The default is localhost. + + port= + | + | The server port on the nearest rabbitmq host. The default is + 5672. + + exchange= + | + | The amqp exchange to publish with is . The default is + amq.topic. + + vhost= + | + | The virtual host to be used is . The default is "/". + + user= + | + | The amqp username is . The default is "guest". + + pwfile= + | + | The file contains the amqp user password in the format + 'secretword=password. The default password "guest" is assumed if + no file is specified. + + extraprops= + | + | Turn on (y) or off (n) the use of extra properties with all + messages. 
+ + mint + | + | The number of seconds between emission of time and host + invariant (meta) metrics. + +STORE ATTRIBUTE SYNTAX +======================================== + +**store** + | name= schema= container= + + name= + | + | This MUST be store_rabbitv3. + + schema= + | + | The name of the metric group, independent of the host name. + + container= + | + | The container will be used in the routing key. The current + routing key patterns is: + .... + + Use a unique container parameter for different metric sets coming + from different sampler (e.g., do not use the same container for + procstat and meminfo); however, use the same container for the same + metric set coming from all hosts (e.g., for all meminfo). + +AMQ event contents +==================================== + +This store generates rabbitmq events. The message in each event is just +the metric value in string form. The message properties of each event +encode everything else. + +The properties follow the AMQP standard, with LDMS specific +interpretations: + + timestamp + | + | The sample collection time in MICROSECONDS UTC. Divide by + 1,000,000 to get seconds UTC. + + type + | + | The ldms metric data type. + + app_id + | + | The app_id is the integer component_id, if it has been defined + by the sampler. + +Optional AMQ event contents +============================================= + +These fields and headers are present if extraprops=y is configured. + +content_type + | + | <"text/plain"> for all. + +reply_to + | + | The producer name. + +metric + | + | The label registered by the sampler plugin, which might be + anything. + +metric_name_amqp + | + | The label modified to work as a routing key, not necessarily easily + read. + +metric_name_least + | + | The label modified to work as a programming variable name, possibly + shortened and including a hash suffix. Not expected to be fully + human-readable in all cases. It will be the same across runs for + metric sets whose content labels do not vary across runs. + +container + | + | The container configuration name. + +schema + | + | The schema configuration name. + +PAYLOAD FORMAT +================================ + +Payloads are ASCII formatted. + +Scalar values are formatted in obvious C ways to ensure full precision +is retained. Each is a doublet: type,value + +Array values are formatted as comma separated lists: +type,array-length,value[,value]\*. + +Char array values omit the commas in the value list, giving the +appearance of a string. Note however that there may be embedded nul +characters. + +NOTES +======================= + +The semantics of LDMS messages are not an extremely close match to +network mail and news messages. The interpretations on message +properties used here may be subject to change in major releases of LDMS. + +The authentication to AMQP server uses the SASL plaintext method. In HPC +environments this is normally secure. Additional options enabling +encryption are likely to appear in future work at a cost in CPU. +Normally, an amqp server federation member should be hosted on or very +near the LDMS aggregator host. + +BUGS +====================== + +The periodic emission of meta metrics should be per (producer,metric) +pair, but the store API is not yet sufficient to make this a scalable +and efficient operation. In the meanwhile, meta metrics are emitted on +first definition and assumed to be identical for a metric set across all +producers. The special case of component_id (if present) is handled +correctly when extraprops=y is configured. 
+
+EXAMPLES
+==========================
+
+See the LDMS test script ldms_local_amqptest.sh.
+
+SEE ALSO
+==========================
+
+ldmsd(8), rabbitmq-server(1), ldmsd_controller(8)
diff --git a/rtd/man2rst/Plugin_store_slurm.rst b/rtd/man2rst/Plugin_store_slurm.rst
new file mode 100644
index 000000000..674a3df1f
--- /dev/null
+++ b/rtd/man2rst/Plugin_store_slurm.rst
@@ -0,0 +1,104 @@
+==================
+Plugin_store_slurm
+==================
+
+:Date: 30 Sep 2019
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+===================
+
+Plugin_store_slurm - man page for the LDMSD store_slurm plugin
+
+SYNOPSIS
+=======================
+
+Within ldmsd_controller or a configuration file: **load**
+**name=store_slurm**
+
+**config** **name=store_slurm** **path=**\ *STORE_ROOT_PATH* [
+**verbosity=\ (0\ \|\ 1\ \|\ 2)** ]
+
+**strgp_add** **name=**\ *STRGP_NAME* **plugin=store_slurm**
+**container=**\ *CONTAINER* **schema=mt-slurm**
+
+**strgp_prdcr_add** **name=**\ *STRGP_NAME* **regex=**\ *PRDCR_REGEX*
+
+DESCRIPTION
+==========================
+
+**store_slurm** is an LDMSD storage plugin that stores job data from
+**slurm_sampler** specifically, and must not be used with other data.
+
+PLUGIN CONFIG OPTIONS
+====================================
+
+**name=store_slurm**
+   This MUST be store_slurm (the name of the plugin).
+
+**path=**\ *STORE_ROOT_PATH*
+   The path to the root of the store. The SOS container for each schema
+   specified by the storage policy (**strgp**) will be placed in the
+   *STORE_ROOT_PATH* directory.
+
+**verbosity=(**\ *0*\ **\|**\ *1*\ **\|**\ *2*\ **)**
+
+   *0*
+      (default) SUMMARY verbosity level. The storage plugin stores only
+      a single entry for each job.
+
+   *1*
+      RANK verbosity level. The storage plugin stores one job data entry
+      per rank (process) in the job.
+
+   *2*
+      TIME (the most verbose) verbosity level. The storage plugin stores
+      job data entries every time the slurm_sampler set is updated. At
+      this verbosity level there will be many job entries that are
+      identical except for the timestamp.
+
+STORAGE POLICY
+=============================
+
+An LDMSD storage plugin is like a storage driver that provides only the
+storing mechanism. A storage policy (**strgp**) is the glue binding data
+sets from various producers to a container of a storage plugin.
+
+The **strgp_add** command defines a new storage policy, identified by
+**name**. The **plugin** attribute tells the storage policy which
+storage plugin to work with. The **schema** attribute identifies the
+LDMS schema the data set of which is consumed by the storage policy. The
+**container** attribute identifies a container inside the storage plugin
+that will store data.
+
+The **schema** for **store_slurm** is always *mt-slurm*, as
+**slurm_sampler** restricts "mt-slurm" as its schema name.
+
+**strgp_prdcr_add** is a command to specify producers that feed data to
+the storage policy.
+
+BUGS
+===================
+
+No known bugs.
+
+EXAMPLES
+=======================
+
+Plugin configuration + prdcr example:
+
+   ::
+
+      load name=store_slurm
+      config name=store_slurm path=/var/store verbosity=1
+      strgp_add name=slurm_strgp plugin=store_slurm container=slurm schema=mt-slurm
+      strgp_prdcr_add name=slurm_strgp regex=.*
+
+SEE ALSO
+=======================
+
+**Plugin_slurm_sampler**\ (7), **ldmsd**\ (8), **ldms_quickstart**\ (7),
+**ldmsd_controller**\ (8), **ldms_sampler_base**\ (7).
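+
+The stored container can be read back with the sosdb Python API, in the
+same pattern as the Python example in Plugin_store_app above. This is a
+minimal sketch only; the container path and the use of 'time_job' as
+the indexed attribute are assumptions to adapt to your site:
+
+::
+
+   from sosdb import Sos
+   cont = Sos.Container()
+   cont.open('/var/store/slurm')            # path from the example above
+   sch = cont.schema_by_name('mt-slurm')    # store_slurm's fixed schema
+   attr = sch.attr_by_name('time_job')      # attr to iterate over must be indexed
+   itr = attr.attr_iter()
+   b = itr.begin()
+   while b:
+       obj = itr.item()
+       print(obj[:])                        # get everything at once
+       b = itr.next()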
diff --git a/rtd/man2rst/Plugin_store_sos.rst b/rtd/man2rst/Plugin_store_sos.rst new file mode 100644 index 000000000..f09e6e2d7 --- /dev/null +++ b/rtd/man2rst/Plugin_store_sos.rst @@ -0,0 +1,346 @@ +================ +Plugin_store_sos +================ + +:Date: 21 Dec 2015 + +.. contents:: + :depth: 3 +.. + +NAME +================= + +Plugin_store_sos - man page for the LDMS store_sos plugin + +SYNOPSIS +===================== + +| Within ldmsd_controller script: +| ldmsd_controller> load name=store_sos +| ldmsd_controller> config name=store_sos path=path +| ldmsd_controller> strgp_add plugin=store_sos [ = ] + +DESCRIPTION +======================== + +With LDMS (Lightweight Distributed Metric Service), store plugins for +the ldmsd (ldms daemon) are configured via the ldmsd_controller. The +store_sos plugin is a sos store. + +To build the store_sos, build with the following flag: **--enable_sos** + +STORE_SOS INIT CONFIGURATION ATTRIBUTE SYNTAX +========================================================== + +**config** + | name= path= + | ldmsd_controller configuration line + + name= + | + | This MUST be store_sos. + + path= + | + | The store will be put into a directory whose root is specified + by the path argument. This directory must exist; the store will + be created. The full path to the store will be + /. The schema(s) determine the schemas of the + data base. Container and schema are set when the strgp is added. + +STRGP_ADD ATTRIBUTE SYNTAX +======================================= + +The strgp_add sets the policies being added. This line identifies the +container and schema for a store. + +**strgp_add** + | plugin=store_sos name= schema= + container= [decomposition=] + | ldmsd_controller strgp_add line + + plugin= + | + | This MUST be store_sos. + + name= + | + | The policy name for this strgp. + + container= + | + | The container and schema define the store as described above + (see path). + + schema= + | + | The container and schema define the store as described above + (see path). You can have multiples of the same path and + container, but with different schema (which means they will have + different metrics) and they will be stored in the same store. + + decomposition= + | + | Optionally use set-to-row decomposition with the specified + configuration file in JSON format. See more about decomposition + in ldmsd_decomposition(7). + +USING SOS COMMANDS TO MANAGE PARTITIONS +==================================================== + +Some of the basic sos commands are given below. SOS tools will be built +into XXX. Any commands given with no argument, will return usage info. + +**sos_part_query** + | + | List the partitions defined in a container. + +**sos_part_create** + | -C [=] part_name + | Create a partition. + + **-C** ** + | + | Path to the container + + **-s** *state* + | + | State of the new partition (case insensitive). Default is + OFFLINE. Optional parameter. Valid options are: + + - PRIMARY: all new allocations go in this partition + + - ONLINE: objects are accessible, but the partition does not grow + + - OFFLINE: object references are invalid; the partition may be moved + or deleted. + + **part_name** + | + | Name of the partition + +**sos_part_delete** + | -C + | Delete a partition in a container. The partition must be in the + OFFLINE state to be deleted. + + **-C** ** + | + | Path to the container + + **name** + | + | Name of the parition + +**sos_part_modify** + | -C [=] part_name + | Modify the state of a partition. 
+ + **-C** ** + | + | Path to the container + + **-s** *state* + | + | State of the new partition (case insensitive). Default is + OFFLINE. Optional parameter. Valid options are: + + - PRIMARY: all new allocations go in this partition + + - ONLINE: objects are accessible, but the partition does not grow + + - OFFLINE: object references are invalid; the partition may be moved + or deleted. + + **part_name** + | + | Name of the partition + +**sos_part_move** + | + | Move a partition to another storage location. -C -p + part_name + + **-C** ** + | + | Path to the container + + **-p** ** + | + | The new path. + + **part_name** + | + | Name of the partition + +USING SOS COMMANDS TO LOOK AT DATA IN A PARTITION +============================================================== + +sos_cmd can be used to get data from an sos instance. Some relevant +command options are below. Example usage is in the example section. + +**sos_cmd** + | -C -l + | Print a directory of the schemas. + + **-C** ** + | + | Path to the container + +**sos_cmd** + | -C -i + | Show debug information for the container + + **-C** ** + | + | Path to the container + +**sos_cmd** + | -C -q -S -X -V -V .... + | Print data from a container + + **-C** ** + | + | Path to the container + + **-q** + Used to query + + **-S** ** + | + | Schema querying against + + **-X** ** + | + | Variable that is indexed to use in the query. + + **-V** ** + | + | One or more vars to output. + +NOTES +================== + +- The configuration lines do not allow specification of the partition, + that is done automatically (by default this is the epoch timestamp). + +- Management of partitions is done outside of LDMS (e.g., cron script + that calls creation of new partitions and changes from PRIMARY to + ACTIVE). + +BUGS +================= + +No known bugs. 
+ +EXAMPLES +===================== + +Configuring store_sos: +---------------------- + +:: + + ldmsd_controller> load name=store_sos + ldmsd_controller> config name=store_sos path=/XXX/storedir + ldmsd_controller> strgp_add name=sos_mem_policy plugin=store_sos container=sos schema=meminfo + +Querying a container's partitions: +---------------------------------- + +:: + + $ sos_part /NVME/0/SOS_ROOT/Test + Partition Name RefCount Status Size Modified Accessed Path + -------------------- -------- ---------------- -------- ---------------- ---------------- ---------------- + 00000000 3 ONLINE 1M 2015/08/25 13:49 2015/08/25 13:51 /SOS_STAGING/Test + 00000001 3 ONLINE 2M 2015/08/25 11:54 2015/08/25 13:51 /NVME/0/SOS_ROOT/Test + 00000002 3 ONLINE 2M 2015/08/25 11:39 2015/08/25 13:51 /NVME/0/SOS_ROOT/Test + 00000003 3 ONLINE PRIMARY 2M 2015/08/25 11:39 2015/08/25 13:51 /NVME/0/SOS_ROOT/Test + +Looking at a container's directory: +----------------------------------- + +Variables that are options for -X in the sos_cmd will have indexed = 1 + +:: + + $ sos_cmd -C /NVME/0/LDMS -l + schema : + name : aries_nic_mmr + schema_sz : 1944 + obj_sz : 192 + id : 129 + -attribute : timestamp + type : TIMESTAMP + idx : 0 + indexed : 1 + offset : 8 + -attribute : comp_time + type : UINT64 + idx : 1 + indexed : 1 + offset : 16 + -attribute : job_time + type : UINT64 + idx : 2 + indexed : 1 + offset : 24 + -attribute : component_id + type : UINT64 + idx : 3 + indexed : 0 + offset : 32 + -attribute : job_id + type : UINT64 + idx : 4 + indexed : 0 + offset : 40 + -attribute : AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_PKTS + type : UINT64 + idx : 5 + indexed : 0 + offset : 48 + -attribute : AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_FLITS + type : UINT64 + idx : 6 + indexed : 0 + offset : 56 + -attribute : AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_STALLED + type : UINT64 + idx : 7 + indexed : 0 + offset : 64 + ... + +Looking at variable values in a container: +------------------------------------------ + +:: + + $ sos_cmd -C /NVME/0/LDMS -q -S aries_nic_mmr -X timestamp -V timestamp -V AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_PKTS + timestamp AR_NIC_NETMON_ORB_EVENT_CNTR_REQ_PKTS + -------------------------------- ------------------ + 1447449560.003480 1642207034 + 1447449630.002155 1642213993 + 1447449630.003115 88703749 + 1447449630.003673 74768272 + 1447449640.002818 74768367 + 1447449640.003201 88703844 + 1447449640.003249 1642214024 + 1447449650.002885 74768402 + 1447449650.003263 1642214059 + 1447449650.003325 88703874 + 1447449660.002954 74768511 + 1447449660.003308 1642214174 + 1447449660.003444 88703993 + 1447449670.003015 74768547 + 1447449670.003361 1642214205 + 1447449670.003601 88704024 + 1447449680.003081 74768582 + +SEE ALSO +===================== + +ldms(7), Plugin_store_csv(7), ldmsd_decomposition(7) diff --git a/rtd/man2rst/Plugin_store_timescale.rst b/rtd/man2rst/Plugin_store_timescale.rst new file mode 100644 index 000000000..06a846870 --- /dev/null +++ b/rtd/man2rst/Plugin_store_timescale.rst @@ -0,0 +1,175 @@ +====================== +Plugin_store_timescale +====================== + +:Date: 24 Oct 2019 + +.. contents:: + :depth: 3 +.. 
+ +NAME +======================= + +Plugin_store_timescale - man page for the LDMS store_timescale plugin + +SYNOPSIS +=========================== + +| Within ldmsd_controller script or a configuration file: +| load name=store_timescale +| strgp_add name= plugin=store_timescale container= + schema= +| strgp_prdcr_add name= regex=.\* +| strgp_start name= + +DESCRIPTION +============================== + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. The timescale_store plugin is a store developed by +Shanghai Jiao Tong University HPC Center to store collected data in +TimescaleDB. + +This store is a simplified version of store_influx. + +STORE_TIMESCALE CONFIGURATION ATTRIBUTE SYNTAX +================================================================= + +**config** + | name= user= pwfile= + hostaddr= port= dbname= + measurement_limit= + | ldmsd_controller configuration line + + name= + | + | This MUST be store_timescale. + + user= + | + | This option is required; It will be used as the user name to + connect to timescaledb. + + pwfile= + | + | This option is required; The file must have content + secretword=, the password will be used as the password + to connect to timescaledb. + + hostaddr= + | + | This option is required; It will be used as the ip addr of + timescaledb to connect to. + + port= + | + | This option is required; It will be used as the port number of + timescaledb to connect to. + + dbname= + | + | This option is required; It will be used as the timescaledb + database name to connect to. + + measurement_limit= + | + | This is optional; It specifies the maximum length of the sql + statement to create table or insert data into timescaledb; + default 8192. + +STRGP_ADD ATTRIBUTE SYNTAX +============================================= + +The strgp_add sets the policies being added. This line determines the +output files via identification of the container and schema. + +**strgp_add** + | plugin=store_timescale name= schema= + container= + | ldmsd_controller strgp_add line + + plugin= + | + | This MUST be store_timescale. + + name= + | + | The policy name for this strgp. + + container= + | + | The container and the schema determine where the output files + will be written (see path above). + + schema= + | + | The container and the schema determine where the output files + will be written (see path above). You can have multiples of the + same sampler, but with different schema (which means they will + have different metrics) and they will be stored in different + containers (and therefore files). + +STORE COLUMN ORDERING +======================================== + +This store generates output columns in a sequence influenced by the +sampler data registration. Specifically, the column ordering is + + Time, Time_usec, ProducerName, \* + +The column sequence of is the order in which the +metrics are added into the metric set by the sampler. + +NOTES +======================== + +None. + +BUGS +======================= + +None known. 
+
+EXAMPLES
+===========================
+
+Within ldmsd_controller or in a ldmsd command script file
+
+::
+
+   load name=store_timescale
+
+   strgp_add name=store_tutorial1 plugin=store_timescale schema=test1 container=tutorial_sampler1
+
+   strgp_prdcr_add name=store_tutorial1 regex=.*
+
+   strgp_start name=store_tutorial1
+
+   strgp_add name=store_tutorial2 plugin=store_timescale schema=test2 container=tutorial_sampler2
+
+   strgp_prdcr_add name=store_tutorial2 regex=.*
+
+   strgp_start name=store_tutorial2
+
+   strgp_add name=store_tutorial3 plugin=store_timescale schema=test3 container=tutorial_sampler3
+
+   strgp_prdcr_add name=store_tutorial3 regex=.*
+
+   strgp_start name=store_tutorial3
+
+SEE ALSO
+===========================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8),
+Plugin_tutorial_sampler(7), Plugin_store_csv(7)
diff --git a/rtd/man2rst/Plugin_store_tutorial.rst b/rtd/man2rst/Plugin_store_tutorial.rst
new file mode 100644
index 000000000..5959adff4
--- /dev/null
+++ b/rtd/man2rst/Plugin_store_tutorial.rst
@@ -0,0 +1,164 @@
+=====================
+Plugin_store_tutorial
+=====================
+
+:Date: 24 Oct 2019
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+======================
+
+Plugin_store_tutorial - man page for the LDMS store_tutorial plugin
+
+SYNOPSIS
+==========================
+
+| Within ldmsd_controller script or a configuration file:
+| load name=store_tutorial
+| config name=store_tutorial path=
+| strgp_add name= plugin=store_tutorial container=
+  schema=
+| strgp_prdcr_add name= regex=.\*
+| strgp_start name=
+
+DESCRIPTION
+=============================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The store_tutorial plugin is a demo store described
+in the LDMSCON2019 tutorial "LDMS v4: Sampler and Store Writing".
+
+This store is a simplified version of store_csv, handling only U64,
+producing no header, and with no rollover.
+
+STORE_TUTORIAL CONFIGURATION ATTRIBUTE SYNTAX
+===============================================================
+
+**config**
+   | name= path=
+   | ldmsd_controller configuration line
+
+   name=
+      |
+      | This MUST be store_tutorial.
+
+   path=
+      |
+      | This option is required; the config line or the options file
+        must supply a default value. The output files will be put into a
+        directory whose root is specified by the path argument. This
+        directory must exist; the subdirectories and files will be
+        created. The full path to the output files will be
+        //. Container and schema are set when
+        the strgp is added.
+
+STRGP_ADD ATTRIBUTE SYNTAX
+============================================
+
+The strgp_add sets the policies being added. This line determines the
+output files via identification of the container and schema.
+
+**strgp_add**
+   | plugin=store_tutorial name= schema=
+     container=
+   | ldmsd_controller strgp_add line
+
+   plugin=
+      |
+      | This MUST be store_tutorial.
+
+   name=
+      |
+      | The policy name for this strgp.
+
+   container=
+      |
+      | The container and the schema determine where the output files
+        will be written (see path above).
+
+   schema=
+      |
+      | The container and the schema determine where the output files
+        will be written (see path above). You can have multiples of the
+        same sampler, but with different schema (which means they will
+        have different metrics) and they will be stored in different
+        containers (and therefore files).
+ +STORE COLUMN ORDERING +======================================= + +This store generates output columns in a sequence influenced by the +sampler data registration. Specifically, the column ordering is + + Time, Time_usec, ProducerName, \* + +The column sequence of is the order in which the +metrics are added into the metric set by the sampler. + +NOTES +======================= + +None. + +BUGS +====================== + +None known. + +EXAMPLES +========================== + +Within ldmsd_controller or in a ldmsd command script file + +:: + + load name=store_tutorial + + + config name=store_tutorial path=/tmp/store + + + strgp_add name=store_tutorial1 plugin=store_tutorial schema=test1 container=tutorial_sampler1 + + + strgp_prdcr_add name=store_tutorial1 regex=.* + + + strgp_start name=store_tutorial1 + + + strgp_add name=store_tutorial2 plugin=store_tutorial schema=test2 container=tutorial_sampler2 + + + strgp_prdcr_add name=store_tutorial2 regex=.* + + + strgp_start name=store_tutorial2 + + + strgp_add name=store_tutorial3 plugin=store_tutorial schema=test3 container=tutorial_sampler3 + + + strgp_prdcr_add name=store_tutorial3 regex=.* + + + strgp_start name=store_tutorial3 + +| > ls /tmp/store +| tutorial_sampler1 tutorial_sampler2 tutorial_sampler +| > more /tmp/store/tutorial_sampler1/test1 +| 1571943275.194664,194664,localhost1,1,0,0,13,26,39,52,65,78,91,104,117,130 +| 1571943276.195789,195789,localhost1,1,0,0,14,28,42,56,70,84,98,112,126,140 +| 1571943277.196916,196916,localhost1,1,0,0,15,30,45,60,75,90,105,120,135,150 +| 1571943278.198051,198051,localhost1,1,0,0,16,32,48,64,80,96,112,128,144,160 +| 1571943279.199184,199184,localhost1,1,0,0,17,34,51,68,85,102,119,136,153,170 + +SEE ALSO +========================== + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), +Plugin_tutorial_sampler(7), Plugin_store_csv(7) diff --git a/rtd/man2rst/Plugin_stream_csv_store.rst b/rtd/man2rst/Plugin_stream_csv_store.rst new file mode 100644 index 000000000..0a80eedb3 --- /dev/null +++ b/rtd/man2rst/Plugin_stream_csv_store.rst @@ -0,0 +1,261 @@ +======================= +Plugin_stream_csv_store +======================= + +:Date: 03 Oct 2021 + +.. contents:: + :depth: 3 +.. + +NAME +======================== + +Plugin_stream_csv_store - man page for the LDMS stream_csv_store plugin + +SYNOPSIS +============================ + +| Within ldmsd_controller or a configuration file: +| config name=stream_csv_store [ = ] + +DESCRIPTION +=============================== + +With LDMS (Lightweight Distributed Metric Service), plugins for the +ldmsd (ldms daemon) are configured via ldmsd_controller or a +configuration file. The stream_csv_store plugin is a DEVELOPMENTAL store +that writes out either a single stream's data to csv format if the input +type is a well-known json format or writes out the raw messages if the +input type is str. Input type will be determined by the +hello_cat_publisher or similar. + +CONFIGURATION ATTRIBUTE SYNTAX +================================================== + +**config** + | name=stream_csv_store path= container= + stream= [flushtime=] [buffer=<0/1>] [rolltype= + rollover= rollagain=] + | configuration line + + name= + | + | This MUST be stream_csv_store. + + path= + | + | path to the directory of the csv output file + + container= + | + | directory of the csv output file + + stream= + | + | csv list of streams to which to subscribe. + + flushtime= + | + | Flush any file that has not received data on its stream in the + last N sec. 
+        This is asynchronous to any buffering or rollover that is
+        occurring. The minimum time, if enabled, is 120 sec. This will
+        occur again at this interval if there is still no data received.
+
+   buffer=<0/1>
+      |
+      | Optional buffering of the output. 0 to disable buffering, 1 to
+        enable it with autosize (default).
+
+   rolltype=<rolltype>
+      |
+      | By default, the store does not roll over and the data is written
+        to a continuously open filehandle. Rolltype and rollover are
+        used in conjunction to enable the store to manage rollover,
+        including flushing before rollover. The header will be rewritten
+        when a roll occurs. Valid options are:
+
+      1
+         |
+         | wake approximately every rollover seconds and roll. Rollover
+           is suppressed if no data at all has been written.
+
+      2
+         |
+         | wake daily at rollover seconds after midnight (>=0) and roll.
+           Rollover is suppressed if no data at all has been written.
+
+      3
+         |
+         | roll after approximately rollover records are written.
+
+      4
+         |
+         | roll after approximately rollover bytes are written.
+
+      5
+         |
+         | wake at rollover seconds after midnight (>=0) and roll, then
+           repeat every rollagain (> rollover) seconds during the day.
+           For example "rollagain=3600 rollover=0 rolltype=5" rolls
+           files hourly. Rollover is suppressed if no data at all has
+           been written.
+
+   rollover=<rollover>
+      |
+      | The rollover value controls the frequency of rollover (e.g.,
+        number of bytes, number of records, time interval, seconds after
+        midnight). Note that these values are estimates due to the
+        nature of thread wake-ups. Also, for rolltypes 3 and 4, there is
+        a minimum delay of ROLL_LIMIT_INTERVAL seconds between rollovers
+        no matter how fast the data is being received, which may lead to
+        larger than expected data files for small values of rollover.
+
+JSON FORMAT AND OUTPUT HEADER AND FORMAT
+============================================================
+
+The json is expected to be something like:
+
+::
+
+   {"foo":1, "bar":2, "zed-data":[{"count":1, "name":"xyz"},{"count":2, "name":"abc"}]}
+
+Note the brackets. There will be at most one list. It is expected that
+each dictionary in the list will have the same item names. Everything
+else must be singleton data items.
+
+The header is generated from the first json ever received. If that first
+json is missing the list, or if the list has no entries, then list data
+will not appear in the header and will not be parsed in subsequent data
+lines. The header values will be the singleton names (e.g., foo, bar),
+and a list will be broken up into an item per dictionary key, with
+names of the form listname:dictname (e.g., zed-data:count,
+zed-data:name).
+
+There can be any number of dictionaries in a list. Data lines with
+multiple dictionaries will be written out in the csv as separate lines,
+with the singleton items repeated in each line like:
+
+::
+
+   #foo,bar,zed-data:count,zed-data:name
+   1,2,1,xyz
+   1,2,2,abc
+
+There will be a header in every output file (there can be more than one
+output file because of rollover).
+
+STORE OUTPUT FILENAME
+=========================================
+
+The filename will be '<stream>.<timestamp>' (e.g., foo.123456789).
+The timestamp is determined when the store is started or rolled over and
+the file is created. That may be considerably earlier than when data is
+streamed to the store.
+
+STORE COLUMN ORDERING
+=========================================
+
+There is only column ordering for 'json' format. There is no column
+ordering for 'str' format. 'str' format will always be written out, no
+matter what the 'json' header keys may be.
+The json order is arbitrary.
+
+TIMING INFORMATION
+======================================
+
+Options for timing information are currently driven by #defines in the
+source code.
+
+TIMESTAMP_STORE
+   |
+   | Set by #define or #undef TIMESTAMP_STORE. This will write out an
+     absolute timestamp in the file as the last item in the csv and is
+     called 'store_recv_time' in the header. The timestamp is obtained
+     only once, when the function is entered (e.g., if a data line has
+     multiple dicts, this will result in multiple output lines, each of
+     which will have the same additional timestamp value). Both string
+     and json are timestamped.
+
+STREAM_CSV_DIAGNOSTICS
+   |
+   | Set by #define or #undef STREAM_CSV_DIAGNOSTICS. This will write
+     out diagnostic info to the log when stream_cb is called.
+
+BUGS
+========================
+
+No known bugs.
+
+NOTES
+=========================
+
+This store is in development and may be changed at any time.
+
+The store supports more than one stream. There is currently no
+performance guidance about the number of streams and amount of data.
+
+There is no way to know if a stream will actually be used or if a final
+value is received. Therefore, this store will need to be restarted if
+you want to use it with a new stream or if you want to use the same
+stream name, but with different fields in the json.
+
+It is possible that with buffering, if a stream's sends are ended, there
+may still be unflushed data for a file.
+
+There is no way to remove a stream from the index nor to unsubscribe.
+That is, there is nothing akin to the open_store and close_store pair
+of an actual store plugin. Note that this store is in development and
+options are changing. For example, RESET functionality has been removed
+and flushtime functionality has changed.
+
+Note the restrictions on the data input above, and how they affect the
+header.
+
+EXAMPLES
+============================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=stream_csv_store
+   config name=stream_csv_store path=XYZ/store container=csv stream=foo buffer=1
+   # don't call anything else on the store; the store action is called by a callback triggered by the stream.
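+   # A hedged sketch of optional tuning, not part of the original example:
+   # per the options above, rolltype=5 rollover=0 rollagain=3600 would roll
+   # files hourly, and flushtime=600 would flush any stream idle for 10
+   # minutes (the minimum is 120 sec), e.g.:
+   # config name=stream_csv_store path=XYZ/store container=csv stream=foo buffer=1 rolltype=5 rollover=0 rollagain=3600 flushtime=600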
+
+   prdcr_add name=localhost1 host=localhost type=active xprt=sock port=52001 interval=20000000
+   prdcr_subscribe stream=foo regex=localhost*
+   prdcr_start name=localhost1
+
+Testdata:
+
+::
+
+   cat XXX/testdata.txt
+   {"job-id" : 10364, "rank" : 1, "kokkos-perf-data" : [ {"name" : "SPARTAFOO0", "count": 0, "time": 0.0000},{"name" : "SPARTAFOO1", "count": 1, "time": 0.0001},{"name" : "SPARTAFOO2", "count": 2, "time": 0.0002},{"name" : "SPARTAFOO3", "count": 3, "time": 0.0003},{"name" : "SPARTAFOO4", "count": 4, "time": 0.0004},{"name" : "SPARTAFOO5", "count": 5, "time": 0.0005},{"name" : "SPARTAFOO6", "count": 6, "time": 0.0006},{"name" : "SPARTAFOO7", "count": 7, "time": 0.0007},{"name" : "SPARTAFOO8", "count": 8, "time": 0.0008},{"name" : "SPARTAFOO9", "count": 9, "time": 0.0009}] }
+
+Publish:
+
+::
+
+   ldmsd_stream_publish -x sock -h localhost -p 52001 -s foo -t json -f XXX/testdata.txt -a
+
+Output:
+
+::
+
+   cat XYZ/store/csv/foo.1614306320
+   rank,job-id,kokkos-perf-data:time,kokkos-perf-data:name,kokkos-perf-data:count,store_recv_time
+   1,10364,0.000000,"SPARTAFOO0",0,1614306329.167736
+   1,10364,0.000100,"SPARTAFOO1",1,1614306329.167736
+   1,10364,0.000200,"SPARTAFOO2",2,1614306329.167736
+   1,10364,0.000300,"SPARTAFOO3",3,1614306329.167736
+   1,10364,0.000400,"SPARTAFOO4",4,1614306329.167736
+   1,10364,0.000500,"SPARTAFOO5",5,1614306329.167736
+   1,10364,0.000600,"SPARTAFOO6",6,1614306329.167736
+   1,10364,0.000700,"SPARTAFOO7",7,1614306329.167736
+   1,10364,0.000800,"SPARTAFOO8",8,1614306329.167736
+   1,10364,0.000900,"SPARTAFOO9",9,1614306329.167736
+
+SEE ALSO
+============================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7),
+ldmsd_stream_publish(7), Plugin_hello_sampler(7)
diff --git a/rtd/man2rst/Plugin_sysclassib.rst b/rtd/man2rst/Plugin_sysclassib.rst
new file mode 100644
index 000000000..d93c04b0b
--- /dev/null
+++ b/rtd/man2rst/Plugin_sysclassib.rst
@@ -0,0 +1,73 @@
+=================
+Plugin_sysclassib
+=================
+
+:Date: 10 Feb 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+==================
+
+Plugin_sysclassib - man page for the LDMS sysclassib plugin
+
+SYNOPSIS
+======================
+
+| Within ldmsd_controller or in a configuration file
+| config name=sysclassib [ <attr>=<value> ]
+
+DESCRIPTION
+=========================
+
+The sysclassib plugin provides IB metric information in raw and rate
+(per second) forms.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+============================================
+
+The sysclassib plugin uses the sampler_base base class. This man page
+covers only the configuration attributes, or those with default values,
+specific to this plugin; see ldms_sampler_base.man for the attributes
+of the base class.
+
+**config**
+   | name=<plugin_name> [schema=<schema>] [ports=<ports>]
+     [metrics_type=<0/1>]
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be sysclassib.
+
+   metrics_type=<0/1>
+      |
+      | Values are 0 or 1. 0 = counter data only. 1 = include rate data
+        (per second) in addition. Default is 0.
+
+   ports=<ports>
+      |
+      | CSV list of the form CARD1.PORT1,CARD2.PORT2. Default is all
+        discovered values.
+
+BUGS
+==================
+
+No known bugs.
+
+EXAMPLES
+======================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=sysclassib
+   config name=sysclassib component_id=1 producer=vm1_1 instance=vm1_1/sysclassib metrics_type=1
+   start name=sysclassib interval=1000000 offset=0
+
+SEE ALSO
+======================
+
+ldms(7), Plugin_procnetdev(7), ldms_sampler_base(7)
diff --git a/rtd/man2rst/Plugin_syspapi_sampler.rst b/rtd/man2rst/Plugin_syspapi_sampler.rst
new file mode 100644
index 000000000..7aa35cb66
--- /dev/null
+++ b/rtd/man2rst/Plugin_syspapi_sampler.rst
@@ -0,0 +1,135 @@
+======================
+Plugin_syspapi_sampler
+======================
+
+:Date: 30 Sep 2019
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+=======================
+
+Plugin_syspapi_sampler - man page for the LDMSD syspapi_sampler plugin
+
+SYNOPSIS
+===========================
+
+Within ldmsd_controller or a configuration file: **config**
+**name=syspapi_sampler** **producer=**\ *PRODUCER*
+**instance=**\ *INSTANCE* [ **schema=**\ *SCHEMA* ] [
+**component_id=**\ *COMPONENT_ID* ] [ **cfg_file=**\ *PATH* ] [
+**events=**\ *EVENTS* ] [ **cumulative=**\ *0|1* ] [
+**auto_pause=**\ *0|1* ]
+
+DESCRIPTION
+==============================
+
+**syspapi_sampler** collects system-wide hardware event counters using
+Linux perf events (see **perf_event_open**\ (2)), but uses PAPI event
+names. **libpapi** and **libpfm** are used to translate PAPI event names
+to Linux perf event attributes. In the case of per-process (job) data
+collection, please see **Plugin_papi_sampler**. There are two approaches
+to define a list of events: 1) the **events** option, or 2) a PAPI JSON
+config file. For the **events** option, simply list the events of
+interest separated by commas (e.g. events=PAPI_TOT_INS,PAPI_TOT_CYC).
+For the PAPI JSON config file (**cfg_file** option), the format of the
+file is as follows:
+
+   ::
+
+      {
+         "schema": "my_syspapi",
+         "events": [
+            ...
+         ]
+      }
+
+The **schema** is optional, but if specified in the JSON config file, it
+takes precedence over the schema name given at the **config** command.
+**events** is a list of PAPI event names (strings).
+
+If both the **cfg_file** and **events** options are given to the config
+command, the lists are concatenated. Please note that an event that
+appears on both lists will result in an error.
+
+**auto_pause**\ =\ *1* (which is the default) makes **syspapi_sampler**
+pause its data sampling when it receives a notification from
+**papi_sampler** that a job is active, and resume its data sampling
+when it receives a notification from **papi_sampler** that all jobs have
+terminated. This is to prevent perf system resource contention. We have
+seen all-0 counters on **papi_sampler**, without any errors (possibly a
+silent error), when running it alongside an active **syspapi_sampler**.
+
+CONFIG OPTIONS
+=================================
+
+**name=syspapi_sampler**
+   This MUST be syspapi_sampler (the name of the plugin).
+
+**producer=**\ *PRODUCER*
+   The name of the data producer (e.g. hostname).
+
+**instance=**\ *INSTANCE*
+   The name of the set produced by this plugin.
+
+**schema=**\ *SCHEMA*
+   The optional schema name (default: syspapi_sampler). Please note that
+   the **"schema"** from the JSON **cfg_file** overrides this option.
+
+**component_id=**\ *COMPONENT_ID*
+   An integer identifying the component (default: *0*).
+
+**cfg_file=**\ *PATH*
+   The path to the JSON-formatted config file. This is optional if the
+   **events** option is specified. Otherwise, this option is required.
+
+**events=**\ *EVENTS*
+   The comma-separated list of PAPI events of interest (e.g.
+   *PAPI_TOT_INS,PAPI_TOT_CYC*). This is optional if **cfg_file** is
+   specified. Otherwise, this option is required.
+
+**cumulative=**\ *0*\ **\|**\ *1*
+   *0* (default) for non-cumulative data sampling (reset after read), or
+   *1* for cumulative data sampling.
+
+**auto_pause=**\ *0*\ **\|**\ *1*
+   *0* to ignore **papi_sampler** pause/resume notifications, or *1*
+   (default) to pause/resume according to notifications from
+   **papi_sampler**.
+
+BUGS
+=======================
+
+No known bugs.
+
+EXAMPLES
+===========================
+
+Plugin configuration example:
+
+   ::
+
+      load name=syspapi_sampler
+      config name=syspapi_sampler producer=${HOSTNAME} \
+             instance=${HOSTNAME}/syspapi component_id=2 \
+             cfg_file=/tmp/syspapi.json
+      start name=syspapi_sampler interval=1000000 offset=0
+
+JSON cfg_file example:
+
+   ::
+
+      {
+         "events": [
+            "PAPI_TOT_INS",
+            "PAPI_TOT_CYC"
+         ]
+      }
+
+SEE ALSO
+===========================
+
+**Plugin_papi_sampler**\ (7), **ldmsd**\ (8), **ldms_quickstart**\ (7),
+**ldmsd_controller**\ (8), **ldms_sampler_base**\ (7).
diff --git a/rtd/man2rst/Plugin_tutorial_sampler.rst b/rtd/man2rst/Plugin_tutorial_sampler.rst
new file mode 100644
index 000000000..2a0188d7a
--- /dev/null
+++ b/rtd/man2rst/Plugin_tutorial_sampler.rst
@@ -0,0 +1,99 @@
+=======================
+Plugin_tutorial_sampler
+=======================
+
+:Date: 24 Oct 2019
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+========================
+
+Plugin_tutorial_sampler - man page for the LDMS tutorial_sampler plugin
+
+SYNOPSIS
+============================
+
+| Within ldmsd_controller or a configuration file:
+| config name=tutorial_sampler [ <attr>=<value> ]
+
+DESCRIPTION
+===============================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The tutorial_sampler plugin is a demo sampler
+described in the LDMSCON2019 tutorial "LDMS v4: Sampler and Store
+Writing".
+
+This sampler is a simplified version of test_sampler, with a fixed
+maximum number of sets and u64 data types only. The maximum number of
+sets is determined by MAXSETS in the source.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+==================================================
+
+The tutorial_sampler plugin uses the sampler_base base class. This man
+page covers only the configuration attributes, or those with default
+values, specific to this plugin; see ldms_sampler_base.man for the
+attributes of the base class.
+
+**config**
+   | name=<plugin_name> [schema=<sname>] [num_metrics=<num_metrics>]
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be tutorial_sampler.
+
+   num_metrics=<num_metrics>
+      |
+      | Optional number of metrics for this set. Metrics will be U64.
+        Metric names will be 'metric\_%d'. If not specified, the default
+        number of metrics is determined by DEFAULTNUMMETRICS in the
+        source.
+
+   schema=<sname>
+      |
+      | Optional schema name. It is intended that any sets with
+        different metrics have a different schema. If not specified,
+        this will default to \`tutorial_sampler\`. Therefore, if you are
+        creating multiple sets in this sampler, you will most likely
+        want to define a schema for each set.
+
+BUGS
+========================
+
+No known bugs.
+
+EXAMPLES
+============================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   config name=tutorial_sampler producer=localhost1 instance=localhost1/test1 schema=test1 component_id=1
+   config name=tutorial_sampler producer=localhost1 instance=localhost1/test2 schema=test2 component_id=2 num_metrics=5
+   config name=tutorial_sampler producer=localhost1 instance=localhost1/test3 schema=test3 component_id=1 num_metrics=2 job_set=localhost1/jobid
+   start name=tutorial_sampler interval=1000000
+
+::
+
+   > ldms_ls
+   localhost1/test1: consistent, last update: Thu Oct 24 10:55:14 2019 -0600 [223680us]
+   M u64 component_id 1
+   D u64 job_id 0
+   D u64 app_id 0
+   D u64 metric0 2
+   D u64 metric1 4
+   D u64 metric2 6
+   D u64 metric3 8
+   D u64 metric4 10
+   D u64 metric5 12
+   D u64 metric6 14
+   D u64 metric7 16
+   D u64 metric8 18
+   D u64 metric9 20
+   localhost1/test2: consistent, last update: Thu Oct 24 10:55:14 2019 -0600 [223699us]
+   M u64 component_id 2
+   D u64 job_id 0
+   D u64 app_id 0
+   D u64 metric0 4
+   D u64 metric1 8
+   D u64 metric2 12
+   D u64 metric3 16
+   D u64 metric4 20
+   localhost1/test3: consistent, last update: Thu Oct 24 10:55:14 2019 -0600 [223717us]
+   M u64 component_id 1
+   D u64 job_id 0
+   D u64 app_id 0
+   D u64 metric0 6
+   D u64 metric1 12
+
+SEE ALSO
+============================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7),
+test_sampler(7), Plugin_store_tutorial(7)
diff --git a/rtd/man2rst/Plugin_tx2mon.rst b/rtd/man2rst/Plugin_tx2mon.rst
new file mode 100644
index 000000000..0f41c1a73
--- /dev/null
+++ b/rtd/man2rst/Plugin_tx2mon.rst
@@ -0,0 +1,185 @@
+=============
+Plugin_tx2mon
+=============
+
+:Date: 25 Dec 2020
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+==============
+
+Plugin_tx2mon - man page for the LDMS tx2mon plugin
+
+SYNOPSIS
+==================
+
+| Within ldmsd configuration
+| config name=tx2mon [ <attr>=<value> ]
+
+DESCRIPTION
+=====================
+
+The tx2mon plugin provides cpu and system-on-chip information from
+/sys/bus/platform/devices/tx2mon/[socinfo, node<n>_raw] and reports it
+in the same units as the tx2mon command-line utility.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+========================================
+
+The standard options from sampler_base apply. The specific options for
+tx2mon are listed here.
+
+**config**
+   | name=tx2mon [schema=<name>] [array=<bool>] [extra=<bool>]
+     [auto-schema=<bool>]
+
+   schema=<name>
+      |
+      | Optional schema name. Most storage backends require that the
+        same sampler on different nodes with different metric subsets
+        have a unique schema name. Use auto-schema=1 instead of, or in
+        addition to, schema to automatically meet the backend
+        requirement.
+
+   auto-schema=<bool>
+      |
+      | If true, change the schema name to tx2mon\_$X, where $X will be
+        a unique value derived from the data selection options. If both
+        schema and auto-schema=1 are given, the schema name given is
+        used as the base instead of "tx2mon".
+
+   array=<bool>
+      |
+      | For per-core data, report all array value elements if true.
+        Report only maximum and minimum values if false. The default is
+        false.
+
+   extra=<bool>
+      |
+      | For per-core data, report additional information on the internal
+        block frequencies and the set of system metrics. These
+        additional values are static. If false, this additional
+        information is not reported. The default is false.
+
+METRICS
+=================
+
+The sampler_base standard metrics are included. The following data is
+reported in a set instance per socket.
+ +:: + + node Number of socket i from + /sys/bus/platform/devices/tx2mon/node_raw + +The metrics listed here are named as their respective fields in +tx2mon/mc_oper_region.h. Where applicable, metrics are converted to the +units listed here from the raw values. + +:: + + counter Snapshot counter of the cpu. + + + Include the following metrics when array=true: + freq_cpu[] Frequency reading of each core. + tmon_cpu[] Temperature reading of each core. (deg. C). + +Include the following metrics when array=false: + +:: + + freq_cpu_min Minimum value found in freq_cpu. + freq_cpu_max Maximum value found in freq_cpu. + tmon_cpu_min Minimum value found in tmon_cpu. (deg. C) + tmon_cpu_max Maximum value found in tmon_cpu. (deg. C) + +Include the following metrics unconditionally: + +:: + + tmon_soc_avg Average temperature on the SoC. (deg. C) + pwr_core Power consumed by all cores on the SoC. (Watt). + pwr_sram Power consumed by all internal SRAM on the SoC. (Watt). + pwr_mem Power consumed by the LLC ring on the SoC. (Watt) + pwr_soc Power consumed by SoC blocks that are misc. (Watt) + v_core Voltage consumed by all cores on the SoC. (V) + v_sram Voltage consumed by all internal SRAM on the SoC. (V) + v_mem Voltage consumed by the LLC ring on the SoC. (V) + v_soc Voltage consumed by SoC blocks that are misc. (V). + active_evt Provides a bit list of active events that are causing throttling. + Temperature Active event with a bit flag where 1 is true. + Power Active event with a bit flag where 1 is true. + External Active event with a bit flag where 1 is true. + Unk3 Active event with a bit flag where 1 is true. + Unk4 Active event with a bit flag where 1 is true. + Unk5 Active event with a bit flag where 1 is true. + temp_evt_cnt Total number of temperature events. + pwr_evt_cnt Total number of power events. + ext_evt_cnt Total number of exteral events. + temp_throttle_ms Time duration of all temperature events in ms. + pwr_throttle_ms Time duration of all power events in ms. + ext_throttle_ms Time duration of all external events in ms. + cpu_num Which processor the data comes from. + +Include the following metrics with extra=true: + +:: + + temp_abs_max Absolute maximum limit of temperature beyond + which the SoC will throttle voltage and frequency. + temp_soft_thresh Soft limit of temperature beyond which the SoC will + throttle voltage and frequency down. + temp_hard_thresh Hard limit of temperature beyond which the SoC will + throttle voltage and frequency down. + freq_mem_net Frequency reading of the SoC and ring connection. + freq_max Maximum limit of SoC frequency. Depends on the SKU. + freq_min Minimum limit of SoC frequency. Depends on the SKU. + freq_socs Internal block frequency of SOC South clock. (Mhz) + freq_socn Internal block frequency of SOC North clock. (Mhz) + +EXAMPLES +================== + +Within ldmsd_controller or a configuration file: + +:: + + load name=tx2mon + config name=tx2mon producer=vm1_1 component_id=1 instance=vm1_1/tx2mon + start name=tx2mon interval=1000000 + +NOTES +=============== + +By default, root privilege is required to read the data files produced +by tx2mon_kmod. The kernel module tx2mon_kmod must be loaded, e.g. by +"modprobe /lib/modules/$(uname -r)/extra/tx2mon_kmod.ko". + +The current generated schema names are: tx2mon, tx2mon_01, +tx2mon_11\_$n_core, and tx2mon_10\_$n_core, where the suffix is derived +as \_(array)(extra)[\_ncore]. "tx2mon" is used when tx2mon_00 would +occur. If present, $n_core is the size of the array metrics. 
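+
+For example, following the suffix rule above and assuming a 28-core SoC
+(the core count here is hypothetical), the derived schema names would
+be:
+
+::
+
+   array=false extra=false  ->  tx2mon
+   array=false extra=true   ->  tx2mon_01
+   array=true  extra=false  ->  tx2mon_10_28
+   array=true  extra=true   ->  tx2mon_11_28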
+ +There is additional power consumed by cross-socket interconnect, PCIe, +DDR and other IOs that is not currently reported by this tool. + +tx2mon reports on the sensors monitored by the on-chip management +controller. Some of the on-chip components (such as the IO blocks) do +not have sensors and therefore the voltage and power measurements of +these blocks are not provided by tx2mon. + +On systems that are not arm 64 (aarch64 from uname), the sampler does +nothing. On systems that are aarch64 but missing +/sys/bus/platform/devices/tx2mon, the sampler issues an error about the +missing tx2mon kernel module. + +SEE ALSO +================== + +ldmsd(8), ldms_sampler_base + +:: diff --git a/rtd/man2rst/Plugin_variable.rst b/rtd/man2rst/Plugin_variable.rst new file mode 100644 index 000000000..c0514fbeb --- /dev/null +++ b/rtd/man2rst/Plugin_variable.rst @@ -0,0 +1,88 @@ +=============== +Plugin_variable +=============== + +:Date: 08 Jul 2020 + +.. contents:: + :depth: 3 +.. + +NAME +================ + +Plugin_variable - man page for the LDMS variable plugin + +SYNOPSIS +==================== + +| Within ldmsd_controller or a configuration file: +| config name=variable [ = ] + +DESCRIPTION +======================= + +The variable plugin provides test data with a periodically redefined +schema and set. Currently the period is every 4th sample. The data of +the sampler is monotonically increasing integers. The data set size +changes with each redefinition. + +CONFIGURATION ATTRIBUTE SYNTAX +========================================== + +The variable plugin does not use the sampler_base base class, but +follows the naming conventions of sampler_base except for schema and +instance name. + +**config** + | name= [schema=] + | configuration line + + name= + | + | This MUST be variable. + + schema= + | + | Optional schema name prefix. The string given will be suffixed + with an integer N in the range 1-9 to create the schema name. + The schema will also contain N integer metrics. + + instance= + | + | Optional instance name prefix. The string given will be suffixed + with an integer in the range 1-9 to create the instance name. If + not specified, will default prefix is \`$HOST/variable\`. + +NOTES +================= + +The intent of the sampler is to simulate any sampler which may under +some condition redefine the same instance name and schema name for a set +after properly retiring a different definition using the same names. It +is not for production use. + +To collect CSV data from this sampler, configure 9 store policies +matching ${schema}[1-9], since the current storage policy mechanism does +not allow matching multiple schemas. + +BUGS +================ + +No known bugs. + +EXAMPLES +==================== + +Within ldmsd_controller or a configuration file: + +:: + + load name=variable + config name=variable producer=vm1_1 instance=vm1_1/variable + start name=variable interval=1000000 + +SEE ALSO +==================== + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7) diff --git a/rtd/man2rst/Plugin_variorum_sampler.rst b/rtd/man2rst/Plugin_variorum_sampler.rst new file mode 100644 index 000000000..43f01b5a8 --- /dev/null +++ b/rtd/man2rst/Plugin_variorum_sampler.rst @@ -0,0 +1,101 @@ +======================= +Plugin_variorum_sampler +======================= + +:Date: 27 Jun 2022 + +.. contents:: + :depth: 3 +.. 
+
+NAME
+========================
+
+Plugin_variorum_sampler - man page for the LDMS Variorum plugin
+
+SYNOPSIS
+============================
+
+| Within ldmsd_controller or a configuration file:
+| config name=variorum_sampler [common attributes]
+
+DESCRIPTION
+===============================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The variorum_sampler plugin provides power data
+using the JSON API in Variorum, a vendor-neutral library that provides
+access to low-level hardware knobs. The sampler, when configured,
+automatically detects the number of sockets on the host machine and then
+provides, for each socket, an LDMS record containing power data. For
+each socket, the values provided are: node power consumption in Watts
+(identical across sockets); socket ID number; CPU power consumption in
+Watts; GPU power consumption in Watts (aggregated across all GPUs on the
+socket, and reported as -1 on unsupported platforms); and memory power
+consumption in Watts.
+
+The variorum sampler depends on Variorum 0.6.0 or higher and Jansson.
+The sampler cannot be built without these libraries. If either library
+is installed in a non-standard location, paths to the respective install
+directories should be provided to Autoconf using the
+--with-libjansson-prefix and/or --with-libvariorum-prefix flag.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+==================================================
+
+The variorum sampler plugin uses the sampler_base base class. This man
+page covers only the configuration attributes, or those with default
+values, specific to this plugin; see ldms_sampler_base.man for the
+attributes of the base class.
+
+**config**
+   | name=<plugin_name> [schema=<schema>]
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be variorum_sampler.
+
+   schema=<schema>
+      |
+      | Optional schema name. It is intended that the same sampler on
+        different nodes with different metrics have a different schema.
+        If not specified, this will default to \`variorum_sampler\`.
+
+BUGS
+========================
+
+No known bugs; however, if Variorum cannot access the hardware knobs,
+the sampler will be unable to access any data. This will result in an
+error being printed to the log file: "variorum_sampler: unable to obtain
+JSON object data". This error can be resolved by ensuring that hardware
+knob access is enabled using the requirements here:
+https://variorum.readthedocs.io/en/latest/HWArchitectures.html
+
+EXAMPLES
+============================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=variorum_sampler
+   config name=variorum_sampler producer=vm1_1 instance=vm1_1/variorum_sampler
+   start name=variorum_sampler interval=1000000
+
+AUTHORS
+===========================
+
+Jessica Hannebert (Colorado College, internship at Lawrence Livermore
+National Laboratory). Tapasya Patki (Lawrence Livermore National
+Laboratory). Kathleen Shoga (Lawrence Livermore National Laboratory).
+Stephanie Brink (Lawrence Livermore National Laboratory). Barry Rountree
+(Lawrence Livermore National Laboratory).
+
+SEE ALSO
+============================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)
diff --git a/rtd/man2rst/Plugin_vmstat.rst b/rtd/man2rst/Plugin_vmstat.rst
new file mode 100644
index 000000000..bbfd2f0ee
--- /dev/null
+++ b/rtd/man2rst/Plugin_vmstat.rst
@@ -0,0 +1,70 @@
+=============
+Plugin_vmstat
+=============
+
+:Date: 04 Dec 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+==============
+
+Plugin_vmstat - man page for the LDMS vmstat plugin
+
+SYNOPSIS
+==================
+
+| Within ldmsd_controller or in a configuration file
+| config name=vmstat [ <attr>=<value> ]
+
+DESCRIPTION
+=====================
+
+With LDMS (Lightweight Distributed Metric Service), plugins for the
+ldmsd (ldms daemon) are configured via ldmsd_controller or a
+configuration file. The vmstat plugin provides information from
+/proc/vmstat.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+========================================
+
+The vmstat plugin uses the sampler_base base class. This man page covers
+only the configuration attributes, or those with default values,
+specific to this plugin; see ldms_sampler_base.man for the attributes
+of the base class.
+
+**config**
+   | name=<plugin_name> [schema=<sname>]
+   | configuration line
+
+   name=<plugin_name>
+      |
+      | This MUST be vmstat.
+
+   schema=<sname>
+      |
+      | Optional schema name. It is intended that the same sampler on
+        different nodes with different metrics have a different schema.
+        If not specified, this will default to \`vmstat\`.
+
+BUGS
+==============
+
+No known bugs.
+
+EXAMPLES
+==================
+
+Within ldmsd_controller or a configuration file:
+
+::
+
+   load name=vmstat
+   config name=vmstat producer=vm1_1 instance=vm1_1/vmstat
+   start name=vmstat interval=1000000
+
+SEE ALSO
+==================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)
diff --git a/rtd/man2rst/ldms-csv-anonymize.rst b/rtd/man2rst/ldms-csv-anonymize.rst
new file mode 100644
index 000000000..fe14aebe4
--- /dev/null
+++ b/rtd/man2rst/ldms-csv-anonymize.rst
@@ -0,0 +1,183 @@
+==================
+ldms-csv-anonymize
+==================
+
+:Date: 18 Apr 2019
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+===================
+
+ldms-csv-anonymize - anonymize columns of csv files
+
+SYNOPSIS
+=======================
+
+ldms-csv-anonymize -h
+
+ldms-csv-anonymize [--input csv-file] [--out-dir OUT_DIR] [--col-sep
+COL_SEP] [--seed SEED] [--save-maps SAVE_MAPS] [--imap IMAP] [--nmap
+NMAP] [--pmap PMAP] [--hmap HMAP] [--debug] [M:C [M:C ...]]
+
+ldms-csv-anonymize --gen-args GEN_ARGS
+
+DESCRIPTION
+==========================
+
+The ldms-csv-anonymize command rewrites ldms and slurm data files
+column-wise with filters specified by the M:C arguments. M:C is a
+mapping:column-number pair or a filename. M is one of int, path, name,
+or host. C is a nonzero number. Negative numbers count back from the
+last column.
+
+OPTIONS
+======================
+
+--input=<files>
+   |
+   | The argument is a file name or space-separated list of file names
+     to be processed. Filenames cannot contain whitespace.
+
+--out-dir=<path>
+   |
+   | Path is a directory (must pre-exist and should not be the same as
+     any directory containing the input) which will be filled with the
+     changed files. The original files are not changed. If an output
+     file name coincides with one of the inputs, the input data may be
+     lost or corrupted.
+
+--col-sep=<char>
+   |
+   | Split columns at this character. The default is comma.
+
+--save-maps=<prefix>
+   |
+   | The path prefix for the generated map files. If a resulting map
+     filename coincides with an existing file, the existing file is
+     overwritten.
+
+--imap=<file>
+   |
+   | An integer mapping file to preload. It must contain two columns of
+     integers and magic. Normally it is the output of a prior run. See
+     MAPS below.
+
+--nmap=<file>
+   |
+   | A name mapping file to preload. It must contain two columns of
+     names and magic. Normally it is the output of a prior run.
+     Each real name is replaced with 'n' and a sequential number. See
+     MAPS below.
+
+--pmap=<file>
+   |
+   | A path element mapping file to preload. It must contain two columns
+     of path elements and magic. Normally it is the output of a prior
+     run. Path elements are unqualified subdirectory names. Each unique
+     subdirectory name is replaced with 'p' and a sequential number,
+     allowing the directory hierarchy to be preserved without revealing
+     application identities. See MAPS below.
+
+--hmap=<file>
+   |
+   | A host name mapping file to preload. It must contain columns of
+     host elements and magic. It may be host name fragment information
+     or the output of a prior run. Any hostname found in the input data
+     which cannot be mapped to the host elements will cause an
+     anonymization error. There is no default handling of unknown hosts.
+     See MAPS below.
+
+--gen-args=M:H[,M:H]\*,<file>
+   |
+   | Creating the M:C specification needed in a data transformation run
+     can be done by first using the argument generation mode. Given a
+     file starting with a header line of column names and the list of
+     method:name pairs, this command displays the corresponding list of
+     M:C arguments needed for the data transformation.
+
+--debug
+   |
+   | Echo some details of the transformation as it runs.
+
+--seed
+   |
+   | Supply a seed to the random number generator. No random values are
+     used at this time in the processing, however.
+
+MAPS and MAGIC
+=============================
+
+Map files all start with a line of the form "#anonymize-csv-map <kind>",
+where kind is one of the supported M values. The columns of the file are
+separated by whitespace. The first column is the item of input data to
+be replaced and the second column is the replacement. Multiple items
+from column 1 may have the same value in column 2.
+
+By default, map files are saved in the output directory as
+anonmap_Xmap.txt, where X is replaced with a kind indicator (i, p, n,
+h). The prefix option is used to relocate these outputs. They cannot be
+suppressed.
+
+In the special case of host names and host lists, name fragment
+substitutions are supported. Any appearance of a host list, such as
+gw[1,3-5], is expanded to single hostnames. Each host name is split at
+"-", and each fragment is checked for a replacement from the hmap file.
+Any fragment not found in the hmap has right-side digits 0-9 stripped,
+and mapping the remainder is attempted again; if successful, the
+stripped number is appended to the result, otherwise an error occurs.
+The fragments are rejoined with "-". When all hosts in the appearance
+have been rewritten, the host list is collapsed before output.
+
+The special host map element 'netdomains' is used to remove fully
+qualified domain suffixes. It is a comma-separated list of suffixes, and
+order matters (subdomains should come before their root if both appear).
+Suffix removal occurs before substitution.
+
+NOTES
+====================
+
+There is no column delete option; use cut(1) to remove entire columns.
+
+To ensure map consistency across multiple runs, use the map outputs as
+the map inputs to the second and subsequent runs.
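+
+As a sketch of the two-column layout described in MAPS and MAGIC (the
+integer values here are hypothetical), an imap file produced by a prior
+run could look like:
+
+::
+
+   #anonymize-csv-map int
+   5001 1
+   5002 2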
+ +EXAMPLES +======================= + +In bash: + +:: + + colargs=$(ldms-csv-anonymize \ + --gen-args=host:ProducerName,int:uid,name:username,jobid.HEADER) + + ldms-csv-anonymize $colargs \ + --out-dir=/tmp \ + --save-maps=anonjob_ \ + --hmap=/home/anonjob_hmap.txt \ + --input=/home/jobid.csv + +and in a host map file: + +:: + + #anonymize-csv-map host + netdomains .ca.sandia.gov,.sandia.gov + compute node + admin svc + +will cause compute01 to be replaced with node01 and admin7 to be +replaced with svc7. The .sandia.gov and .ca.sandia.gov domains will be +stripped. + +BUGS +=================== + +There is no pipeline filtering mode. + +SEE ALSO +======================= + +cut(1) diff --git a/rtd/man2rst/ldms-csv-export-sos.rst b/rtd/man2rst/ldms-csv-export-sos.rst new file mode 100644 index 000000000..28864c4bc --- /dev/null +++ b/rtd/man2rst/ldms-csv-export-sos.rst @@ -0,0 +1,213 @@ +=================== +ldms-csv-export-sos +=================== + +:Date: 18 Apr 2019 + +.. contents:: + :depth: 3 +.. + +NAME +==================== + +ldms-csv-export-sos - generate helper files needed by sos-import-csv + +SYNOPSIS +======================== + +ldms-csv-export-sos -h + +ldms-csv-export-sos [--data DATA] [--blacklist BLACKLIST] [--whitelist +WHITELIST] [--exclude EXCLUDE] [--include INCLUDE] [--schema-name +SCHEMA_NAME] [--schema-file SCHEMA_FILE] [--map-file MAP_FILE] +[--strip-udata] [--guess] [--widen] [--maxlines MAXLINES] [--assume +ASSUME] [--verbose] + +DESCRIPTION +=========================== + +The ldms-csv-export-sos command parses LDMS CSV file information to +generate corresponding map (and optionally schema) files used by +sos-import-csv. + +OPTIONS +======================= + +--data= + | + | DATA is a file name of a LDMS .HEADER, .KIND, or data file. The + file name and at least the first line of the file are digested to + determine the content and the column types. LDMS CSV file name + conventions ($schema[.$date] is associated with + $schema.HEADER.$date or $schema.KIND.$date in the same directory). + The file may be gzipped; if so, the matching data/HEADER/KIND files + must also be gzipped. + +--blacklist= + | + | BLACKLIST is the name of a file with column names to exclude from + the schema, one per line. leading # comments allowed in the file. + +--whitelist= + | + | WHITELIST is the name of a file with column names to include in the + schema, one per line. leading # comments allowed in the file. Any + other columns found are excluded. + +--exclude= + | + | LIST is a string of metric names separated by commas. Columns named + are excluded from the generated schema. + +--include= + | + | LIST is a string of metric names separated by commas. Columns named + are included in the generated schema and all other columns found + are excluded. + +--schema-name= + | + | NAME overrides the default schema name determined from the data + file name. + +--schema-file= + | + | Use an existing schema file FILE instead of generating a schema. + When not specified, a schema file is always generated. Schema files + may not be gzipped. + +--map-file= + | + | Override the output map file name derived from the data file name. + +--alias-file= + | + | Provide the list of metrics to rename when creating or matching a + schema discovered from a header line. + +--strip-udata + | + | Suppress output of .userdata fields and remove .value suffix from + schema element names. + +--guess + | + | Guess the ldms data column types. 
(can be slow on large files) + +--maxlines= + | + | Parse no more than MAXLINES to guess data types with the --guess + option. The default if unspecified is 100000 lines. + +--assume= + | + | Assume all unknown data columns are type ASSUME. + +--verbose + | + | Show process debugging details. + +--widen + | + | Widen numeric types discovered to 64 bits. + +METRIC FILTERING +================================ + +When an include or whitelist is specified, exclude and blacklist +arguments are ignored entirely. An include option cannot be used to +prune a blacklist file. + +When userdata is present in the CSV file, for these filters, metric +names should be written without the .value or .userdata suffix. + +NOTES +===================== + +The recommended export method is to use the .KIND file if available and +to use the options "--guess --widen --maxlines=2" for legacy LDMS files. +This tool is aware of the CSV conventions (up to LDMS v4) for columns +named Time, ProducerName, producer, compid, component_id, Time_usec, +DT_usec, jobid, job_id, app_id, uid, and names ending in .userdata. + +Both assume and guess options should be used judiciously. Know your data +before using SOS or any other database. The output schema file is +formatted for editability, and it should be adjusted before use with SOS +if any guess or assumption proves erroneous. + +BUGS +==================== + +There is no pipeline filtering mode. + +EXAMPLES +======================== + +To test sos-import-csv with the resulting files: + +:: + + + ldms-csv-export-sos --data=renamecsv.1553744481 \ + --strip-udata --schema-name=meminfo \ + --blacklist=exclude.renamecsv + + mkdir container + sos-db --path container --create + sos-schema --path container \ + --add renamecsv.SCHEMASOS.1553744481 + sos-import-csv \ + --path container \ + --csv renamecsv.1553744481 \ + --map renamecsv.MAPSOS.1553744481 \ + --schema meminfo \ + --status + sos_cmd -C container -l + sos_cmd -C container -q -S meminfo -X Time + +Other examples + +:: + + + # make schema and map from *81 with schema rename from file + ldms-csv-export-sos --data=renamecsv.1553744481 \ + --strip-udata --schema-name=meminfo \ + --blacklist=exclude.renamecsv + + # reuse schema and make map from *90 + ldms-csv-export-sos --data=renamecsv.1553744490 \ + --schema-file=renamecsv.SCHEMASOS.1553744481 + + # reuse schema and make map from *90 with alternate output name + ldms-csv-export-sos --data=renamecsv.1553744490 \ + --strip-udata \ + --schema-file=renamecsv.SCHEMASOS.1553744481 \ + --map-file=mymap + + # translate array example (when supported) + ldms-csv-export-sos --data=fptrans.HEADER --strip-udata + + # translate array with old schema (when supported) + ldms-csv-export-sos --data=fptrans2.HEADER \ + --schema-file=fptrans.SCHEMASOS + + # test input guess when x.14 does not exist + ldms-csv-export-sos --data=x.HEADER.14 --guess + + # test input guess when y.KIND.14 does not exist but y.14 does + ldms-csv-export-sos --data=y.HEADER.14 \ + --guess --maxlines=4000 + + # test input guess and widen + ldms-csv-export-sos --data=y.HEADER.14 \ + --guess --widen --maxlines=4 + + # test assume + ldms-csv-export-sos --data=y.HEADER.14 --assume=u32 + +SEE ALSO +======================== + +sos-import-csv(1) diff --git a/rtd/man2rst/ldms-ibnet-sampler-gen.rst b/rtd/man2rst/ldms-ibnet-sampler-gen.rst new file mode 100644 index 000000000..349134bae --- /dev/null +++ b/rtd/man2rst/ldms-ibnet-sampler-gen.rst @@ -0,0 +1,116 @@ +====================== +ldms-ibnet-sampler-gen 
+====================== + +:Date: 4 June 2020 + +.. contents:: + :depth: 3 +.. + +NAME +======================= + +ldms-ibnet-sampler-gen - man page for the LDMS ibnet plugin support +utility + +ldms-get-opa-network.sh - man page for the LDMS ibnet plugin support +utility + +SYNOPSIS +=========================== + +ldms-ibnet-sampler-gen --samplers --out +[options] + +DESCRIPTION +============================== + +The ldms-ibnet-sampler-gen script produces files splitting the ports in +the netfile among the hosts listed in the samplers file. The input is +expected to be the network dump of an approximately three-level FAT +tree. + +OPTIONS +========================== + +:: + + -h, --help show the help message and exit + --out OUTPREFIX prefix of output files + --net IBNDPFILE file name of output collected from 'ibnetdiscover -p' + --opa OPAFILE file name of output collected from 'ldms-get-opa-network.sh' + --samplers HOSTFILE file listing samplers as named in the node name map, one per line. + --lidnames dump lid,name map to stdout and exit. + --annotate annotate out sampler assignment files with node-name-map strings. + and lists of unassigned switch ports. + --sharp port to exclude in topology calculations (for sharp) + --tier0 generate tier0-1 graphs + --tier1 generate tier1-2 graphs + --tier2 generate tier2-3 graphs + --circo-tiers CIRCO_PREFIX + dump circo tier plots to files starting with prefix + given CIRCO_PREFIX. + --sfdp-tiers SFDP_PREFIX + dump circo tier plots to files starting with prefix + given SFDP_PREFIX. + --info print key intermediate results + --debug print miscellaneous debug messages + --dump_sw print switches parsed + --dump_ca print HCA list parsed + --dump_links print links parsed + --dump_tiers print tiers discovered + --dump_parse print parser debugging + +EXAMPLES +=========================== + +:: + + cat <cluster-samplers + admin1 qib0 + admin2 qib0 + admin3 qib0 + EOF + + ibnetdiscover -p > cluster-p-netdiscover + + # check lids for being parsed right + ldms-ibnet-sampler-gen --lidnames --net cluster-p-netdiscover --samplers x --out x | + sort -k2 -t, > lid.host.txt + + ldms-ibnet-sampler-gen --net cluster-p-netdiscover --samplers clustre-samplers --sharp 37 --annotate --out sbx + +:: + + cat <cluster-samplers + admin1 hfi1_0 + admin2 hfi1_0 + admin3 hfi1_0 + EOF + + ldms-get-opa-network.sh > cluster-opa-map + + # check lids for being parsed right + ldms-ibnet-sampler-gen --lidnames --opa cluster-opa-map --samplers cluster-samplers --out x |sort -k2 -t, > lid.host.txt + + ldms-ibnet-sampler-gen --opa cluster-opa-map --samplers cluster-samplers --out swx + +NOTES +======================== + +A Mellanox SHARP port appears as an HCA in a switch. Connections on the +sharp port should be ignored for topology decomposition and sampler load +balancing purposes, as they usually make the topology flat if included. + +This program does not directly invoke infiniband or omnipath utilities. +It does invoke (and require) graphviz utilities if the tier, circo, or +sfdp options are applied. + +Applying the --node-name-map option to ibnetdiscover when generating the +net file makes the results more readable. + +SEE ALSO +=========================== + +Plugin_ibnet(7), circo, dot, ldms-get-opa-network, ibnetdiscover diff --git a/rtd/man2rst/ldms-netlink-notifier.rst b/rtd/man2rst/ldms-netlink-notifier.rst new file mode 100644 index 000000000..806adacf5 --- /dev/null +++ b/rtd/man2rst/ldms-netlink-notifier.rst @@ -0,0 +1,4 @@ +.. contents:: + :depth: 3 +.. 
+
diff --git a/rtd/man2rst/ldms-notify.rst b/rtd/man2rst/ldms-notify.rst
new file mode 100644
index 000000000..806adacf5
--- /dev/null
+++ b/rtd/man2rst/ldms-notify.rst
@@ -0,0 +1,4 @@
+.. contents::
+   :depth: 3
+..
+
diff --git a/rtd/man2rst/ldms-plugins.rst b/rtd/man2rst/ldms-plugins.rst
new file mode 100644
index 000000000..ef9d57b0e
--- /dev/null
+++ b/rtd/man2rst/ldms-plugins.rst
@@ -0,0 +1,63 @@
+============
+ldms-plugins
+============
+
+:Date: 28 Feb 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+=============
+
+ldms-plugins.sh - Display information on installed LDMSD plugins.
+
+SYNOPSIS
+=================
+
+ldms-plugins.sh [OPTION] [NAME]
+
+DESCRIPTION
+====================
+
+The ldms-plugins.sh command is used to query ldmsd for information on
+installed plugins.
+
+OPTIONS
+================
+
+If NAME is specified, only information for that plugin is displayed.
+The names all, store, and sampler are interpreted as described in
+ldmsd(8).
+
+-b
+   |
+   | Produce brief output, omitting usages.
+
+-n
+   |
+   | Produce names only.
+
+EXAMPLES
+=================
+
+ldms-plugins.sh -b
+
+ldms-plugins.sh vmstat
+
+ldms-plugins.sh -n sampler
+
+ldms-plugins.sh -n store
+
+NOTES
+==============
+
+Error messages from attempting to load plugins may appear if
+additionally needed libraries cannot be found. This is usually a bug in
+the setting of LD_LIBRARY_PATH.
+
+SEE ALSO
+=================
+
+ldmsd(8)
diff --git a/rtd/man2rst/ldms-reverse-conf.rst b/rtd/man2rst/ldms-reverse-conf.rst
new file mode 100644
index 000000000..5b6113d60
--- /dev/null
+++ b/rtd/man2rst/ldms-reverse-conf.rst
@@ -0,0 +1,38 @@
+=================
+ldms-reverse-conf
+=================
+
+:Date: 6 Jun 2022
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+==================
+
+ldms-reverse-conf.sh - generate a tear-down configuration file
+
+SYNOPSIS
+======================
+
+ldms-reverse-conf.sh
+
+DESCRIPTION
+=========================
+
+The ldms-reverse-conf.sh command parses an LDMS control script in the
+key/value language which sets up samplers, stores, producers, updaters,
+and subscriptions, and attempts to generate the matching tear-down
+script to stdout. Invoking ldmsd_controller or ldmsctl with the
+tear-down script should yield an almost idle daemon (listeners are
+still active).
+
+Typically, a daemon is configured and left to run. The intent of this
+utility is to make it easy to deconfigure a running daemon in the proper
+command order given the original scripted configuration.
+
+SEE ALSO
+======================
+
+ldmsctl(8), ldmsd_controller(8)
diff --git a/rtd/man2rst/ldms-run-static-tests.rst b/rtd/man2rst/ldms-run-static-tests.rst
new file mode 100644
index 000000000..774b543f7
--- /dev/null
+++ b/rtd/man2rst/ldms-run-static-tests.rst
@@ -0,0 +1,133 @@
+=====================
+ldms-run-static-tests
+=====================
+
+:Date: 21 Aug 2020
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+======================
+
+run-static-tests.test - Execute the program
+
+SYNOPSIS
+==========================
+
+run-static-tests.test -l
+
+run-static-tests.test -h
+
+run-static-tests.test [test_dir]
+
+DESCRIPTION
+=============================
+
+The run-static-tests.test script initiates the ldms-static-test.sh test
+on each enabled plugin. The stdout/stderr of each ldms-static-test.sh
+invocation will be redirected to a log file and its output tree. This
+log file will then be tarred and compressed when ldms-static-test.sh has
+finished.
+The return code of ldms-static-test.sh will then be checked by this
+driver script. If the return value is 0, the script will print
+"PASS $testname", and if the return value is 1, the script will print
+"FAIL $testname", where testname is each invocation of
+ldms-static-test.sh on the enabled plugins. Please see
+ldms-static-test.man for more information.
+
+OPTIONS
+=========================
+
+-l
+   |
+   | List the enabled plugins.
+
+-h
+   |
+   | List help message.
+
+LANGUAGE
+==========================
+
+The following macro language is provided as extensions on bash. Other
+bash use is also possible, but not recommended.
+
+ENVIRONMENT
+=============================
+
+Uses the currently set environment to run. The environment may need to
+be configured before executing this test script.
+
+input
+   |
+   | The name of the input file as specified when ldms-static-test.sh is
+     invoked for each enabled plugin.
+
+testname
+   |
+   | The base name (directories stripped) of the input file name. This
+     variable makes it possible to use similar input across many test
+     files when the name of the input file is the same as the plugin
+     tested.
+
+strict
+   |
+   | If the variable "strict" is used for KILL_LDMSD
+     (ldms-static-test(8)), the script will output "FAIL $testname" and
+     return an XFAIL to indicate an expected failure only if the test
+     case plugin is listed in static-test-list. The stderr of
+     ldms-static-test.sh will be redirected to the log file
+     test.$testname.log under the default output location of test_dir.
+
+file
+   |
+   | The file "static-test-list" located in ldms/scripts/ defines a list
+     of samplers that are expected to fail. If there is a failed test
+     and the sampler is listed in this file, then run-static-test.sh
+     will output an "XFAIL" and continue. Developers can modify this
+     list to meet their needs.
+
+bypass <1,0>
+   |
+   | This variable assignment is used to determine an expected failure
+     (1) or normal failure (0) of a sampler plugin. This variable is set
+     to (1) if the sampler is listed in $file and set to (0) otherwise.
+     Used to test the successful and expected failures of each sampler
+     plugin.
+
+NOTES
+=======================
+
+Any other variable may be defined and exported for use in the
+attribute/value expansion of values in plugin configuration.
+
+FILES
+=======================
+
+*$input_file.$i*
+   |
+   | For each value of i specified to start an ldmsd, a configuration
+     file named $input_file.$i must also exist. This configuration file
+     is used when starting the daemon.
+
+*test_dir*
+   |
+   | Test output directory of ldms-static-test.sh. The default output
+     location is \`pwd\`/ldmstest/$testname.
+
+GENERATED FILES
+=================================
+
+*$test_dir/test.$testname.log*
+   |
+   | The log file containing stderr and stdout of ldms-static-test.sh.
+
+*$test_dir/test.$testname.tgz*
+   |
+   | Location of the compressed log file.
+
+SEE ALSO
+==========================
+
+ldmsd-static-test.man
diff --git a/rtd/man2rst/ldms-sensors-config.rst b/rtd/man2rst/ldms-sensors-config.rst
new file mode 100644
index 000000000..f6ac04a51
--- /dev/null
+++ b/rtd/man2rst/ldms-sensors-config.rst
@@ -0,0 +1,94 @@
+===================
+ldms-sensors-config
+===================
+
+:Date: 15 Dec 2018
+
+.. contents::
+   :depth: 3
+..
+ +NAME +==================== + +ldms-sensors-config - generate LDMS filesingle plugin configuration +prototype + +SYNOPSIS +======================== + +ldms-sensors-config [--sensors=/path/to/sensors] +[--lscpu=/path/to/lscpu] [--test-lscpu=lscpu-log-file] +[--test-sensors=sensors-log-file] + +Run 'sensors' under strace to discover where some sensor files live on +the current system and generate a draft metric configuration file for +the LDMS filesingle sampler. + +DESCRIPTION +=========================== + +The ldms-sensors-config program generates a draft conf file for the +filesingle sampler. The user should tailor the selection, naming, data +storage type, and default values per Plugin_filesingle(7). + +OPTIONS +======================= + +--sensors= + | + | specify an alternate location of the sensors program. The default + is /usr/bin/sensors, and the PATH variable is not used to search + for alternatives. + +--nodash + | + | Replace all - characters in metric names with \_ characters. + +--lscpu= + | + | specify an alternate location of the lscpu program. The default is + /usr/bin/lscpu and the PATH variable is not used to search for + alternatives. + +--test-lscpu= + | + | Specify the location of a pre-collected strace log of lscpu to use + instead of lscpu run on the local system. Used for testing or + remote configuration. + +--test-sensors= + | + | Specify the location of a pre-collected strace log of sensors to + use instead of sensors run on the local system. Used for testing or + remote configuration. + +EXAMPLES +======================== + +The log file for sensors can be collected with: + +script -c 'strace -e trace=open,openat,read sensors -u' sensors.log + +The log file for lscpu can be collected with: + +script -c 'strace -e trace=open,openat lscpu' /tmp/lscpu.tmp \| grep +'^open.*cpuinfo_max_freq' > lscpu.log; rm /tmp/lscpu.tmp + +NOTES +===================== + +When using test input file(s), the live system data will be used if the +corresponding test file is not specified. + +Systems (kernels) lacking cpu frequency reporting produce no output from +lscpu. + +The use of --nodash is recommended for compatibility with downstream +analysis tools. White space appearing in metric names is unconditionally +transformed to \_. + +SEE ALSO +======================== + +sensors(1), lscpu(1), Plugin_filesingle(7), ldmsd. diff --git a/rtd/man2rst/ldms-static-test.rst b/rtd/man2rst/ldms-static-test.rst new file mode 100644 index 000000000..486a91f55 --- /dev/null +++ b/rtd/man2rst/ldms-static-test.rst @@ -0,0 +1,348 @@ +================ +ldms-static-test +================ + +:Date: 4 Oct 2020 + +.. contents:: + :depth: 3 +.. + +NAME +================= + +ldms-static-test.sh - Run a canned test scenario + +SYNOPSIS +===================== + +ldms-static-test.sh -l + +ldms-static-test.sh -h + +ldms-static-test.sh [test_dir] + +DESCRIPTION +======================== + +The ldms-static-test.sh command starts a canned test defined in the +input_file using a standard environment. The input file is written in a +simple bash macro language described in LANGUAGE below. Supporting +configuration file fragments will be used, as determined from the input +file. See FILES below. This tests ldmsd run with static configuration +files (as would normally happen as a system service) and shut down with +a signal. + +OPTIONS +==================== + +-l + | + | List the canned tests available. + +-h + | + | List help message. 
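+
+As an illustrative sketch (the test name here is hypothetical; use -l
+to see the canned tests actually installed):
+
+::
+
+   ldms-static-test.sh -l                # list the canned tests
+   ldms-static-test.sh vmstat /tmp/test  # run the vmstat input with output under /tmp/test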
+
+LANGUAGE
+=====================
+
+The following macro language is provided as extensions on bash. Other
+bash use is also possible, but not recommended.
+
+DAEMONS <daemon-numbers>
+   |
+   | Give all the numbers that will be used in the LDMSD invocations
+     anywhere in the test. This causes port variables to be defined so
+     that any daemon can connect to any other by referencing $portN as
+     explained in ENVIRONMENT below. If omitted, the ordering and
+     aggregation relationships of LDMSD calls may be infeasible.
+
+LDMSD [conf-options] <daemon-numbers>
+   |
+   | This starts a number of daemons described by daemon-numbers. The
+     numbers can be a given list, such as "1 2 3". The environment of
+     each daemon (and its config script) will contain the variable i set
+     to one of the given values, as described in ENVIRONMENT. For each
+     value of i, a configuration fragment $input_file.$i must also
+     exist. Use seq(1) to generate large number sequences.
+
+See CONFIGURATION OPTIONS below for the explanation of [conf-options].
+
+MESSAGE [arguments]
+   |
+   | The expanded arguments are logged.
+
+LDMS_LS <k> [ldms_ls_args]
+   |
+   | This invokes ldms_ls on the k-th ldmsd.
+
+KILL_LDMSD [strict] <daemon-numbers>
+   |
+   | Kills the listed daemons. If the optional keyword strict is supplied,
+     then missing daemons will cause the bypass variable to be set and
+     the script to include an error code when it exits.
+
+SLEEP <n>
+   |
+   | Sleeps n seconds and logs a message about it.
+
+JOBDATA jobfile [daemon-numbers]
+   |
+   | Creates jobfile with data for the jobid plugin to parse. If daemon
+     numbers are specified, creates a jobfile.$k for each value of k
+     listed in daemon-numbers. Each file will have unique numeric
+     values, sequentially increasing. This does not provide data in the
+     slurm-plugin sampler binary format.
+
+vgon
+   |
+   | Turns on use of valgrind for any ldmsd or ldms_ls subsequently
+     started.
+
+vgoff
+   |
+   | Turns off use of valgrind for any ldmsd or ldms_ls subsequently
+     started.
+
+file_created <filename>
+   |
+   | Verifies the existence and readability of filename.
+
+rollover_created <filename>
+   |
+   | Verifies the existence and readability of rollover files matching
+     pattern filename.[0-9]\*.
+
+bypass=<0,1>
+   |
+   | This variable assignment disables (1) or enables (0) all the macros
+     described above. Typical use is to skip one or more operations
+     while debugging a test script.
+
+KILL_LDMSD_STRICT=<0,1>
+   |
+   | This variable allows the script author to control whether
+     KILL_LDMSD is strict by default or not. If enabled (1), the script
+     will exit with error code 1 following a failed KILL_LDMSD. If
+     disabled (0), the script will suppress error codes from killing
+     missing daemons. Typically used for debugging missing pid files and
+     unexpectedly dead daemons. Supplying the keyword ‘strict’ before
+     the numeric arguments to KILL_LDMSD also sets KILL_LDMSD_STRICT=1.
+
+portbase=<K>
+   |
+   | The listening port numbers assigned to the daemons will be K+i,
+     where i is as described for macro LDMSD. It is a good idea (to
+     support automated testing) if portbase is set in <input_file> so
+     that each test uses a unique range of ports. This enables tests to
+     proceed in parallel.
+
+CONFIGURATION OPTIONS
+==================================
+
+The LDMSD command supports the following options. Note that all -P
+options are processed before all -p options in a single LDMSD call.
+
+-p <prolog-file>
+   |
+   | The prolog file is included before the usually expected input file.
+     The location of prolog files is handled as are the test input
+     files. See FILES below. Multiple -p options are allowed.
+
+-P <looped-prolog-file,daemon-csl>
+   |
+   | The looped-prolog-file is included before the usually expected
+     input file, once for each value in daemon-csl. Daemon-csl is a
+     comma-separated list of daemon numbers, e.g. a complete argument
+     example is "-P producer,3,4,5". The variable ${j} is substituted
+     with a daemon number from the list for each inclusion.
+
+The location of looped prolog files is handled as are the test input
+files. See FILES below. Multiple -P options are allowed.
+
+-c
+   |
+   | Where multiple daemon numbers are specified, the input generated
+     for the first number is cloned to all subsequent daemons. See
+     FILES. This allows a single file to serve many similar daemon
+     instances in scale testing.
+
+-s <wait_microseconds>
+   |
+   | After an ldmsd is started, wait wait_microseconds before checking
+     for the daemon PID file to exist. The appropriate wait time is
+     variable depending on the complexity of the configuration. If not
+     specified, the default is a 2 second wait time.
+
+ENVIRONMENT
+========================
+
+The following variables can be set in the script to affect the launch of
+ldmsd:
+
+LDMSD_EXTRA
+   |
+   | If set, these arguments are appended to the ldmsd launch.
+     Typical use is to specify "-m MEMSIZE" or other unusual arguments.
+     The following flags are always determined for the user and must not
+     be present in LDMSD_EXTRA: -x -c -l -v -r.
+
+VG
+   |
+   | If valgrind is used (see vgon, vgoff), then $VG is the name of the
+     debugging tool wrapped around the launch of ldmsd. The default is
+     'valgrind'.
+
+VGARGS
+   |
+   | If valgrind is used (see vgon, vgoff), then $VGARGS is appended to
+     the default valgrind arguments.
+
+VGTAG
+   |
+   | If valgrind is used (see vgon, vgoff), then $VGTAG is inserted in
+     the valgrind output file name when defined. A good practice is for
+     VGTAG to start with ".".
+
+KILL_NO_TEARDOWN
+   |
+   | Set KILL_NO_TEARDOWN=1 to suppress attempting configuration cleanup
+     during KILL_LDMSD. If set, the ldmsd internal cleanup() function will
+     attempt partial cleanup, but possibly leave active data structures
+     to be reported by valgrind.
+
+The following variables are visible to the input file and the
+configuration file:
+
+i
+   |
+   | Daemon configuration files and commands can refer to ${i} where i
+     is the integer daemon number supplied via LDMSD for the specific
+     daemon using the script.
+
+portN
+   |
+   | Daemon configuration files and commands can refer to ${portN} where
+     N is any value of 'i' described above. portN is the data port
+     number of the N-th daemon.
+
+input
+   |
+   | The name of the input file as specified when invoking this command.
+
+testname
+   |
+   | The base name (directories stripped) of the input file name. This
+     variable makes it possible to use similar input across many test
+     files when the name of the input file is the same as the plugin
+     tested.
+
+TESTDIR
+   |
+   | Root directory of the testing setup.
+
+STOREDIR
+   |
+   | A directory that should be used for store output configuration.
+
+LOGDIR
+   |
+   | A directory that should be used for log outputs.
+
+LDMS_AUTH_FILE
+   |
+   | Secret file used for daemon communication.
+
+XPRT
+   |
+   | The transport used. It may be specified in the environment to
+     override the default 'sock', and it is exported to the executed
+     daemon environment.
+
+HOST
+   |
+   | The host name used for a specific interface. It may be specified in
+     the environment to override the default 'localhost', and it is
+     exported to the executed daemon environment.
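+
+As a sketch, the transport and host can be adjusted from the calling
+environment without editing the test input (the values and test name
+below are illustrative; VGARGS only takes effect when valgrind is
+enabled with vgon):
+
+::
+
+   XPRT=rdma HOST=node1 VGARGS="--track-origins=yes" \
+   ldms-static-test.sh meminfo /tmp/ldmstest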
+
+NOTES
+==================
+
+Any other variable may be defined and exported for use in the
+attribute/value expansion of values in plugin configuration.
+
+EXIT CODES
+=======================
+
+Expected exit codes are 0 and 1. If the exit code is 0, then the
+program will proceed. If the exit code is 1, then the script will stop
+and notify the user.
+
+FILES
+==================
+
+*$input_file.$i*
+   |
+   | For each value of i specified to start an ldmsd, a configuration
+     file named $input_file.$i must also exist. This configuration file
+     is used when starting the daemon.
+
+Exception: For any single "LDMSD -c <daemon-numbers>", only
+$input_file.$i for the first listed number is needed; the first file
+will be used for all subsequent numbers and any matching files except
+the first are ignored. Where prologs are also specified, the regular
+prolog inclusion process is applied to the first file.
+
+*[test_dir]*
+   |
+   | If test_dir is supplied, it is used as the test output directory.
+     The default output location is \`pwd`/ldmstest/$testname.
+
+*$docdir/examples/static-test/$input_file*
+   |
+   | If input_file is not found in the current directory, it is checked
+     for in $docdir/examples/static-test/$input_file.
+
+GENERATED FILES
+============================
+
+*$test_dir/logs/vg.$k$VGTAG.%p*
+   | *$test_dir/logs/vgls.$k$VGTAG.%p*
+   | The valgrind log for the kth daemon with PID %p or the valgrind log
+     for ldms_ls of the kth daemon with PID %p, if valgrind is active.
+
+*$test_dir/logs/$k.txt*
+   |
+   | The log for the kth daemon.
+
+*$test_dir/logs/teardown.$k.txt*
+   |
+   | The teardown log for the kth daemon.
+
+*$test_dir/run/conf.$k*
+   |
+   | The input for the kth daemon.
+
+*$test_dir/run/revconf.$k*
+   |
+   | The input for the kth daemon teardown.
+
+*$test_dir/run/env.$k*
+   |
+   | The environment present for the kth daemon.
+
+*$test_dir/run/start.$k*
+   |
+   | The start command of the kth daemon.
+
+*$test_dir/store/*
+   |
+   | The root of store output locations.
+
+*$test_dir/run/ldmsd/secret*
+   |
+   | The secret file for authentication.
+
+SEE ALSO
+=====================
+
+seq(1)
diff --git a/rtd/man2rst/ldms_auth_munge.rst b/rtd/man2rst/ldms_auth_munge.rst
new file mode 100644
index 000000000..2c6cd9714
--- /dev/null
+++ b/rtd/man2rst/ldms_auth_munge.rst
@@ -0,0 +1,33 @@
+===============
+ldms_auth_munge
+===============
+
+:Date: 10 May 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+================
+
+ldms_auth_munge - LDMS authentication using munge
+
+SYNOPSIS
+====================
+
+*ldms_app* **-a munge [-A socket=**\ *PATH*\ **]**
+
+DESCRIPTION
+=======================
+
+**ldms_auth_munge** relies on the **munge** service (see **munge**\ (7)) to
+authenticate users. The munge daemon (**munged**) must be up and running.
+The optional **socket** option can be used to specify the path to the
+munged unix domain socket in the case that munged is not using the
+default path.
+
+SEE ALSO
+====================
+
+**munge**\ (7), **munged**\ (8)
diff --git a/rtd/man2rst/ldms_auth_naive.rst b/rtd/man2rst/ldms_auth_naive.rst
new file mode 100644
index 000000000..2fc6288f1
--- /dev/null
+++ b/rtd/man2rst/ldms_auth_naive.rst
@@ -0,0 +1,29 @@
+===============
+ldms_auth_naive
+===============
+
+:Date: 28 Feb 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+================
+
+ldms_auth_naive - naive LDMS authentication implementation FOR TESTING
+
+SYNOPSIS
+====================
+
+*ldms_app* **-a naive** **[-A uid=**\ *UID*\ **]** **[-A
+gid=**\ *GID*\ **]**
+
+DESCRIPTION
+=======================
+
+The **ldms_auth_naive** LDMS authentication plugin naively believes the
+peer's credential declaration. The purpose of this plugin is purely for
+testing the permission control of various objects in **ldmsd**. The
+**uid** and **gid** options are used to specify the user credential. If
+**uid** and/or **gid** are not specified, the default is -1.
diff --git a/rtd/man2rst/ldms_auth_none.rst b/rtd/man2rst/ldms_auth_none.rst
new file mode 100644
index 000000000..f1da1e2ee
--- /dev/null
+++ b/rtd/man2rst/ldms_auth_none.rst
@@ -0,0 +1,29 @@
+==============
+ldms_auth_none
+==============
+
+:Date: 28 Feb 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+===============
+
+ldms_auth_none - LDMS authentication disabled
+
+SYNOPSIS
+===================
+
+*ldms_app* **-a none [Default]**
+
+DESCRIPTION
+======================
+
+**ldms_auth_none** enables running without authentication of query
+sources. Since "-a none" is the default, it need not be specified (e.g.,
+running "ldmsd -x sock:1024 -a none" is equivalent to simply running
+"ldmsd -x sock:1024"). With this authentication type there are NO
+checks on identities associated with data and/or meta-data information
+accesses.
diff --git a/rtd/man2rst/ldms_auth_ovis.rst b/rtd/man2rst/ldms_auth_ovis.rst
new file mode 100644
index 000000000..4d622eb2c
--- /dev/null
+++ b/rtd/man2rst/ldms_auth_ovis.rst
@@ -0,0 +1,67 @@
+==============
+ldms_auth_ovis
+==============
+
+:Date: 28 Feb 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+===============
+
+ldms_auth_ovis - LDMS authentication using ovis_auth library
+
+SYNOPSIS
+===================
+
+*ldms_app* **-a ovis [-A conf=**\ *PATH*\ **]**
+
+DESCRIPTION
+======================
+
+**ldms_auth_ovis** uses a shared secret to authenticate the connection.
+The secret is a text file containing the line:
+
+   secretword=X
+
+where X is a string at least 8 characters long. Lines starting with # in
+the file are ignored.
+
+Four locations are checked in order for the secret:
+
+1) the full file path given on the command line via "-A conf=authfile",
+
+2) the full file path given in environment variable LDMS_AUTH_FILE,
+
+3) $HOME/.ldmsauth.conf, and
+
+4) $SYSCONFDIR/ldmsauth.conf (e.g. /etc/ldmsauth.conf).
+
+where $HOME is taken from */etc/passwd* and $SYSCONFDIR is determined
+at ldms compile time. If one of these is not set, the search continues
+with the next location. If a file exists but cannot be read, the search
+ends and authentication fails.
+
+The secret file permissions must be set to 600 or more restrictive.
+
+ENVIRONMENT
+======================
+
+"LDMS_AUTH_FILE" is a full file path for a secretword file. It is not
+necessary if the file is in one of the other checked locations.
+
+NOTES
+================
+
+Authentication can be disabled at ldms build time by configuring your
+ldms build with --disable-ovis_auth, in which case no secretword file is
+required or checked.
+
+BUGS
+===============
+
+Networked file system users should verify the privacy of their secret
+files, as various access control list schemes might be more permissive
+than the standard permissions bits.
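+
+EXAMPLES
+===================
+
+A minimal setup sketch (the secret word, port, and paths are
+illustrative):
+
+::
+
+   echo "secretword=sequoia-sunrise" > $HOME/.ldmsauth.conf
+   chmod 600 $HOME/.ldmsauth.conf
+   ldmsd -x sock:10444 -a ovis
+   ldms_ls -h localhost -x sock -p 10444 -a ovis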
diff --git a/rtd/man2rst/ldms_authentication.rst b/rtd/man2rst/ldms_authentication.rst new file mode 100644 index 000000000..f8b73736b --- /dev/null +++ b/rtd/man2rst/ldms_authentication.rst @@ -0,0 +1,71 @@ +=================== +ldms_authentication +=================== + +:Date: 28 Feb 2018 + +.. contents:: + :depth: 3 +.. + +NAME +==================== + +ldms_authentication - Authentication in LDMS transports + +DESCRIPTION +=========================== + +LDMS applications use authentication plugins in LDMS transports to +authenticate the peers. In other words, not only **ldmsd** authenticates +the client connections, the clients (**ldms_ls**, **ldmsctl**, +**ldmsd_controller**, and other **ldmsd**) authenticate the **ldmsd** +too. + +**ldmsd**, **ldms_ls**, **ldmsd_controller**, and **ldmsctl** use the +following options for authentication purpose: + +**-a** *AUTH_PLUGIN* + Specifying the name of the authentication plugin. The default is + "none" (no authentication). + +**-A** *NAME*\ **=**\ *VALUE* + Specifying options to the authentication plugin. This option can be + given multiple times. + +**auth** configuration object has been introduced in **ldmsd** version +4.3.4. It describes an authentication domain in the configuration file +with **auth_add** command. **listen** and **prdcr_add** config commands +can refer to **auth** object created by **auth_add** command to specify +the authentication domain a listening port or a producer connection +belong to. If no **auth** option is specified, **listen** and +**prdcr_add** commands fall back to use the authentication method +specified by **-a, -A** CLI options (which is default to **none**). + +Please consult the manual of the plugin for more details. + +LIST OF LDMS_AUTH PLUGINS +========================================= + +**none** + Authentication will NOT be used (allow all connections) (see + **ldms_auth_none**\ (7)). + +**ovis** + The shared secret authentication using ovis_ldms (see + **ldms_auth_ovis**\ (7)). + +**naive** + The naive authentication for testing. (see **ldms_auth_naive**\ (7)). + +**munge** + User credential authentication using Munge. (see + **ldms_auth_munge**\ (7)). + +SEE ALSO +======================== + +**ldms_auth_none**\ (7), **ldms_auth_ovis**\ (7), +**ldms_auth_naive**\ (7), **ldms_auth_munge**\ (7), **ldmsctl**\ (8), +**ldmsd**\ (8), **ldms_ls**\ (8), **ldmsd_controller**\ (8), +**ldms_quickstart**\ (7), **ldms_build_install**\ (7) diff --git a/rtd/man2rst/ldms_build_install.rst b/rtd/man2rst/ldms_build_install.rst new file mode 100644 index 000000000..e212f2ca7 --- /dev/null +++ b/rtd/man2rst/ldms_build_install.rst @@ -0,0 +1,315 @@ +================== +ldms_build_install +================== + +:Date: 22 Dec 2016 + +.. contents:: + :depth: 3 +.. + +NAME +=================== + +ldms_build_install - Instructions for building and installing ldms + +INTRODUCTION +=========================== + +OVIS is a modular system for HPC data collection, transport, storage, +analysis, visualization, and response. The Lightweight Distributed +Metric Service (LDMS) is the OVIS data collection and transport system. +LDMS provides capabilities for lightweight run-time collection of +high-fidelity data. Data can be accessed on-node or transported off +node. Additionally, LDMS can store data in a variety of storage options. + +This entire source encompasses a number of the modular components of +OVIS. The top level subdirectory ldms contains the ldms source. 
This document covers building only the ldms component from the top level
+directory.
+
+DESCRIPTION
+==========================
+
+This document covers building only the ldms component from the top level
+directory.
+
+ldms is built via the following steps:
+
+   ::
+
+      build prerequisites
+      cd top_level_directory
+      mkdir build
+      cd build
+      ../configure [configure options]
+      make
+      make install
+
+
+   This document describes the steps involved in building the prerequisites and in doing the configure.
+   A description of the arguments for configure can be found by invoking
+
+   ./configure --help
+
+   at BOTH the top level and in the ldms subdirectory.
+
+PREREQUISITES
+============================
+
+- libevent-2.0 is a requirement. It can be built from source obtained
+  from libevent.org or it can be installed from rpm or similar on your
+  system via a utility like yum. If you do the latter, then you need to
+  install both the libevent and libevent-devel packages.
+
+- If you intend to use the aries_mmr sampler, then you will need to
+  install Cray's gpcd library. More information on this can be found in
+  the Plugin.aries_mmr man page. (This is the recommended method for
+  getting HSN metrics for the Aries).
+
+- If you intend to use the hsn metrics in the cray_aries_r_sampler or
+  the cray_gemini_r_sampler, you will need to configure gpcdr. More
+  information on this can be found in the Plugin.cray_sampler_variants
+  man page. (This is the recommended method for the Gemini).
+
+- Use the gnu compiler for building ldms. (This may necessitate a
+  module change on some platforms).
+
+The remaining instructions will include paths to where the headers and
+libraries of these prerequisites are installed.
+
+CONFIGURATION OPTIONS
+====================================
+
+There are configuration options at the top level, in ldms, and in the
+ovis_ldms support directories. This section is thus split into these
+three sections; however, the configuration arguments are all combined as
+arguments to the top level configure. The list of configuration options
+given here is not comprehensive; rather, it refers to the most common
+arguments.
+
+TOP LEVEL OPTIONS
+-----------------
+
+A number of top level "enable|disable-feature" options exist. The
+defaults are chosen for a generic linux build to work by default.
+
+**--enable|disable-rpath**
+   |
+   | Disable this. Do not hardcode runtime library paths.
+
+**--enable|disable-ldms**
+   |
+   | Enable this. Default enabled.
+
+**--enable|disable-sos**
+   |
+   | Used to enable or disable sos. Enable only if you are going to use
+     the store_sos plugin. Default disabled.
+
+**--enable|disable-ocm|baler|me|komondor**
+   |
+   | Disable all of these. All default disabled.
+
+OVIS_LIB LEVEL OPTIONS
+----------------------
+
+A number of top level "enable|disable-feature" options exist. The
+defaults are chosen for a generic linux build to work by default.
+
+**--enable|disable-auth**
+   |
+   | Enables or disables authentication. Default enabled.
+
+**--enable|disable-sock**
+   |
+   | Enables or disables the sock transport. Default enabled.
+
+**--enable|disable-rdma**
+   |
+   | Enables or disables the rdma transport. Default disabled.
+
+**--enable|disable-ugni**
+   |
+   | Enables or disables the ugni transport. This is Cray-specific, for
+     RDMA over Gemini or Aries. Default disabled.
+
+LDMS LEVEL OPTIONS
+------------------
+
+A number of "enable|disable-feature" options exist. In addition a number
+of "with" options exist to specify paths to files/libraries/etc.
The defaults are chosen for a generic linux build to work by default.
+
+General Options
+---------------
+
+**--enable|disable-ovis_auth**
+   |
+   | Enable or disable authentication. Default enabled.
+
+**--enable|disable-python**
+   |
+   | Enable the ldms python api and the configuration tools that depend
+     on the API. Default: enabled if python and cython are detected.
+
+**--enable|disable-readline**
+   |
+   | Enable or disable the readline module. It is necessary to enable it if
+     you want to use the configuration tools interactively; if you are
+     going to use a script interface to the configuration tools (the usual
+     method), then this can be disabled.
+
+**--with-libevent**\ *[=path]*
+   |
+   | Specify libevent path [default=/usr]
+
+Generic Sampler Options
+-----------------------
+
+**--enable|disable-meminfo|procinterrupts|procnfs|procnetdev|vmstat**
+   |
+   | Enable or disable generic linux samplers for data in /proc. Default
+     enabled.
+
+**--enable|disable-lustre**
+   |
+   | Enable or disable the lustre module. Default enabled.
+
+Cray-specific Sampler Options
+-----------------------------
+
+**--enable|disable-kgnilnd**
+   |
+   | Enable the kgnilnd sampler. Default disabled.
+
+**--enable|disable-cray_system_sampler**
+   |
+   | Enable or disable the cray_system_sampler module. Default disabled.
+     If you enable this, then consider the following options:
+
+   **--enable-gemini-gpcdr**
+      |
+      | Enable the gemini-gpcdr version of the cray_system_sampler.
+        Default disabled. Both the gemini and aries versions can be
+        built simultaneously.
+
+   **--enable-aries-gpcdr**
+      |
+      | Enable the aries-gpcdr version of the cray_system_sampler.
+        Default disabled. For the Aries, we recommend getting the HSN
+        metrics via aries-mmr instead of the aries-gpcdr sampler. You
+        can still build the aries-gpcdr sampler, but run it without the
+        HSN part of the metric collection. Both the gemini and aries
+        versions can be built simultaneously.
+
+   **--enable-cray-nvidia**\ OR\ **--with-cray-nvidia-inc**\ [=path]
+      |
+      | For gemini systems with gpus, enable the cray-nvidia metric
+        sampling in the cray_gemini_r_sampler. You need not specify
+        --enable-cray-nvidia if you are instead specifying the path to
+        the include file via --with-cray-nvidia-inc.
+
+   **--enable|disable-lustre**
+      |
+      | Enable or disable the lustre module for use in the
+        cray_system_sampler. Default enabled.
+
+   **--with-rca**\ *[=path]*
+      |
+      | Specify the path to the rca includes via --with-rca
+        [default=/usr].
+
+   **--with-krca**\ *[=path]*
+      |
+      | Specify the path to the krca includes via --with-krca
+        [default=/usr].
+
+   **--with-cray-hss-devel**\ *[=path]*
+      |
+      | Specify the path to the hss-devel includes via
+        --with-cray-hss-devel [default=/usr].
+
+**--enable|disable-aries-mmr**
+   |
+   | Enable or disable the aries-mmr module. Default disabled. If you
+     enable this, then consider the following options:
+
+   **--with-aries-libgpcd**\ *LIBDIR,INCDIR*
+      |
+      | Locations of gpcd library and headers for the aries_mmr sampler.
+        E.g. --with-aries-libgpcd=/special/libs,/private/headerdir
+
+Store Options
+-------------
+
+**--enable|disable-csv**
+   |
+   | Enable the csv stores (store_csv and store_function_csv). Default
+     enabled.
+
+**--enable|disable-sos**
+   |
+   | Enable or disable the sos stores. Enable this only if you are going
+     to use the store_sos plugin. Default disabled.
+
+INSTALL DIRECTORY SETUP
+======================================
+
+The build will go into prefix (/XXX/Build/build_ovis in the examples
+section below).
+
+- bin - python-based utility commands, such as ldmsd_controller. Also
+  test scripts.
+
+- include - subdirectories with header files
+
+- lib - libraries. At the top level are libraries for the ldms
+  infrastructure (e.g., libldms.so, libzap.so, etc). There is a
+  subdirectory, which will be called either ovis-ldms or ovis-lib, which
+  contains all the libraries for the plugins (samplers, such as
+  libmeminfo.so; stores, such as libstore_csv.so; and transports, such
+  as libzap_sock.so).
+
+- lib64 - python library
+
+- sbin - C-based utility commands, such as ldms_ls and ldmsd.
+
+- share - documentation, including man pages.
+
+NOTES
+====================
+
+This document does not cover putting the install into a cray-system
+image. Nor does it cover setting up init scripts to run ldms as a system
+service (for any type of linux platform).
+
+EXAMPLES
+=======================
+
+configure.sh script for a Cray XC install with the cray-specific
+samplers only:
+
+::
+
+   PREFIX=/XXX/Build/build_ovis
+   LIBDIR=${PREFIX}/lib
+
+   # add --enable-FEATURE here
+   ENABLE="--enable-ugni --enable-ldms-python --enable-kgnilnd --enable-lustre --enable-aries_mmr --enable-cray_system_sampler --enable-aries-gpcdr"
+
+   # add --disable-FEATURE here
+   DISABLE="--disable-rpath --disable-readline --disable-mmap --disable-baler --disable-sos"
+
+   # libevent2 prefix
+   LIBEVENT_PREFIX=/XXX/Build/libevent-2.0_build
+
+   WITH="--with-rca=/opt/cray/rca/default/ --with-krca=/opt/cray/krca/default --with-cray-hss-devel=/opt/cray-hss-devel/default/ --with-pkglibdir=ovis-ldms --with-aries-libgpcd=/XXX/Build/gpcd/lib/,/XXX/Build/gpcd/include/"
+
+
+   if [ -n "$LIBEVENT_PREFIX" ]; then
+       WITH="$WITH --with-libevent=$LIBEVENT_PREFIX"
+   fi
+
+   CFLAGS='-g -O0'
+
+SEE ALSO
+=======================
+
+ldms_authentication(7), ldms_quickstart(7), ldmsd(8),
+Plugin_cray_sampler_variants(7), Plugin_aries_mmr(7),
+Plugin_store_csv(7), Plugin_store_function_csv(7)
diff --git a/rtd/man2rst/ldms_csv_time_drops.rst b/rtd/man2rst/ldms_csv_time_drops.rst
new file mode 100644
index 000000000..47fe683e5
--- /dev/null
+++ b/rtd/man2rst/ldms_csv_time_drops.rst
@@ -0,0 +1,169 @@
+===================
+ldms_csv_time_drops
+===================
+
+:Date: 07 Jul 2022
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+====================
+
+ldms_csv_time_drops - LDMS CSV data quality check
+
+SYNOPSIS
+========================
+
+| ldms_csv_time_drops <file> [file ...]
+| ldms_csv_time_drops_range <file> [file ...]
+
+DESCRIPTION
+===========================
+
+LDMS CSV store file quality checker. For each input file, the interval,
+gaps and duplicates in the data are reported. When multiple files are
+given, they must be given in chronological order of the data contained.
+
+INTERVAL
+========================
+
+The interval is determined per file by examining the rounded time
+differences of sequential samples on each host and taking the most
+common value. Zero-length intervals are ignored. If more than one 'most
+common' interval is found across the hosts of a single file, the maximum
+interval seen in any file is reported as 'interval' and the minimum is
+reported as 'short_interval'.
+
+GAPS
+====================
+
+Gaps in the data are computed using the assumption of a uniform sampling
+interval across all hosts on the aggregate timestamp data from all the
+input files. A missing file or daemon down-time within the range of the
+data set will appear as a gap.
+
+DUPLICATES
+==========================
+
+An identical timestamp reappearing on the same host will be reported.
+The later time reported for a duplicate is the latest time seen across
+any host in the same file preceding the line location of the duplicate.
+
+INPUT
+=====================
+
+The LDMS csv store column format is assumed, in particular that the
+first column is the timestamp and any row beginning with # is a header
+to be ignored. Columns 1-4 are assumed to be
+
+::
+
+   Time,Time_usec,ProducerName,component_id
+
+OUTPUT FORMATS
+==============================
+
+Per-file summary:
+
+::
+
+   lines <count>
+   oldest <timestamp>
+   newest <timestamp>
+   interval <seconds>
+
+If multiple intervals are found in a file:
+
+::
+
+   short_interval <seconds>
+
+Per gap output for ldms_csv_time_drops_range:
+
+::
+
+   <host> is missing <count> steps between
+   <timestamp>
+   and <timestamp>
+
+Per gap output for ldms_csv_time_drops:
+
+::
+
+   <host> missing <count>
+
+Duplicates are reported as:
+
+::
+
+   <host> <timestamp> written again at <timestamp>
+
+BUGS
+====================
+
+Sub-second intervals are not supported.
+
+EXAMPLES
+========================
+
+For input test.csv containing:
+
+::
+
+   1.1,100000,host1,1
+   1.1,100000,host2,2
+   1.1,100000,host3,3
+   2.1,100000,host1,1
+   2.1,100000,host2,2
+   3.1,100000,host1,1
+   3.1,100000,host2,2
+   3.1,100000,host3,3
+   4.1,100000,host1,1
+   4.1,100000,host3,3
+   5.1,100000,host1,1
+   2.1,100000,host1,1
+   5.1,100000,host2,2
+   5.1,100000,host3,3
+
+   output of 'ldms_csv_time_drops test.csv'
+
+   lines 14
+   oldest 1.100000
+   newest 5.100000
+   interval 1 seconds
+   host1 2.000001 written again at 5.000001
+   host2 missing 4
+   host3 missing 2
+
+   output of 'ldms_csv_time_drops_range test.csv'
+
+   lines 14
+   oldest 1.100000
+   newest 5.100000
+   interval 1 seconds
+   host1 2.100000 written again at 5.100000
+   host2 is missing 1 steps between
+   3.100000
+   and 5.100000
+   host3 is missing 1 steps between
+   1.100000
+   and 3.100000
+
+
+   Find the interval of data in a file foo.csv
+
+   ldms_csv_time_drops foo.csv | grep ^interval
+
+SEE ALSO
+========================
+
+Plugin_store_csv(7)
diff --git a/rtd/man2rst/ldms_dstat_schema_name.rst b/rtd/man2rst/ldms_dstat_schema_name.rst
new file mode 100644
index 000000000..01ee3748c
--- /dev/null
+++ b/rtd/man2rst/ldms_dstat_schema_name.rst
@@ -0,0 +1,49 @@
+======================
+ldms_dstat_schema_name
+======================
+
+:Date: 17 Nov 2020
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+=======================
+
+ldms_dstat_schema_name - man page for the LDMS dstat plugin support
+utility
+
+SYNOPSIS
+===========================
+
+ldms_dstat_schema_name <plugin options>
+
+DESCRIPTION
+==============================
+
+The dstat plugin optionally generates a schema name including a short
+hash of certain configuration data. ldms_dstat_schema_name provides the
+user with the schema name the dstat plugin will generate for the given
+options.
+
+CONFIGURATION ATTRIBUTE SYNTAX
+=================================================
+
+See Plugin_dstat(7).
+
+EXAMPLES
+===========================
+
+::
+
+   ldms_dstat_schema_name auto-schema=1 fd=1
+
+   yields
+
+   dstat_10
+
+SEE ALSO
+===========================
+
+Plugin_dstat(7)
diff --git a/rtd/man2rst/ldms_ibnet_schema_name.rst b/rtd/man2rst/ldms_ibnet_schema_name.rst
new file mode 100644
index 000000000..b340837da
--- /dev/null
+++ b/rtd/man2rst/ldms_ibnet_schema_name.rst
@@ -0,0 +1,76 @@
+======================
+ldms_ibnet_schema_name
+======================
+
+:Date: 4 June 2020
+
+.. contents::
+   :depth: 3
+..
+ +NAME +======================= + +ldms_ibnet_schema_name - man page for the LDMS ibnet plugin support +utility + +SYNOPSIS +=========================== + +ldms_ibnet_schema_name + +DESCRIPTION +============================== + +The ibnet plugin generates a schema name including a hash of certain +configuration data. ldms_ibnet_schema_name provides the user with the +resulting name before running ldmsd so that store plugins can be +configured. + +CONFIGURATION ATTRIBUTE SYNTAX +================================================= + +See Plugin_ibnet(7). + +EXAMPLES +=========================== + +:: + + ldms_ibnet_schema_name node-name-map=/path/map timing=2 metric-conf=/path/metricsubsets schema=myibnet + + when file /path/metricsubsets contains + + extended + xmtsl + rcvsl + xmtdisc + rcverr + oprcvcounters + flowctlcounters + vloppackets + vlopdata + vlxmitflowctlerrors/t + vlxmitcounters/t + swportvlcong + rcvcc/t + slrcvfecn + slrcvbecn + xmitcc/t + vlxmittimecc + smplctl/t + + yields + + myibnet_7fffe_tn + +NOTES +======================== + +If the timing option is greater than 0, the name of the overall timing +set will be as for the result given with "\_timing" appended. + +SEE ALSO +=========================== + +Plugin_ibnet(7) diff --git a/rtd/man2rst/ldms_ls.rst b/rtd/man2rst/ldms_ls.rst new file mode 100644 index 000000000..39dfc5024 --- /dev/null +++ b/rtd/man2rst/ldms_ls.rst @@ -0,0 +1,224 @@ +======= +ldms_ls +======= + +:Date: 28 Feb 2018 + +.. contents:: + :depth: 3 +.. + +NAME +======== + +ldms_ls - Query an ldmsd for metric set values + +SYNOPSIS +============ + +ldms_ls [OPTION...] [NAME] + +DESCRIPTION +=============== + +The ldms_ls command is used to query an ldmsd (ldms daemon) for metric +set values. + +ENVIRONMENT +=============== + +The following environment variables must be set: +------------------------------------------------ + +LD_LIBRARY_PATH + include the path to ovis/lib and libevent2. On some system, lib64 + rather than lib is required. + +PATH + include the path to ovis/sbin + +The following environment variables may be set to override compiled defaults: +----------------------------------------------------------------------------- + +ZAP_LIBPATH + path to ovis/lib/ovis-ldms + +LDMSD_PLUGIN_LIBPATH + path to ovis/lib/ovis-ldms + +The following environment variables are optional: +------------------------------------------------- + +LDMS_LS_MEM_SZ + The size of memory reserved for metric sets. See the -m option. + +OPTIONS +=========== + +If the NAME is specified on the command line without -E/-S/-I, only information for that instance = NAME is displayed. + +**-E** *NAME* + | + | Indicates that the NAME is a regular expression. + +**-S** *NAME* + | + | Indicates that the NAME is a schema name. + +**-I** *NAME* + | + | Indicates that the NAME is an instance name. This is the default. + +**-h** *HOST* + | + | HOST to query. Default is localhost. + +**-x** *TRANSPORT* + TRANSPORT to use for the query. values are sock, rdma, or ugni (Cray + XE/XK/XC). Default is sock. + +**-p** *PORT* + PORT of the HOST to use for the query. Default is LDMS_DEFAULT_PORT. + +**-l** + Display long listing. Outputs details of the metric set, including + timestamp, metric names, metric types, and values. + +**-a** *AUTH* + The name of the LDMS Authentication plugin. Please see + **ldms_authentication**\ (7) for more details. (default: "none"). + +**-A** *NAME*\ **=**\ *VALUE* + The name-value options for the LDMS Authentication plugin. This + option can be given multiple times. 
Please see
+   **ldms_authentication**\ (7) for more information and consult the
+   plugin manual for the option details.
+
+**-m** *MEMORY_SIZE*
+   |
+   | MEMORY_SIZE is the size of memory reserved for metric sets. This
+     value has precedence over the value of the LDMS_LS_MEM_SZ
+     environment variable. The given size must be less than 1 petabyte.
+     For example, 20M or 20mb are 20 megabytes. Unless a specific set is
+     being queried, this should usually match the size of pre-allocated
+     memory specified when starting the remote ldmsd being queried.
+
+**-u**
+   Display the user data for the metrics. (Usually the component id.)
+
+**-v**
+   Display metadata information. Specifying this option multiple times
+   increases the verbosity.
+
+**-V**
+   Display LDMS version information and then exit.
+
+**-w** *WAIT_SEC*
+   WAIT_SEC is the time to wait before giving up on the server. Default
+   is 10 sec.
+
+DEFAULTS
+============
+
+**ldms_ls** with no arguments defaults to **ldms_ls -p** *XXX* **-h**
+*localhost* **-x** *sock* where XXX is the LDMS_DEFAULT_PORT.
+
+NOTES
+=========
+
+None.
+
+BUGS
+========
+
+No known bugs.
+
+EXAMPLES
+============
+
+::
+
+   1) $ldms_ls -h vm1 -x sock -p 60000
+   vm1_1/meminfo
+   vm1_1/vmstat
+
+
+
+   2) $ldms_ls -h vm1 -x sock -p 60000 -l
+   vm1_1/meminfo: consistent, last update: Thu Oct 29 08:04:44 2015 [202552us]
+   D u64 MemTotal 132165188
+   D u64 MemFree 129767048
+   D u64 Buffers 0
+   D u64 Cached 46780
+   D u64 SwapCached 0
+   D u64 Active 16116
+   D u64 Inactive 8596
+   D u64 Active(anon) 10440
+   D u64 Inactive(anon) 220
+   D u64 Active(file) 5676
+   D u64 Inactive(file) 8376
+   D u64 Unevictable 35400
+   D u64 Mlocked 6032
+
+
+
+
+   The output format of the data is as follows:
+   M/D
+      indicates metadata vs data values
+   Metrictype
+      in the example above, unsigned int 64
+   Value
+      value of the metric
+
+   3) For a non-existent set:
+   $ldms_ls -h vm1 -x sock -p 60000 -l vm1_1/foo
+   ldms_ls: No such file or directory
+   ldms_ls: lookup failed for set 'vm1_1/foo'
+
+   4) Display metadata:
+   ldms_ls -h vm1 -x sock -p 60000 -v
+   vm1_1/meminfo: consistent, last update: Fri Dec 16 17:12:08 2016 [5091us]
+   METADATA --------
+     Producer Name : vm1_1
+     Instance Name : vm1_1/meminfo
+     Schema Name   : meminfo
+     Size          : 1816
+     Metric Count  : 43
+     GN            : 2
+   DATA ------------
+     Timestamp     : Fri Dec 16 17:12:08 2016 [5091us]
+     Duration      : [0.000072s]
+     Consistent    : TRUE
+     Size          : 384
+     GN            : 985
+   -----------------
+
+   5) Regular Expression:
+   $ldms_ls -h vm1 -x sock -p 60000 -E vm1
+   vm1_1/meminfo
+   vm1_1/vmstat
+
+   $ldms_ls -h vm1 -x sock -p 60000 -E vms
+   vm1_1/vmstat
+
+   $ldms_ls -h vm1 -x sock -p 60000 -E -I memin
+   vm1_1/meminfo
+
+   $ldms_ls -h vm1 -x sock -p 60000 -E -S ^vmstat$
+   vm1_1/vmstat
+
+   $ldms_ls -h vm1 -x sock -p 60000 -E -S cpu
+   ldms_ls: No metric sets matched the given criteria
+
+If the -E option is not given, the given string will be taken literally,
+i.e., it is equivalent to giving -E ^NAME$.
+
+The regular expression option can be used with the -v and -l options. In
+this case ldms_ls will display only the information of the metric sets
+that matched the given regular expression.
+
+SEE ALSO
+============
+
+ldms_authentication(7), ldmsd(8), ldms_quickstart(7)
diff --git a/rtd/man2rst/ldms_quickstart.rst b/rtd/man2rst/ldms_quickstart.rst
new file mode 100644
index 000000000..0cff2344d
--- /dev/null
+++ b/rtd/man2rst/ldms_quickstart.rst
@@ -0,0 +1,836 @@
+===============
+ldms_quickstart
+===============
+
+:Date: 12 Dec 2016
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+================
+
+LDMS_QuickStart - man page for Quick Start of LDMS
+
+INTRODUCTION
+========================
+
+LDMS is the Lightweight Distributed Metric Service. LDMS is a
+distributed data collection, transport, and storage tool that supports a
+wide variety of configuration options. There are three main functional
+components described below.
+
+*Samplers* run one or more plugins that periodically sample data of
+interest. Each plugin defines a group of metrics called a metric set.
+The sampling frequency is user defined and can be dynamically changed. A
+host can simultaneously run multiple plugins. Configuration flags
+determine whether the sampling plugins run synchronously or
+asynchronously (both on a host and across hosts). Memory allocated for a
+particular metric set is overwritten by each successive sampling. The
+host daemon does not retain sample history; plugins do not typically
+retain history, but can be written to do so.
+
+*Aggregators* collect data in a pull fashion from samplers and/or other
+aggregators. The collection frequency is user defined and operates
+independently of other collection operations and sampling operations.
+Distinct metric sets can be collected at different frequencies. Once
+started, the aggregation schedule cannot be altered without restarting
+the aggregator. Fan-in refers to the number of hosts collected from by a
+single aggregator. Maximum fan-in varies by transport but is roughly
+9,000:1 for the socket transport and for the RDMA transport over
+InfiniBand. It is > 15000:1 for RDMA over the Cray Gemini transport.
+Daisy chaining is not limited to two levels; multiple aggregators may
+aggregate from the same sampler or aggregator ldmsd. Fan-in at higher
+levels is limited by the aggregator host capabilities (CPU, memory,
+network bandwidth, and storage bandwidth).
+
+*Storage* plugins write in a variety of formats. Plugins for Comma
+Separated Value (CSV) file storage of metric sets are provided. Storage
+occurs when valid updated metric set data is collected by an aggregator
+that has been configured to write that data to storage. Collection of a
+metric set whose data has not been updated or is incomplete does not
+result in a write to storage in any format.
+
+The host daemon is the same base code in all cases; differentiation is
+based on configuration of plugins for sampling or storage and on
+configuring aggregation of data from other host daemons.
+
+DESCRIPTION
+=======================
+
+Quick Start instructions for LDMS (Lightweight Distributed Metric
+Service).
+
+This man page describes how to configure and run LDMS daemons (ldmsd) to
+perform the following tasks:
+
+- collect data
+
+- aggregate data from multiple ldmsds
+
+- store collected data to files.
+
+There are three basic configurations that will be addressed:
+
+- configuring an ldmsd with collector plugins
+
+- configuring an ldmsd to aggregate information from other ldmsds
+
+- configuring a store_csv storage plugin on an ldmsd.
+
+The order in which these configurations should be performed does not
+matter with respect to collectors and aggregators.
+
+While a complete listing of flags and parameters can be seen by running
+ldmsd and the configuration tools with the --help directive, this
+document describes the flags and parameters required for running a basic
+setup.
+
+There are no run scripts provided in the current release; the examples
+here can be used in the creation of such.
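+
+For instance, a minimal wrapper script for a sampler daemon might look
+like the following sketch (paths, port, and log level are illustrative;
+the environment variables and the configuration file are described in
+Sections 3 and 4 below):
+
+::
+
+   #!/bin/bash
+   # environment for an install under /tmp/opt/ovis (see Section 3-1)
+   export LD_LIBRARY_PATH=/tmp/opt/ovis/lib/:/tmp/opt/ovis/lib/ovis-ldms/:$LD_LIBRARY_PATH
+   export ZAP_LIBPATH=/tmp/opt/ovis/lib/ovis-ldms
+   export LDMSD_PLUGIN_LIBPATH=/tmp/opt/ovis/lib/ovis-ldms
+   export PATH=/tmp/opt/ovis/sbin:/tmp/opt/ovis/bin:$PATH
+   export LDMSD_SOCKPATH=/tmp/run/ldmsd
+   # start a sampler daemon (see Section 4 for config.file)
+   ldmsd -x sock:60000 -S /tmp/run/ldmsd/metric_socket -l /tmp/log/logfile -v DEBUG -c ./config.file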
+
+Arrangement of this document
+========================================
+
+This document is arranged as follows:
+
+   1) Prerequisites
+
+   2) Build and install
+
+   3) Configuring and Starting an ldmsd (general)
+
+   4) through 8) Example ldmsd configurations and queries
+
+   9) Protection Domain Tags (Cray Only)
+
+   10) Troubleshooting
+
+   11) About the man pages
+
+1) PREREQUISITES
+============================
+
+- All sections below assume the build directory is /tmp/opt/ovis.
+
+- libevent-2.0 is a requirement.
+
+- Python 2.7 or Python 2.6 with the argparse module is required for
+  ldmsd_controller
+
+2) BUILD/INSTALL:
+=============================
+
+There is a separate document with build/install instructions.
+
+The default ldms build in v3 has authentication turned on. This document
+does not include use of the authentication flags; the instructions here
+are as if you had built with --disable-ovis_auth. For more information
+on authentication, see the ldms_authentication man page.
+
+3) CONFIGURING AND STARTING AN LDMSD
+================================================
+
+3-1) Environment Variables for LDMS
+-----------------------------------
+
+You will need to set the following environment variables when running
+LDMS daemons. This assumes that ldms has been installed into
+/tmp/opt/ovis.
+
+::
+
+   export LD_LIBRARY_PATH=/tmp/opt/ovis/lib/:/tmp/opt/ovis/lib/ovis-ldms/:/lib:$LD_LIBRARY_PATH
+   export ZAP_LIBPATH=/tmp/opt/ovis/lib/ovis-ldms
+   export LDMSD_PLUGIN_LIBPATH=/tmp/opt/ovis/lib/ovis-ldms
+   export PATH=/tmp/opt/ovis/sbin/:/tmp/opt/ovis/bin:$PATH
+   export LDMSD_SOCKPATH=/tmp/run/ldmsd
+
+LDMSD_SOCKPATH determines the location for the unix domain socket
+(described in the ldmsd args below). The default is /var/run/ldmsd. Make
+sure you use a location that is writable if you are running as
+non-root.
+
+3-2) Options for Configuring Plugins of an ldmsd
+------------------------------------------------
+
+Plugins for an ldmsd can be configured via a configuration file
+specified as an argument to the "-c" flag. Also, ldmsd_controller is a
+configuration tool that can work in interactive mode and can also
+accept commands/scripts directed to a socket. The plugin configuration
+commands are the same in all cases.
+
+In the instructions below, we briefly illustrate use of a configuration
+script with ldmsd vs. ldmsd_controller. Some environment variables have
+been suppressed in this section for clarity. In all subsequent examples
+(Sections 4+), we provide verbose detail for the ldmsd configuration
+script method only. Altering this to use the other methods should then
+be obvious.
+
+3-2a) Configuring an ldmsd via a configuration script
+-----------------------------------------------------
+
+This is the most common mode of configuring ldms in production scenarios
+and can also be used for test scenarios.
+
+Example commands for configuring a sampler:
+
+::
+
+   > more config.file
+
+   load name=meminfo
+   config name=meminfo producer=vm1_1 instance=vm1_1/meminfo
+   start name=meminfo interval=1000000
+
+The path to the configuration script is then provided to the ldmsd via
+the "-c" flag when it is started:
+
+Example ldmsd start command with a configuration script:
+
+::
+
+   ldmsd -x sock:60000 -S tmp/ldmsd/sock1 -l /tmp/log/logfile -v DEBUG -c ./config.file
+
+3-2b) Configuring ldmsd via ldmsd_controller
+--------------------------------------------
+
+You can use ldmsd_controller to connect to the ldmsd at any time to
+issue plugin commands.
This is most often used for dynamically issuing
+commands to a running ldmsd.
+
+Example ldmsd start command without a configuration script:
+
+::
+
+   ldmsd -x sock:60000 -S tmp/ldmsd/sock1 -l /tmp/log/logfile -v DEBUG
+
+Call the ldmsd_controller interactively and enter the same commands as
+you would in the configuration script.
+
+::
+
+   ldmsd_controller --host vm1_1 --port=61000
+   ldmsd_controller> load name=meminfo
+   ldmsd_controller> config name=meminfo producer=vm1_1 instance=vm1_1/meminfo
+   ldmsd_controller> start name=meminfo interval=1000000
+   ldmsd_controller> quit
+
+Relatedly, you can run ldmsd_controller with the commands in script
+form. For example:
+
+::
+
+   > more config.sh
+
+   #!/bin/bash
+   echo "load name=meminfo"
+   echo "config name=meminfo producer=vm1_1 instance=vm1_1/meminfo"
+   echo "start name=meminfo interval=1000000"
+
+Call the ldmsd_controller with the script:
+
+::
+
+   ldmsd_controller --host vm1_1 --port=60000 --script ./config.sh
+
+ldmsd_controller may be executed multiple times to issue different
+commands to the same ldmsd.
+
+3-3) Starting an ldmsd
+----------------------
+
+3-3a) Set environment variables, as described above.
+
+3-3b) Run ldmsd:
+
+::
+
+   <path to>/ldmsd -x <transport>:<port> -S <unix domain socket> -l <log file> -v <LOG_LEVEL> -c config.file
+
+Notes:
+
+- Transport is one of: sock, rdma, ugni (ugni is Cray specific for
+  using RDMA over the Gemini/Aries network)
+
+- The configuration file contains the commands to configure the
+  plugins.
+
+- The unix domain socket can be used to communicate configuration
+  information to an ldmsd. The default path for this is
+  /var/run/ldmsd/. To change this the environment variable
+  LDMSD_SOCKPATH must be set to the desired path (e.g. export
+  LDMSD_SOCKPATH=/tmp/run/ldmsd)
+
+- No log can be obtained by using LOG_LEVEL QUIET, or specifying
+  /dev/null for the log file, or using command line redirection.
+
+- The default is to run as a background process, but the -F flag can be
+  specified for foreground.
+
+- A script can be made to start ldmsd and collectors on a host, where
+  that script contains the information to execute the command.
+
+3-3c) Examples for launching ldmsd:
+
+- Start an ldmsd on the socket transport with a log file and a
+  configuration file.
+
+::
+
+   /tmp/opt/ovis/sbin/ldmsd -x sock:60000 -S /var/run/ldmsd/metric_socket -l /tmp/opt/ovis/logs/1 -c config.file
+
+   Same but with log level QUIET
+   /tmp/opt/ovis/sbin/ldmsd -x sock:60000 -S /var/run/ldmsd/metric_socket -l /tmp/opt/ovis/logs/1 -c config.file -v QUIET
+
+- Start 2 instances of ldmsd on host vm1
+
+::
+
+   Note: Make sure to use different socket names and listen on different ports if you are on the same host.
+   /tmp/opt/ovis/sbin/ldmsd -x sock:60000 -S /var/run/ldmsd/metric_socket_vm1_1 -l /tmp/opt/ovis/logs/vm_1 -c config.file
+   /tmp/opt/ovis/sbin/ldmsd -x sock:60001 -S /var/run/ldmsd/metric_socket_vm1_2 -l /tmp/opt/ovis/logs/vm_2 -c config.file
+
+4) EXAMPLE: CONFIGURE AN LDMSD WITH SAMPLER PLUGINS
+===============================================================
+
+4-1) Create the configuration file for the sampler plugins:
+-----------------------------------------------------------
+
+Configure a "meminfo" collector plugin to collect every second.
+
+::
+
+   load name=meminfo
+   config name=meminfo producer=vm1_1 instance=vm1_1/meminfo
+   start name=meminfo interval=1000000
+
+
+   Notes:
+   For synchronous operation include "offset=<#usec>" in start line (e.g. start name=meminfo interval=xxx offset=yyy).
+   This will cause the sampler to target interval + yyy aligned to the second and microsecond
+   (e.g. every 5 seconds with an offset of 0 usec would ideally result in collections at 00:00:00, 00:00:05, 00:00:10, etc.
+   whereas with an offset of 100,000 usec it would be 00:00:00.1, 00:00:05.1, 00:00:10.1, etc.)
+   Different plugins may have additional configuration parameters.
+
+4-2) Set environment variables, as described above.
+---------------------------------------------------
+
+4-3) Start the ldmsd with the config file, as described above, e.g.:
+--------------------------------------------------------------------
+
+   ldmsd -x sock:60000 -S tmp/ldmsd/sock1 -l /tmp/log/logfile -v DEBUG
+   -c ./config.file
+
+4-4) Verifying the collector
+----------------------------
+
+At this point the ldmsd collector should be checked using the utility
+ldms_ls (See Using ldms_ls below)
+
+5) EXAMPLE: CONFIGURE AN AGGREGATOR USING LDMSD_CONTROLLER
+======================================================================
+
+5-1) Start 2 separate ldmsds, one on host vm1_1 and one on host vm1_2, with sampler plugins, as described above
+---------------------------------------------------------------------------------------------------------------
+
+5-2) Write a script to add producers and start collecting from them:
+--------------------------------------------------------------------
+
+This adds vm1_1 as a producer with its sets collected at 2 second
+intervals and vm1_2 as a producer with its sets collected at 5 second
+intervals. Here the "name" of the producer must match the "producer"
+name given to the sampler.
+
+The first set of lines adds the producers. The second set of lines
+establishes the aggregation from them at the specified intervals.
+
+::
+
+   > more add_prdcr.config
+   prdcr_add name=vm1_2 host=vm1 type=active xprt=sock port=60001 interval=20000000
+   prdcr_start name=vm1_2
+   prdcr_add name=vm1_1 host=vm1 type=active xprt=sock port=60000 interval=20000000
+   prdcr_start name=vm1_1
+   updtr_add name=policy2_h1 interval=2000000 offset=0
+   updtr_prdcr_add name=policy2_h1 regex=vm1_1
+   updtr_start name=policy2_h1
+   updtr_add name=policy5_h2 interval=5000000 offset=0
+   updtr_prdcr_add name=policy5_h2 regex=vm1_2
+   updtr_start name=policy5_h2
+
+5-3) Set environment variables, as described above
+--------------------------------------------------
+
+5-4) Start an ldmsd on your host to aggregate using the configuration file
+--------------------------------------------------------------------------
+
+   /tmp/opt/ovis/sbin/ldmsd -x sock:60002 -S
+   /var/run/ldmsd/metric_socket_agg -l /tmp/opt/ovis/logs/vm1_agg -c
+   ./add_prdcr.config
+
+Notes:
+
+- There is no requirement that aggregator intervals match collection
+  intervals
+
+- Because the collection and aggregation processes operate
+  asynchronously, there is the potential for duplicate data collection
+  as well as missed samples. The first is handled by the storage
+  plugins by comparing generation numbers and not storing duplicates.
+  The second implies either a loss in fidelity (if collecting counter
+  data) or a loss of data points here and there (if collecting
+  differences of counter values or non counter values). This can be
+  handled using the synchronous option on both collector and aggregator
+  but is not covered here.
+
+5-5) At this point the aggregator ldmsd should be checked using the utility ldms_ls
+-----------------------------------------------------------------------------------
+
+(See Using ldms_ls below).
In this case you should see metric sets for
+both vm1_1 and vm1_2 displayed when you query the aggregator ldmsd using
+ldms_ls.
+
+6) EXAMPLE: CONFIGURE AN LDMS AGGREGATOR WITH A STORAGE PLUGIN
+==========================================================================
+
+6-1) Add storage configuration lines to the configuration file described above.
+-------------------------------------------------------------------------------
+
+This adds a store_csv to store sets whose schemas are meminfo or vmstat
+and whose instance name matches the regex. A set's schema and instance
+names will be seen in the output of ldms_ls (described below).
+
+::
+
+   > more add_store.sh
+   load name=store_csv
+   config name=store_csv path=<> action=init altheader=0 rollover=30 rolltype=1
+   strgp_add name=policy_mem plugin=store_csv container=csv schema=meminfo
+   strgp_prdcr_add name=policy_mem regex=vm*
+   strgp_start name=policy_mem
+   strgp_add name=policy_vmstat plugin=store_csv container=csv schema=vmstat
+   strgp_prdcr_add name=policy_vmstat regex=vm*
+   strgp_start name=policy_vmstat
+
+Notes:
+
+- For the csv store, the whole path must pre-exist.
+
+- See the Plugin_store_csv man page for more info on the plugin
+  configuration arguments.
+
+- If you want to collect on a host and store that data on the same
+  host, run two ldmsd's: one with a collector plugin only and one as an
+  aggregator with a store plugin only.
+
+6-2) Set environment variables, as described above
+--------------------------------------------------
+
+6-3) Start the aggregator with the full configuration file (both aggregator and store lines), as described above
+----------------------------------------------------------------------------------------------------------------
+
+6-4) Verify the store
+---------------------
+
+Go to the data store and verify that files have been created and are
+being written to:
+
+::
+
+   cd <>/
+   ls -ltr
+
+You can now utilize this data.
+
+Data will flush to the store when the OS flushes data, unless an advanced
+flag is used. Thus, in a default configuration, if you have a small
+number of nodes and/or a long interval, you may not see data appear in
+the store for a few minutes.
+
+7) EXAMPLES: USING LDMS_LS TO DISPLAY SETS/METRICS FROM AN LDMSD
+============================================================================
+
+7-1) Set environment variables, as described above
+--------------------------------------------------
+
+7-2a) Query ldmsd on host vm1 listening on port 60000 (sampler) using the sock transport for metric sets being served by that ldmsd
+-----------------------------------------------------------------------------------------------------------------------------------
+
+::
+
+   ldms_ls -h vm1 -x sock -p 60000
+   Should return:
+   vm1_1/meminfo
+   vm1_1/vmstat
+
+7-2b) Query ldmsd on host vm1 listening on port 60002 (aggregator) using the sock transport for metric sets being served by that ldmsd
+--------------------------------------------------------------------------------------------------------------------------------------
+
+::
+
+   ldms_ls -h vm1 -x sock -p 60002
+   Should return:
+   vm1_1/meminfo
+   vm1_1/vmstat
+   vm1_2/meminfo
+   vm1_2/vmstat
+
+7-2c) Query ldmsd on host vm1 listening on port 60000 using the sock transport for the names and contents of metric sets being served by that ldmsd.
+---------------------------------------------------------------------------------------------------------------------------------------------------- + +Should return: Set names (vm1_1/meminfo and vm1_1/vmstat in this case) +as well as all names and values associated with each set respectively. +Only vm1_1/meminfo shown here. + +:: + + > ldms_ls -h vm1 -x sock -p 60000 -l + vm1_1/meminfo: consistent, last update: Wed Jul 31 21:51:08 2013 [246540us] + U64 33084652 MemTotal + U64 32092964 MemFree + U64 0 Buffers + U64 49244 Cached + U64 0 SwapCached + U64 13536 Active + U64 39844 Inactive + U64 5664 Active(anon) + U64 13540 Inactive(anon) + U64 7872 Active(file) + U64 26304 Inactive(file) + U64 2996 Unevictable + U64 2988 Mlocked + U64 0 SwapTotal + U64 0 SwapFree + U64 0 Dirty + U64 0 Writeback + U64 7164 AnonPages + U64 6324 Mapped + U64 12544 Shmem + U64 84576 Slab + U64 3948 SReclaimable + U64 80628 SUnreclaim + U64 1608 KernelStack + U64 804 PageTables + U64 0 NFS_Unstable + U64 0 Bounce + U64 0 WritebackTmp + U64 16542324 CommitLimit + U64 73764 Committed_AS + U64 34359738367 VmallocTotal + U64 3467004 VmallocUsed + U64 34356268363 VmallocChunk + U64 0 HugePages_Total + U64 0 HugePages_Free + U64 0 HugePages_Rsvd + U64 0 HugePages_Surp + U64 2048 Hugepagesize + U64 565248 DirectMap4k + U64 5726208 DirectMap2M + U64 27262976 DirectMap1G + +7-2d) Query for a non-existent set: +=============================================== + +:: + + ldms_ls -h vm1 -x sock -p 60000 -l vm1_1/foo + ldms_ls: No such file or directory + ldms_ls: lookup failed for set 'vm1_1/foo' + +7-2e) Display metadata about sets contained by vm1 ldmsd listening on port 60000 +============================================================================================ + +:: + + ldms_ls -h vm1 -x sock -p 60000 -v + vm1_1/meminfo: consistent, last update: Fri Dec 16 17:12:08 2016 [5091us] + METADATA -------- + Producer Name : vm1_1 + Instance Name : vm1_1/meminfo + Schema Name : meminfo + Size : 1816 + Metric Count : 43 + GN : 2 + DATA ------------ + Timestamp : Fri Dec 16 17:12:08 2016 [5091us] + Duration : [0.000072s] + Consistent : TRUE + Size : 384 + GN : 985 + ----------------- + +8) STOP AN LDMSD +============================ + +To kill all ldmsd on a host +--------------------------- + +:: + + killall ldmsd + +9) PROTECTION DOMAIN TAGS (Cray) +============================================ + +9-1) Cray XE/XK: +---------------- + +If you are going to be using the "ugni" transport (RDMA over Gemini) you +will need to run with either system (as root) or user (as user) ptags. +While root CAN run using any ptag the fact that its use is unknown to +ALPS could cause collisions with applications. + +To see current ptags: +--------------------- + +:: + + > apstat -P + PDomainID Type Uid PTag Cookie + LDMS system 0 84 0xa9380000 + +To create a userspace ptag: +--------------------------- + +:: + + apmgr pdomain -c + + Example: + > apmgr pdomain -c foo + > apstat -P + PDomainID Type Uid PTag Cookie + LDMS system 0 84 0xa9380000 + foo user 12345 233 0xa1230000 + +Note: A system administrator will have to setup system ptags and/or +enable users to set up ptags. 
7-2e) Display metadata about sets contained by vm1 ldmsd listening on port 60000
---------------------------------------------------------------------------------

::

   ldms_ls -h vm1 -x sock -p 60000 -v
   vm1_1/meminfo: consistent, last update: Fri Dec 16 17:12:08 2016 [5091us]
   METADATA --------
   Producer Name : vm1_1
   Instance Name : vm1_1/meminfo
   Schema Name : meminfo
   Size : 1816
   Metric Count : 43
   GN : 2
   DATA ------------
   Timestamp : Fri Dec 16 17:12:08 2016 [5091us]
   Duration : [0.000072s]
   Consistent : TRUE
   Size : 384
   GN : 985
   -----------------

8) STOP AN LDMSD
============================

To kill all ldmsd on a host
---------------------------

::

   killall ldmsd

9) PROTECTION DOMAIN TAGS (Cray)
============================================

9-1) Cray XE/XK:
----------------

If you are going to be using the "ugni" transport (RDMA over Gemini), you
will need to run with either system (as root) or user (as user) ptags.
While root CAN run using any ptag, the fact that its use is unknown to
ALPS could cause collisions with applications.

To see current ptags:
---------------------

::

   > apstat -P
   PDomainID Type Uid PTag Cookie
   LDMS system 0 84 0xa9380000

To create a userspace ptag:
---------------------------

::

   apmgr pdomain -c <name>

   Example:
   > apmgr pdomain -c foo
   > apstat -P
   PDomainID Type Uid PTag Cookie
   LDMS system 0 84 0xa9380000
   foo user 12345 233 0xa1230000

Note: A system administrator will have to set up system ptags and/or
enable users to set up ptags.

To remove a userspace ptag:
---------------------------

::

   apmgr pdomain -r <name>

Note: The userid of the ptag being removed must match that of the user
running the command, or the command must be run as root.

PTAG-Related Environment variables for ldms (XE/XK)
---------------------------------------------------

Set the following environment variables for either user or system ptags
(the example shows user ptag values):

::

   export ZAP_UGNI_PTAG=233
   export ZAP_UGNI_COOKIE=0xa1230000

Starting ldms from aprun with ptags
-----------------------------------

When running with user space ptags, you must specify the ptag name when
using aprun:

::

   aprun <> -p foo ldmsd <>
   or
   aprun <> -p foo ldms_ls <>

Note: On some systems you will run aprun after a qsub -I or within a
script specified in qsub or similar.

9-2) Cray XC, CLE <= 5.2:
-------------------------

If you are going to be using the "ugni" transport (RDMA over Aries), you
will need to run with either system (as root) or user (as user) ptags.
While root CAN run using any ptag, the fact that its use is unknown to
ALPS could cause collisions with applications.

To see current ptags:
---------------------

::

   > apstat -P
   PDomainID Type Uid Cookie Cookie2
   LDMS system 0 0x86b80000 0

To create a userspace ptag:
---------------------------

::

   apmgr pdomain -c <name>

   Example:
   > apmgr pdomain -c foo
   > apstat -P
   PDomainID Type Uid Cookie Cookie2
   LDMS system 0 0x86b80000 0
   foo user 20596 0x86bb0000 0x86bc0000

Note: A system administrator will have to set up system ptags and/or
enable users to set up ptags.

To remove a userspace ptag:
---------------------------

::

   apmgr pdomain -r <name>

Note: The userid of the ptag being removed must match that of the user
running the command, or the command must be run as root.

PTAG-Related Environment variables for ldms (XC)
------------------------------------------------

Set the following environment variables. On XC the ptag value doesn't
matter, but ZAP_UGNI_PTAG must be defined. Set the Cookie (not Cookie2)
for either a user or system ptag.

::

   export ZAP_UGNI_PTAG=0
   export ZAP_UGNI_COOKIE=0x86bb0000

Starting ldms from aprun with ptags
-----------------------------------

When running with user space ptags, you must specify the ptag name when
using aprun:

::

   aprun <> -p foo ldmsd <>
   or
   aprun <> -p foo ldms_ls <>

Note: On some systems you will run aprun after a qsub -I or within a
script specified in qsub or similar.

10) TROUBLESHOOTING
===============================

What causes the following error: libibverbs: Warning: RLIMIT_MEMLOCK is 32768 bytes?
------------------------------------------------------------------------------------

Running as a user with "max locked memory" set too low. The following is
an example of trying to run ldms_ls as a user with "max locked memory"
set to 32k:

::

   ldms_ls -h <host> -x rdma -p <port>
   libibverbs: Warning: RLIMIT_MEMLOCK is 32768 bytes.
   This will severely limit memory registrations.
   RDMA: recv_buf reg_mr failed: error 12
   ldms_ls: Cannot allocate memory
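If your limits are adjustable, raise the locked-memory limit before
running ldms_ls or ldmsd (shown for the current shell; persistent settings
usually belong in /etc/security/limits.conf):

::

   # check the current "max locked memory" limit
   ulimit -l
   # raise it for this shell, then retry
   ulimit -l unlimited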
Why doesn't my ldmsd start?
---------------------------

Possible options:

- Check for an existing /var/run/ldms/metric_socket or similar. Sockets
  can be left behind if an ldmsd did not clean up upon termination;
  kill -9 may leave the socket hanging around.

- The port you are trying to use may already be in use on the node. The
  following shows the logfile output of such a case:

  ::

     Tue Sep 24 08:36:54 2013: Started LDMS Daemon version 2.1.0
     Tue Sep 24 08:36:54 2013: Process 123456 listening on transport ugni:60020
     Tue Sep 24 08:36:54 2013: EV_WARN: Can't change condition callbacks once they have been initialized.
     Tue Sep 24 08:36:54 2013: Error 12 listening on the 'ugni' transport.
     Tue Sep 24 08:36:54 2013: LDMS Daemon exiting...status 7

- If using the -l flag, make sure that your log directory exists prior
  to running.

- If writing to a store with this particular ldmsd, make sure that your
  store directory exists prior to running.

- If you are running on a Cray with transport ugni using a user space
  PTag, check that you called aprun with the -p flag:

  ::

     aprun -N 1 -n -p run_my_ldmsd.sh

How can I find what process is using the port?
----------------------------------------------

::

   netstat -anp

Why aren't all my hosts/sets being added to the aggregator?
-----------------------------------------------------------

Possible options:

- Use the -m flag on the aggregator to use more memory when adding a
  lot of hosts.

- Use the -P flag on the aggregator to use more processors (worker
  threads).

Why isn't my ldmsd storing its own set to the store?
----------------------------------------------------

Currently, this is not supported. You can use a separate ldmsd on the
same host to gather another ldmsd's data for that host.

Why is my aggregator not responding (Cray XE/XK)?
-------------------------------------------------

Running an ldmsd aggregator as a user but trying to aggregate from an
ldmsd that uses a system ptag can result in the aggregator hanging
(alive but not responding and not writing to the store). The following
is the logfile output of such an aggregator:

::

   Tue Sep 24 08:42:40 2013: Connected to host 'nid00081:60020'
   Tue Sep 24 08:42:42 2013: cq_thread_proc: Error 11 monitoring the CQ.

11) MAN PAGES
=========================

ldms comes with man pages. In the build process these will be installed
in /ovis/share/man. Man pages are in the following categories:

General
-------

General pages address information such as ldms_build_install,
ldms_quickstart, and ldms_authentication.

Utilities
---------

Utilities pages address the various utilities and commands such as
ldmsd, ldmsd_controller, and ldms_ls.

Plugins
-------

Plugin pages address all plugins, both samplers and stores. The naming
convention for these pages is Plugin_XXX. For example: Plugin_aries_mmr,
Plugin_cray_system_sampler_variants, Plugin_kgnilnd, Plugin_meminfo,
Plugin_procinterrupts, Plugin_procnetdev, Plugin_procnfs,
Plugin_store_csv, Plugin_store_function_csv, Plugin_store_sos, and
Plugin_vmstat.
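For example, if OVIS is installed under /opt/ovis (an illustrative
prefix), the pages can be read with:

::

   export MANPATH=/opt/ovis/share/man:$MANPATH
   man ldms_quickstart
   man Plugin_store_csv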
NOTES
=================

As part of the install, test scripts are placed in /tmp/opt/ovis/bin.
These scripts may serve as additional examples. They are being
converted from using the obsolete ldmsctl tool to the ldmsd_controller
tool, so they may not be fully updated at any given time.

BUGS
================

No known bugs.

SEE ALSO
====================

ldms_build_install(7), ldms_authentication(7), ldmsd(8),
ldmsd_controller(8), ldms_ls(8) diff --git a/rtd/man2rst/ldms_rdc_schema_name.rst b/rtd/man2rst/ldms_rdc_schema_name.rst new file mode 100644 index 000000000..562696011 --- /dev/null +++ b/rtd/man2rst/ldms_rdc_schema_name.rst @@ -0,0 +1,63 @@
====================
ldms_rdc_schema_name
====================

:Date: 2 April 2021

.. contents::
   :depth: 3
..

NAME
=====================

ldms_rdc_schema_name - man page for the LDMS rdc_sampler plugin support
utility

SYNOPSIS
=========================

| ldms_rdc_schema_name -h
| ldms_rdc_schema_name [-d]

DESCRIPTION
============================

The rdc_sampler plugin generates a schema name that includes a hash of
certain configuration data. ldms_rdc_schema_name provides the user with
the resulting name before running ldmsd so that store plugins can be
configured.

CONFIGURATION ATTRIBUTE SYNTAX
===============================================

See Plugin_rdc_sampler(7).

EXAMPLES
=========================

::

   # ldms_rdc_schema_name -h

   # ldms_rdc_schema_name metrics=base schema=myrdc_sampler | grep -v ERROR
   myrdc_sampler_51dcba58

   # ldms_rdc_schema_name metrics=xgmi
   rdc_sampler_device_e3e41d59

   # ldms_rdc_schema_name -d metrics=xgmi

NOTES
======================

The rdc libraries loaded by the plugin and the program may emit
inconsequential error messages to stdout. One such begins with
" ERROR RdcLibraryLoader.cc".

SEE ALSO
=========================

Plugin_rdc_sampler(7) diff --git a/rtd/man2rst/ldms_sampler_base.rst b/rtd/man2rst/ldms_sampler_base.rst new file mode 100644 index 000000000..8af324bb7 --- /dev/null +++ b/rtd/man2rst/ldms_sampler_base.rst @@ -0,0 +1,132 @@
=================
ldms_sampler_base
=================

:Date: 04 Feb 2018

.. contents::
   :depth: 3
..

NAME
==================

sampler_base - man page for the LDMS sampler_base, which is the base
class for samplers

SYNOPSIS
======================

Configuration variable base class for LDMS samplers.

DESCRIPTION
=========================

With LDMS (Lightweight Distributed Metric Service), sampler plugins for
the ldmsd (ldms daemon) should inherit from the sampler_base base class.
This class defines variables that should be common to all samplers. It
also adds them to the sampler set and handles their value assignment.

In order to configure a plugin, one should consult both the plugin
specific man page for the information and configuration arguments
specific to the plugin, and this man page for the arguments in the
sampler_base.

CONFIGURATION ATTRIBUTE SYNTAX
============================================

**config**
   name=<name> producer=<producer> instance=<instance>
   [component_id=<compid>] [schema=<sname>] [job_set=<name> job_id=<name>
   app_id=<name> job_start=<name> job_end=<name>]

|
| configuration line

   name=<name>
      |
      | This will be the name of the plugin being loaded.

   producer=<producer>
      |
      | A unique name for the host providing the data.

   instance=<instance>
      |
      | A unique name for the metric set.

   schema=<sname>
      |
      | Optional schema name. It is intended that the same sampler on
        different nodes with different metrics have a different schema.
        Defaults to the sampler name.

   component_id=<compid>
      |
      | Optional unique number for the component being monitored.
        Defaults to zero.

   job_set=<name>
      |
      | The instance name of the set containing the job data; the
        default is 'job_info'.

   job_id=<name>
      |
      | The name of the metric containing the Job Id; the default is
        'job_id'.

   app_id=<name>
      |
      | The name of the metric containing the Application Id; the
        default is 'app_id'.

   job_start=<name>
      |
      | The name of the metric containing the Job start time; the
        default is 'job_start'.

   job_end=<name>
      |
      | The name of the metric containing the Job end time; the default
        is 'job_end'.
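For instance, a meminfo sampler configuration using these common
attributes, with the optional ones included, might look like the
following (producer and instance names are illustrative):

::

   load name=meminfo
   config name=meminfo producer=nid0001 instance=nid0001/meminfo component_id=1 schema=meminfo job_set=job_info
   start name=meminfo interval=1000000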
NOTES
===================

- This man page does not cover usage of the base class for plugin
  writers.

- Not all plugins may have been converted to use the base class. The
  plugin specific man page should refer to the sampler_base where this
  has occurred.

BUGS
==================

No known bugs.

EXAMPLES
======================

Within ldmsd_controller or a configuration file:

::

   load name=meminfo
   config name=meminfo producer=vm1_1 instance=vm1_1/meminfo
   start name=meminfo interval=1000000

SEE ALSO
======================

ldmsd(8), ldms_quickstart(7), ldmsd_controller(8),
Plugin_all_example(7), Plugin_aries_linkstatus(7), Plugin_aries_mmr(7),
Plugin_array_example(7), Plugin_clock(7),
Plugin_cray_sampler_variants(7), Plugin_cray_dvs_sampler(7),
Plugin_procdiskstats(7), Plugin_fptrans(7), Plugin_kgnilnd(7),
Plugin_lnet_stats(7), Plugin_meminfo(7), Plugin_msr_interlagos(7),
Plugin_perfevent(7), Plugin_procinterrupts(7), Plugin_procnetdev(7),
Plugin_procnfs(7), Plugin_rapl(7), Plugin_sampler_atasmart(7),
Plugin_sysclassib(7), Plugin_synthetic(7), Plugin_vmstat(7) diff --git a/rtd/man2rst/ldmsctl.rst b/rtd/man2rst/ldmsctl.rst new file mode 100644 index 000000000..fcb2d25d1 --- /dev/null +++ b/rtd/man2rst/ldmsctl.rst @@ -0,0 +1,791 @@
=======
ldmsctl
=======

:Date: 19 Nov 2019

.. contents::
   :depth: 3
..

NAME
========

ldmsctl - Issue control commands to ldmsd.

SYNOPSIS
============

ldmsctl [OPTION...]

DESCRIPTION
===============

After LDMS (Lightweight Distributed Metric Service) version 3.4, ldmsctl
is a C-based LDMS daemon interface that can be used to dynamically
configure an LDMS daemon instead of ldmsd_controller when Python is not
available. After ldmsctl is started, commands can be entered at the
prompt, or (usually) a command script can be created and piped into
ldmsctl.

LDMS version 4 requires ldmsctl to use an LDMS transport (data channel)
to connect to **ldmsd** in order to leverage the LDMS Authentication
plugin in the transport. Please note that the **ldmsd** may have
multiple data channels, one of which can be dedicated for management
use.

ENVIRONMENT
===============

The following environment variables must be set (this includes
environment variables needed for the actions, for example, paths to the
sampler libraries to be added):

LD_LIBRARY_PATH
   path_to_ovis_build/lib:path_to_ovis_build/lib/ovis-ldms:path_to_libevent_2.0_build/lib

ZAP_LIBPATH
   path_to_ovis_build/lib/ovis-ldms

LDMSD_PLUGIN_LIBPATH
   path_to_ovis_build/lib/ovis-ldms

PATH
   path_to_ovis_build/sbin:path_to_ovis_build/bin

OPTIONS
===========

**-h** *HOST*
   HOST is the hostname to connect to the LDMS daemon.

**-p** *PORT*
   PORT is the port to connect to the LDMS daemon.

**-x** *XPRT*
   XPRT is the transport; one of sock, ugni, or rdma. Only used with
   the option -i.

**-a** *AUTH*
   AUTH is the name of the LDMS Authentication plugin to be used for the
   connection. Please see **ldms_authentication**\ (7) for more
   information. If this option is not given, the default is "none" (no
   authentication).

**-A** *NAME*\ **=**\ *VALUE*
   Pass the *NAME*\ =\ *VALUE* option to the LDMS Authentication
   plugin. This command line option can be given multiple times. Please
   see **ldms_authentication**\ (7) for more information, and consult
   the plugin manual page for plugin-specific options.

**-s** *SOURCE*
   SOURCE is the path to a configuration file.

**-X** *COMMAND*
   COMMAND is a shell command to be executed. The output will be sent to
   ldmsd.

**-V**
   Display LDMS version information and then exit.
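Putting the options together, a typical interactive session and an
equivalent batch invocation (host, port, and path illustrative) might
look like:

::

   # interactive prompt
   ldmsctl -h vm1 -p 60000 -x sock
   # non-interactive: send a configuration file to the daemon
   ldmsctl -h vm1 -p 60000 -x sock -s /path/to/sampler.conf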
+ +REGULAR EXPRESSION +====================== + +The regular expression specified in *regex=* option of the commands is a +POSIX Extended (modern) Regular Expression. In short, "\*+?{}|^$." are +special regular expression characters. Please see **regex(7)** for more +information. + +PLUGIN COMMAND SYNTAX +========================= + +Load a plugin +------------- + +| **load** attr= + + **name** *name* + | + | The plugin name + +List the usage of the loaded plugins +------------------------------------ + +**usage** + +unload a plugin +--------------- + +| **term** attr= + + **name** *name* + | + | The plugin name + +Send a configuration command to the specified plugin. +----------------------------------------------------- + +**config** attr= + + **name** *name* + | + | The plugin name + + **attr=value** + | + | Plugin specific attr=value tuples + + **Attributes specific for sampler plugins (Some sampler plugins + may have additional** attributes) + + **producer** *producer* + | + | A unique name for the host providing the data + + **instance** *instance* + | + | The set instance name. The name must be unique among all + metric sets in all LDMS daemons. + + **[component_id** *component_id*\ **]** + | + | A unique number for the comopnent being monitored. The + default is zero. + + **[schema** *schema*\ **]** + | + | The name of the metric set schema. + + **[job_set** *job_set*\ **]** + | + | The set instance name of the set containing the job data. The + default is 'job_info'. + + **[uid** *uid*\ **]** + | + | The user id of the set's owner. The default is the returned + value of geteuid(). + + **[gid** *gid*\ **]** + | + | The group id of the set's owner. The default is the returned + value of getegid(). + + **[perm** *perm*\ **]** + | + | The sampler plugin instance access permission. The default is + 0440. + +Start a sampler plugin +---------------------- + +**start** attr= + + **name** *name* + | + | The plugin name. + + **interval** *interval* + | + | The sample interval in microseconds. + + **[offset** *offset*\ **]** + | + | Offset (shift) from the sample mark in microseconds. Offset can + be positive or negative with magnitude up to 1/2 the sample + interval. If this offset is specified, including 0, collection + will be synchronous; if the offset is not specified, collection + will be asynchronous. Optional. + +Stop a sampler plugin +--------------------- + +**stop** attr= + + **name** *name* + | + | The plugin name. + +PRODUCER COMMAND SYNTAX +=========================== + +Add a producer to the aggregator +-------------------------------- + +| **prdcr_add** attr= + + **name** *name* + | + | The producer name. The producer name must be unique in an + aggregator. It is independent of any attributes specified for + the metric sets or hosts. + + **xprt** *xprt* + | + | The transport name [sock, rdma, ugni] + + **host** *host* + | + | The hostname of the host + + **type** *conn_type* + | + | The connection type [active, passive] + + **interval** *interval* + | + | The connection retry interval + + **[perm** *permission*\ **]** + | + | The permission to modify the producer in the future + +Delete a producer from the aggregator +------------------------------------- + +| The producer cannot be in use or running +| **prdcr_del** attr= + + **name** *name* + | + | The producer name + +Start a producer +---------------- + +**prdcr_start** attr= + + **name** *name* + | + | The producer name + + **[interval** *interval*\ **]** + | + | The connection retry interval in microsec. 
If unspecified, the + previously configured value will be used. Optional. + +Start all producers matching a regular expression +------------------------------------------------- + +**prdcr_start_regex** attr= + + **regex** *regex* + | + | A regular expression + + **[interval** *interval*\ **]** + | + | The connection retry interval in microsec. If unspecified, the + previously configured value will be used. Optional. + +Stop a producer +--------------- + +**prdcr_stop** attr= + + **name** *name* + | + | The producer name + +Stop all producers matching a regular expression +------------------------------------------------ + +**prdcr_stop_regex** attr= + + **regex** *regex* + | + | A regular expression + +Query producer status +--------------------- + +**prdcr_status** attr= + + **[name** *name*\ **]** + | + | The producer name. If none is given, the statuses of all + producers are reported. + +Subscribe for stream data from all matching producers +----------------------------------------------------- + +**prdcr_subsribe** + + **regex** *regex* + | + | The regular expression matching producer name + + **stream** *stream* + | + | The stream name + +UPDATER COMMAND SYNTAX +========================== + +Add an updater process that will periodically sample producer metric sets +------------------------------------------------------------------------- + +**updtr_add** attr= + + **name** *name* + | + | The update policy name. The policy name should be unique. It is + independent of any attributes specified for the metric sets or + hosts. + + **interval** *interval* + | + | The update/collect interval + + **[offset** *offset*\ **]** + | + | Offset for synchronized aggregation. Optional. + + **[push** *onchange|true*\ **]** + | + | Push mode: 'onchange' and 'true'. 'onchange' means the Updater + will get an update whenever the set source ends a transaction or + pushes the update. 'true' means the Updater will receive an + update only when the set source pushes the update. If \`push\` + is used, \`auto_interval\` cannot be \`true\`. + + **[auto_interval** *true|false* **]** + If true, the updater will schedule set updates according to the + update hint. The sets with no hints will not be updated. If false, + the updater will schedule the set updates according to the given + interval and offset values. If not specified, the value is + *false*. + + **[perm** *permission*\ **]** + | + | The permission to modify the updater in the future + +Remove an updater from the configuration +---------------------------------------- + +**updtr_del** attr= + + **name** *name* + | + | The update policy name + +Add a match condition that specifies the sets to update. +-------------------------------------------------------- + +**updtr_match_add** attr= + + **name** *name* + | + | The update policy name + + **regex** *regex* + | + | The regular expression + + **match** *match (inst|schema)* + | + | The value with which to compare; if match=inst, the expression + will match the set's instance name, if match=schema, the + expression will match the set's schema name. + +Remove a match condition from the Updater. +------------------------------------------ + +**updtr_match_del** attr= + + **name** *name* + | + | The update policy name + + **regex** *regex* + | + | The regular expression + + **match** *match (inst|schema)* + | + | The value with which to compare; if match=inst, the expression + will match the set's instance name, if match=schema, the + expression will match the set's schema name. 
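As a sketch combining the match commands above with the producer-matching
and start commands described next (policy name, regex, and intervals
illustrative), the following updates only sets whose schema is exactly
meminfo:

::

   updtr_add name=upd_mem interval=1000000 offset=100000
   updtr_match_add name=upd_mem regex=^meminfo$ match=schema
   updtr_prdcr_add name=upd_mem regex=.*
   updtr_start name=upd_mem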
+ +Add matching producers to an updater policy +------------------------------------------- + +This is required before starting the updater. + +**updtr_prdcr_add** attr= + + **name** *name* + | + | The update policy name + + **regex** *regex* + | + | A regular expression matching zero or more producers + +Remove matching producers to an updater policy +---------------------------------------------- + +**updtr_prdcr_del** attr= + + **name** *name* + | + | The update policy name + + **regex** *regex* + | + | A regular expression matching zero or more producers + +Start updaters. +--------------- + +**updtr_start** attr= + + **name** *name* + | + | The update policy name + + **[interval** *interval*\ **]** + | + | The update interval in micro-seconds. If this is not specified, + the previously configured value will be used. Optional. + + **[offset** *offset*\ **]** + | + | Offset for synchronized aggregation. Optional. + +Stop an updater. +---------------- + +The Updater must be stopped in order to change it's configuration. + +**updtr_stop** attr= + + **name** *name* + | + | The update policy name + +Query the updater status +------------------------ + +**updtr_status** attr= + + **[name** *name*\ **]** + | + | The updater name. If none is given, the statuses of all updaters + are reported. + +STORE COMMAND SYNTAX +======================== + +Create a Storage Policy and open/create the storage instance. +------------------------------------------------------------- + +**strgp_add** attr= + + **name** *name* + | + | The unique storage policy name. + + **plugin** *plugin* + | + | The name of the storage backend. + + **container** *container* + | + | The storage backend container name. + + **schema** *schema* + | + | The schema name of the metric set to store. + + **[perm** *permission*\ **]** + | + | The permission to modify the storage in the future + +Remove a Storage Policy +----------------------- + +| All updaters must be stopped in order for a storage policy to be + deleted +| **strgp_del** attr= + + **name** *name* + | + | The storage policy name + +Add a regular expression used to identify the producers this storage policy will apply to. +------------------------------------------------------------------------------------------ + +| If no producers are added to the storage policy, the storage policy + will apply on all producers. +| **strgp_prdcr_add** attr= + + **name** *name* + | + | The storage policy name + + **regex** *name* + | + | A regular expression matching metric set producers. + +Remove a regular expression from the producer match list +-------------------------------------------------------- + +**strgp_prdcr_del** attr= + + | **name** *name* + | The storage policy name + + **regex** *regex* + | + | The regex of the producer to remove. + +Add the name of a metric to store +--------------------------------- + +**strgp_metric_add** attr= + + | **name** *name* + | The storage policy name + + **metric** *metric* + | + | The metric name. If the metric list is NULL, all metrics in the + metric set will be stored. + +Remove a metric from the set of stored metrics. +----------------------------------------------- + +**strgp_metric_del** attr= + + | **name** *name* + | The storage policy name + + **metric** *metric* + | + | The metric to remove + +Start a storage policy. +----------------------- + +**strgp_start** attr= + + | **name** *name* + | The storage policy name + +Stop a storage policy. 
+---------------------- + +A storage policy must be stopped in order to change its configuration. + +**strgp_stop** attr= + + | **name** *name* + | The storage policy name + +Query the storage policy status +------------------------------- + +**strgp_status** attr= + + **[name** *name*\ **]** + | + | The storage policy name. If none is given, the statuses of all + storage policies are reported. + +FAILOVER COMMAND SYNTAX +=========================== + +Please see **ldmsd_failover**\ (7). + +SETGROUP COMMAND SYNTAX +=========================== + +Please see **ldmsd_setgroup**\ (7). + +STREAM COMMAND SYNTAX +========================= + +Publish data to the named stream +-------------------------------- + +**plublish** attr= + + **name** *name* + | + | The stream name + + **data** *data* + | + | The data to publish + +Subscribe to a stream +--------------------- + +**subscribe** attr= + + **name** *name* + | + | The stream name + +LDMS DAEMON COMMAND SYNTAX +============================== + +Changing the verbosity level of ldmsd +------------------------------------- + +**loglevel** attr= + + | **level** *level* + | Verbosity levels [DEBUG, INFO, ERROR, CRITICAL, QUIET] + +Exit the connected LDMS daemon gracefully +----------------------------------------- + +**daemon_exit** + +Query the connected LDMS daemon status +-------------------------------------- + +**daemon_status** + +Tell the daemon to dump it's internal state to the log file. +------------------------------------------------------------ + +**status** [name=] + + | **[**\ *type]* + | Reports only the specified objects. The choices are prdcr, updtr + and strgp. + + | prdcr: list the state of all producers. + | updtr: list the state of all update policies. + | strgp: list the state of all storage policies. + + [name *value*] + The object name of which the status will be reported. + +MISC COMMAND SYNTAX +======================= + +Display the list of available commands +-------------------------------------- + +| +| **help** + + | [*command]* + | If a command is given, the help of the command will be printed. + Otherwise, only the available command names are printed. + +Set the user data value for a metric in a metric set. +----------------------------------------------------- + +| +| **udata** attr= + + **set** *set* + | + | The sampler plugin name + + **metric** *metric* + | + | The metric name + + **udata** *udata* + | + | The desired user-data. This is a 64b unsigned integer. + +Set the user data of multiple metrics using regular expression. +--------------------------------------------------------------- + +| The user data of the first matched metric is set to the base value. + The base value is incremented by the given 'incr' value and then sets + to the user data of the consecutive matched metric and so on. +| **udata_regex** attr= + + **set** *set* + | + | The metric set name. + + **regex** *regex* + | + | A regular expression to match metric names to be set + + **base** *base* + | + | The base value of user data (uint64) + + **[incr** *incr*\ **]** + | + | Increment value (int). The default is 0. If incr is 0, the user + data of all matched metrics are set to the base value. Optional. + +Get the LDMS version the running LDMSD is based on. +--------------------------------------------------- + +**version** + +NOTES +========= + +- ldmsctl is currently kept for backwards compatibility purposes with + LDMS v2 commands. ldmsctl still works in version 3, however with + ldmsctl, some capabilitites use v2 pathways as opposed to v3. 
+ +- ldmsctl will be removed in a future release. It is not recommended + that you use this with v2. + +BUGS +======== + +No known bugs. + +EXAMPLES +============ + +1) Run ldmsctl + +:: + + $/tmp/opt/ovis/sbin/ldmsctl -h vm1_2 -p 10001 -x sock + ldmsctl> + +2) After starting ldmsctl, configure "meminfo" collector plugin to +collect every second. + +:: + + Note: interval=<# usec> e.g interval=1000000 defines a one second interval. + ldmsctl> load name=meminfo + ldmsctl> config name=meminfo component_id=1 set=vm1_1/meminfo + ldmsctl> start name=meminfo interval=1000000 + ldmsctl> quit + +3) Configure collectors on host "vm1" via bash script called collect.sh + +:: + + #!/bin/bash + # Configure "meminfo" collector plugin to collect every second (1000000 usec) on vm1_2 + echo "load name=meminfo" + echo "config name=meminfo component_id=2 set=vm1_2/meminfo" + echo "start name=meminfo interval=1000000" + # Configure "vmstat" collector plugin to collect every second (1000000 usec) on vm1_2 + echo "load name=vmstat" + echo "config name=vmstat component_id=2 set=vm1_2/vmstat" + echo "start name=vmstat interval=1000000" + + Make collect.sh executable + chmod +x collect.sh + + Execute collect.sh (Note: When executing this across many nodes you would use pdsh to execute the script on all nodes + in parallel) + > ldmsd -x sock:11111 -l ldmsd.log + > ldmsctl -x sock -p 11111 -h localhost -X collect.sh + +:: + +SEE ALSO +============ + +ldms_authentication(7), ldmsd(8), ldms_ls(8), ldmsd_controller(8), +ldms_quickstart(7) diff --git a/rtd/man2rst/ldmsd.rst b/rtd/man2rst/ldmsd.rst new file mode 100644 index 000000000..567b354aa --- /dev/null +++ b/rtd/man2rst/ldmsd.rst @@ -0,0 +1,391 @@ +===== +ldmsd +===== + +:Date: 28 Feb 2018 + +.. contents:: + :depth: 3 +.. + +NAME +====== + +ldmsd - Start an ldms daemon + +SYNOPSIS +========== + +ldmsd [OPTION...] + +DESCRIPTION +============= + +The ldmsd command can be used to start an ldms daemon. Plugin +configuration of the ldmsd can be done via the a configuration file or +the ldmsd_controller. + +Starting ldmsd with the configuration file option enables you to +statically configure a sampler without requiring python. Dynamically +configuring samplers with ldmsd_controller requires python. Currently, +v2's ldmsctl can still be used to dynamically configure a sampler +without requiring python. This capability will be replaced and it is not +recommended that you use this option. + +ENVIRONMENT +============= + +The ldmsd-check-env program will dump currently set environment variables that may influence ldmsd and plugin behavior. +----------------------------------------------------------------------------------------------------------------------- + +The following environment variables must often be set: + +LD_LIBRARY_PATH + Path to ovis/lib and libevent2/lib, if not in a system default path. + Depending on the system these may be lib64 instead of lib. + +PATH + Include the path to sbin directory containing ldmsd. + +The following environment variables may be set to override compiled-in defaults: +-------------------------------------------------------------------------------- + +ZAP_LIBPATH + Path to ovis/lib/ovis-ldms + +LDMSD_PLUGIN_LIBPATH + Path to ovis/lib/ovis-ldms + +LDMSD_PIDFILE + Full path name of pidfile overriding the default /var/run/ldmsd.pid + unless the command line argument "-r pidfilepath" is present. + +LDMSD_LOG_TIME_SEC + If present, log messages are stamped with the epoch time rather than + the date string. 
This is useful when sub-second information is + desired or correlating log messages with other epoch-stamped data. + +LDMSD_SOCKPATH + Path to the unix domain socket for the ldmsd. Default is created + within /var/run. If you must change the default (e.g., not running as + root and hence /var/run is not writeable), set this variable (e.g., + /tmp/run/ldmsd) or specify "-S socketpath" to ldmsd. + +LDMSD_MEM_SZ + The size of memory reserved for metric sets. Set this variable or + specify "-m" to ldmsd. See the -m option for further details. If both + are specified, the -m option takes precedence over this environment + variable. + +LDMSD_UPDTR_OFFSET_INCR + The increment to the offset hint in microseconds. This is only for + updaters that determine the update interval and offset automatically. + For example, the offset hint is 100000 which is 100 millisecond of + the second. The updater offset will be 100000 + + LDMSD_UPDTR_OFFSET_INCR. The default is 100000 (100 milliseconds). + +CRAY Specific Environment variables for ugni transport +------------------------------------------------------ + +ZAP_UGNI_PTAG For XE/XK, the PTag value as given by apstat -P. For XC, +The value does not matter but the environment variable must be set. + +ZAP_UGNI_COOKIE + For XE/XK, the Cookie value corresponding to the PTag value as given + by apstat -P For XC, the Cookie value (not Cookie2) as given by + apstat -P + +ZAP_UGNI_CQ_DEPTH + Optional value for the CQ depth. Default is 2048. + +ZAP_UGNI_STATE_INTERVAL + Optional. If set, then ldmsd will check all nodes' states via rca + interface. States for all nodes are checked and stored at intervals + determined by this environment variable. The stored values are + checked against before contacting a node. If you choose to use this + option, then the rule of thumb is to set ZAP_UGNI_STATE_INTERVAL and + ZAP_UGNI_STATE_OFFSET such that the node states are checked before + the metric set update occurs (see interval and offset in + ldmsd_controller) + +ZAP_UGNI_STATE_OFFSET + Optional. Only relevant if ZAP_UGNI_STATE_INTERVAL is set. Defaults + to zero. Offset from zero for checking the nodes state (see + ZAP_UGNI_STATE_INTERVAL, above). + +OPTIONS +========= + +General/Configuration Options: +------------------------------ + +**-F** + Run in foreground mode; don't daemonize the program. Default is + false. + +**-B, --banner** *version-file-mode [0, 1, 2]* + When run in daemon mode, controls the existence of the banner file. + Mode 0 suppresses the version file. Mode 1 deletes it at daemon exit. + Mode >= 2 leaves it in place for debugging after daemon exit. Default + mode is 1. The banner contains the software and protocol versions + information, which is also logged at the INFO level. The banner file + name is always the pidfile name with .version appended. + +**-c** *CONFIG_PATH* + The path to configuration file (optional, default: ). The + configuration file contains a batch of ldmsd controlling commands, + such as \`load\` for loading a plugin, and \`prdcr_add\` for defining + a ldmsd producer to aggregate from (see **ldmsd_controller**\ (8) for + a complete list of commands, or simply run **ldmsd_controller** then + **help**). The commands in the configuration file are executed + sequentially, except for **prdcr_start**, **updtr_start**, + **strgp_start**, and **failover_start** that will be deferred. If + **failover_start** is present, the failover service will start first + (among the deferred). 
Then, upon failover pairing success or failure, + the other deferred configuration objects will be started. Please also + note that while failover service is in use, prdcr, updtr, and strgp + cannot be altered (start, stop, or reconfigure) over in-band + configuration. See also REORDERED COMMANDS below. + +**-m, --set_memory** *MEMORY_SIZE* + | + | MEMORY_SIZE is the maximum size of pre-allocated memory for metric + sets. The given size must be less than 1 petabytes. For example, + 20M or 20mb are 20 megabytes. The default is adequate for most + ldmsd acting in the collector role. For aggregating ldmsd, a rough + estimate of preallocated memory needed is (Number of nodes + aggregated) x (Number of metric sets per node) x 4k. Data sets + containing arrays may require more. The estimate can be checked by + enabling DEBUG logging and examining the mm_stat bytes_used+holes + value at ldmsd exit. + +**-n, --daemon_name** *NAME* + | + | The name of the daemon. By default, it is "*HOSTNAME:PORT*". The + failover feature uses the daemon name to verify the buddy name, and + the producer name of kernel metric sets is the daemon name. + +**-r, --pid_file** *pid_file* + The path to the pid file and prefix of the .version banner file for + daemon mode. + +**-V** + Display LDMS version information and then exit. + +**-u** plugin_name + Display the usage for named plugin. Special names all, sampler, and + store match all, sampler type, and store type plugins, respectively. + +Communication Options: +---------------------- + +**-x** *XPRT:PORT:HOST* + | + | Specifies the transport type to listen on. May be specified more + than once for multiple transports. The XPRT string is one of + 'rdma', 'sock', or 'ugni' (CRAY XE/XK/XC). A transport specific + port number must be specified following a ':', e.g. rdma:10000. An + optional host or address may be specified after the port, e.g. + rdma:10000:node1-ib, to listen to a specific address. + +The listening transports can also be specified in the configuration file +using **listen** command, e.g. \`listen xprt=sock port=1234 +host=node1-ib\`. Please see **ldmsd_controller**\ (8) section **LISTEN +COMMAND SYNTAX** for more details. + +**-a, --default_auth** *AUTH* + Specify the default LDMS Authentication method for the LDMS + connections in this daemon (when the connections do not specify + authentication method/domain). Please see + **ldms_authentication**\ (7) for more information. If this option is + not given, the default is "none" (no authentication). Also see + **ldmsd_controller**\ (8) section **AUTHENTICATION COMMAND SYNTAX** + for how to define an authentication domain. + +**-A, --default_auth_args** *NAME*\ **=**\ *VALUE* + Passing the *NAME*\ =\ *VALUE* option to the LDMS Authentication + plugin. This command line option can be given multiple times. Please + see **ldms_authentication**\ (7) for more information, and consult + the plugin manual page for plugin-specific options. + +Log Verbosity Options: +---------------------- + +**-l, --log_file** *LOGFILE* + | + | LOGFILE is the path to the log file for status messages. Default is + stdout unless given. The syslog facility is used if LOGFILE is + exactly "syslog". Silence can be obtained by specifying /dev/null + for the log file or using command line redirection as illustrated + below. + +**-v, --log_level** *LOG_LEVEL* + | + | LOG_LEVEL can be one of DEBUG, INFO, ERROR, CRITICAL or QUIET. The + default level is ERROR. QUIET produces only user-requested output. 
+ (Note: this has changed from the previous release where q + designated no (QUIET) logging). + +**-t** + Truncate the log file if it already exists. + +**-L,**\ *--log_config* ** \| \| ** + | + | Append configuration replay messages or configuration debugging + messages to the log indicated by -l (when PATH is omitted) or to + the file named PATH. Bit values of CINT correspond to: + +:: + + 0: no messages + 1: debug messages from the generic 'request' handler + 2: config history messages in replayable format + 4: query history messages in replayable format + 8: failover debugging messages + 16: include delta time prefix when using PATH + 32: include epoch timestamp prefix when using PATH + +These values may be added together to enable multiple outputs. All +messages are logged at the user-requested level, LDMSD_LALL. CINT values +2, 26 and 27 are often interesting. When CINT is omitted, 1 is the +default. When PATH is used, the log messages are flushed to as they are +generated. + +Kernel Metric Options: +---------------------- + +**-k, --publish_kernel** + Publish kernel metrics. + +**-s, --kernel_set_file** *SETFILE* + Text file containing kernel metric sets to publish. Default: + /proc/sys/kldms/set_list + +Thread Options: +--------------- + +**-P, --worker_threads** *THR_COUNT* + | + | THR_COUNT is the number of event threads to start. + +SPECIFYING COMMAND-LINE OPTIONS IN CONFIGURATION FILES +======================================================== + +Users can use the 'option' command to specify some command-line options +in a configuration file. + + option + +Command-line options supported by the 'option' command and the corresponding attributes +--------------------------------------------------------------------------------------- + +**-a,**\ *--default_auth* + +**-A,**\ *--default_auth_args* + +**-B,**\ *--banner* + +**-k,**\ *--publish_kernel* + +**-l,**\ *--log_file* **PATH** + +**-m,**\ *--set_memory* + +**-n,**\ *--daemon_name* + +**-P,**\ *--worker_threads* + +**-r,**\ *--pid_file* + +**-s,**\ *--kernel_set_path* + +**-v,**\ *--log_level* + +**-L,**\ *--log_config* **** + +Specifying the listen endpoints in configuraton files +----------------------------------------------------- + +Users can use the 'listen' command to define the listen endpoints. For example, + listen xprt=sock port=411 + +Example +------- + +> cat ldmsd.conf + +:: + + # cmd-line options + option --log_file /opt/ovis/var/ldmsd.log --log_level ERROR + option -m 2GB -P 16 + option -a munge + listen xprt=ugni port=411 + # meminfo + load name=meminfo + config name=meminfo producer=nid0001 instance=nid0001/meminfo + start name=meminfo interval=1000000 offset=0 + +RUNNING LDMSD ON CRAY XE/XK/XC SYSTEMS USING APRUN +==================================================== + +ldsmd can be run as either a user or as root using the appropriate PTag +and cookie. + +Check (or set) the PTag and cookie. + + Cray XE/XK Systems: + + :: + + > apstat -P + PDomainID Type Uid PTag Cookie + LDMS system 0 84 0xa9380000 + foo user 22398 243 0x2bb0000 + + Cray XC Systems: + > apstat -P + PDomainID Type Uid Cookie Cookie2 + LDMS system 0 0x86b80000 0 + foo user 20596 0x86bb0000 0x86bc0000 + + Set the environment variables ZAP_UGNI_PTAG and ZAP_UGNI_COOKIE with + the appropriate ptag and cookie. + + Run ldmsd directly or as part of a script launched from aprun. In + either case, Use aprun with the correct -p when running. 
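For instance, using the user pdomain 'foo' from the XC table above (node
count, port, and paths illustrative):

::

   export ZAP_UGNI_PTAG=0
   export ZAP_UGNI_COOKIE=0x86bb0000
   aprun -N 1 -n 4 -p foo /opt/ovis/sbin/ldmsd -x ugni:60020 -F -l /tmp/ldmsd.log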
REORDERED COMMANDS
====================

Certain commands are reordered when processing input scripts specified
with -c. Items related to failover are handled as described in the '-c'
section above. Other commands are promoted to run before any
non-promoted commands from the loaded script. In particular, env,
loglevel, listen, auth, and option are promoted.

NOTES
=======

OCM flags are unsupported at this time.

BUGS
======

None known.

EXAMPLES
==========

::

   $/tmp/opt/ovis/sbin/ldmsd -x sock:60000 -p unix:/var/run/ldmsd/metric_socket -l /tmp/opt/ovis/logs/1

   $/tmp/opt/ovis/sbin/ldmsd -x sock:60000 -p sock:61000 -p unix:/var/run/ldmsd/metric_socket

SEE ALSO
==========

ldms_authentication(7), ldmsctl(8), ldms_ls(8), ldmsd_controller(8),
ldms_quickstart(7) diff --git a/rtd/man2rst/ldmsd_controller.rst b/rtd/man2rst/ldmsd_controller.rst new file mode 100644 index 000000000..8451c0775 --- /dev/null +++ b/rtd/man2rst/ldmsd_controller.rst @@ -0,0 +1,866 @@
================
ldmsd_controller
================

:Date: 19 Nov 2019

.. contents::
   :depth: 3
..

NAME
=================

ldmsd_controller - a python program to configure an ldms daemon.

SYNOPSIS
=====================

**ldmsd_controller** [OPTIONS]

ldmsd_controller> <cmd> [ <attr>=<value> ]

DESCRIPTION
========================

With LDMS (Lightweight Distributed Metric Service), the ldmsd can be
configured via the ldmsd_controller.

If ldms is built with --enable-readline, one can invoke
ldmsd_controller from the command line and obtain an input interface
with feedback. In many instances, however, it is preferred to execute
scripts and send the output commands to an ldmsd instead.

ENVIRONMENT
========================

Note: python2.6 with the additional installation of the argparse module,
OR python2.7 (which has the argparse module), is required.

PYTHONPATH
   <ovis-install-path>/lib[64]/pythonX.Y/site-packages/

PATH
   <ovis-install-path>/bin

LDMSD_CONTROLLER OPTIONS
=====================================

**-h,--host** *HOST*
   Hostname of **ldmsd** to connect to

**-p,--port** *PORT*
   The port of **ldmsd** to connect to

**-x,--xprt** *XPRT*
   The transport type (**sock**, **rdma**, **ugni**).

**-a,--auth** *AUTH*
   The LDMS authentication plugin. Please see
   **ldms_authentication**\ (7) for more information.

**-A,--auth-arg** *NAME=VALUE*
   Pass the *NAME*\ =\ *VALUE* option to the LDMS Authentication
   plugin. This command line option can be given multiple times. Please
   see **ldms_authentication**\ (7) for more information, and consult
   the plugin manual page for plugin-specific options.

**--source** *SOURCE*
   |
   | Path to the config file

**--script** *SCRIPT*
   |
   | Execute the script and send the output commands to the connected
     ldmsd

**-?**
   Display help

**--help**
   Display help

REGULAR EXPRESSION
===============================

The regular expression specified in the *regex=* option of the commands
is a POSIX Extended (modern) Regular Expression. In short, "\*+?{}|^$."
are special regular expression characters. Please see **regex(7)** for
more information.
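Because these are POSIX EREs, anchoring avoids accidental substring
matches; for example, with the producer commands described below, the
following matches nid-style producer names exactly:

::

   prdcr_start_regex regex=^nid0[0-9]+$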
+ +PLUGIN COMMAND SYNTAX +================================== + +Load a plugin +------------- + +| **load** attr= + + **name** *name* + | + | The plugin name + +List the usage of the loaded plugins +------------------------------------ + +**usage** + +unload a plugin +--------------- + +| **term** attr= + + **name** *name* + | + | The plugin name + +Send a configuration command to the specified plugin. +----------------------------------------------------- + +**config** attr= + + **name** *name* + | + | The plugin name + + **attr=value** + | + | Plugin specific attr=value tuples + + **Attributes specific for sampler plugins (Some sampler plugins + may have additional** attributes) + + **producer** *producer* + | + | A unique name for the host providing the data + + **instance** *instance* + | + | The set instance name. The name must be unique among all + metric sets in all LDMS daemons. + + **[component_id** *component_id*\ **]** + | + | A unique number for the comopnent being monitored. The + default is zero. + + **[schema** *schema*\ **]** + | + | The name of the metric set schema. + + **[job_set** *job_set*\ **]** + | + | The set instance name of the set containing the job data. The + default is 'job_info'. + + **[uid** *uid*\ **]** + | + | The user id of the set's owner. The default is the returned + value of geteuid(). + + **[gid** *gid*\ **]** + | + | The group id of the set's owner. The default is the returned + value of getegid(). + + **[perm** *perm*\ **]** + | + | The sampler plugin instance access permission. The default is + 0440. + +Start a sampler plugin +---------------------- + +**start** attr= + + **name** *name* + | + | The plugin name. + + **interval** *interval* + | + | The sample interval in microseconds. + + **[offset** *offset*\ **]** + | + | Offset (shift) from the sample mark in microseconds. Offset can + be positive or negative with magnitude up to 1/2 the sample + interval. If this offset is specified, including 0, collection + will be synchronous; if the offset is not specified, collection + will be asynchronous. Optional. + +Stop a sampler plugin +--------------------- + +**stop** attr= + + **name** *name* + | + | The plugin name. + +AUTHENTICATION COMMAND SYNTAX +========================================== + +Add an authentication domain +---------------------------- + +**auth_add** **name**\ =\ *NAME* **plugin**\ =\ *PLUGIN* [ ... *PLUGIN +ATTRIBUTES* ... ] + + **name**\ =\ *NAME* + | + | The name of the authentication domain. This is the name referred + to by **listen** and **prdcr_add** commands. + + **plugin**\ =\ *none*\ \|\ *ovis*\ \|\ *munge* + | + | The LDMS Authentication Plugin for this domain. + + [ ... *PLUGIN ATTRIBUTES* ... ] + | + | Arbitrary plugin attributes. Please consult the manual of the + authentication plugin for more information. + +LISTEN COMMAND SYNTAX +================================== + +Instruct ldmsd to listen to a port +---------------------------------- + +**listen** **port**\ =\ *PORT* +**xprt**\ =\ *sock*\ \|\ *rdma*\ \|\ *ugni*\ \|\ *fabric* +[**host**\ =\ *HOST*] [**auth**\ =\ *AUTH_REF*] + + **port**\ =\ *PORT* + | + | The port to listen to. Also, please be sure not to use ephemeral + port (ports in the range of + **/proc/sys/net/ip4/ip_local_port_range**). + + **xprt**\ =\ *sock*\ \|\ *rdma*\ \|\ *ugni*\ \|\ *fabric* + | + | The type of the transport. + + **host**\ =\ *HOST* + | + | An optional hostname or IP address to bind. If not given, listen + to all addresses (0.0.0.0 or PORT). 
+ + **auth**\ =\ *AUTH_REF* + | + | Instruct **ldmsd** to use *AUTH_REF* (a name reference to + **auth** object created by **auth_add** command) to authenticate + connections on this port. If not given, the port uses the + default authentication method specified on the CLI options (see + **ldmsd**\ (8) option **-a**). + +PRODUCER COMMAND SYNTAX +==================================== + +Add a producer to the aggregator +-------------------------------- + +| **prdcr_add** attr= + + **name** *name* + | + | The producer name. The producer name must be unique in an + aggregator. It is independent of any attributes specified for + the metric sets or hosts. + + **xprt** *xprt* + | + | The transport name [sock, rdma, ugni] + + **host** *host* + | + | The hostname of the host + + **type** *conn_type* + | + | The connection type [active, passive] + + **interval** *interval* + | + | The connection retry interval + + **[perm** *permission*\ **]** + | + | The permission to modify the producer in the future + + **[auth** *AUTH_REF*\ **]** + | + | Instruct **ldmsd** to use *AUTH_REF* (a name reference to + **auth** object created by **auth_add** command) with the + connections to this producer. If not given, the default + authentication method specified on the CLI options (see + **ldmsd**\ (8) option **-a**) is used. + +Delete a producer from the aggregator +------------------------------------- + +| The producer cannot be in use or running +| **prdcr_del** attr= + + **name** *name* + | + | The producer name + +Start a producer +---------------- + +**prdcr_start** attr= + + **name** *name* + | + | The producer name + + **[interval** *interval*\ **]** + | + | The connection retry interval in microsec. If unspecified, the + previously configured value will be used. Optional. + +Start all producers matching a regular expression +------------------------------------------------- + +**prdcr_start_regex** attr= + + **regex** *regex* + | + | A regular expression + + **[interval** *interval*\ **]** + | + | The connection retry interval in microsec. If unspecified, the + previously configured value will be used. Optional. + +Stop a producer +--------------- + +**prdcr_stop** attr= + + **name** *name* + | + | The producer name + +Stop all producers matching a regular expression +------------------------------------------------ + +**prdcr_stop_regex** attr= + + **regex** *regex* + | + | A regular expression + +Query producer status +--------------------- + +**prdcr_status** attr= + + **[name** *name*\ **]** + | + | The producer name. If none is given, the statuses of all + producers are reported. + +Subscribe for stream data from all matching producers +----------------------------------------------------- + +**prdcr_subsribe** + + **regex** *regex* + | + | The regular expression matching producer name + + **stream** *stream* + | + | The stream name + +UPDATER COMMAND SYNTAX +=================================== + +Add an updater process that will periodically sample producer metric sets +------------------------------------------------------------------------- + +**updtr_add** attr= + + **name** *name* + | + | The update policy name. The policy name should be unique. It is + independent of any attributes specified for the metric sets or + hosts. + + **interval** *interval* + | + | The update/collect interval + + **[offset** *offset*\ **]** + | + | Offset for synchronized aggregation. Optional. + + **[push** *onchange|true*\ **]** + | + | Push mode: 'onchange' and 'true'. 
'onchange' means the Updater + will get an update whenever the set source ends a transaction or + pushes the update. 'true' means the Updater will receive an + update only when the set source pushes the update. If \`push\` + is used, \`auto_interval\` cannot be \`true\`. + + **[auto_interval** *true|false* **]** + If true, the updater will schedule set updates according to the + update hint. The sets with no hints will not be updated. If false, + the updater will schedule the set updates according to the given + interval and offset values. If not specified, the value is + *false*. + + **[perm** *permission*\ **]** + | + | The permission to modify the updater in the future + +Remove an updater from the configuration +---------------------------------------- + +**updtr_del** attr= + + **name** *name* + | + | The update policy name + +Add a match condition that specifies the sets to update. +-------------------------------------------------------- + +**updtr_match_add** attr= + + **name** *name* + | + | The update policy name + + **regex** *regex* + | + | The regular expression + + **match** *match (inst|schema)* + | + | The value with which to compare; if match=inst, the expression + will match the set's instance name, if match=schema, the + expression will match the set's schema name. + +Remove a match condition from the Updater. +------------------------------------------ + +**updtr_match_del** attr= + + **name** *name* + | + | The update policy name + + **regex** *regex* + | + | The regular expression + + **match** *match (inst|schema)* + | + | The value with which to compare; if match=inst, the expression + will match the set's instance name, if match=schema, the + expression will match the set's schema name. + +Add matching producers to an updater policy +------------------------------------------- + +This is required before starting the updater. + +**updtr_prdcr_add** attr= + + **name** *name* + | + | The update policy name + + **regex** *regex* + | + | A regular expression matching zero or more producers + +Remove matching producers to an updater policy +---------------------------------------------- + +**updtr_prdcr_del** attr= + + **name** *name* + | + | The update policy name + + **regex** *regex* + | + | A regular expression matching zero or more producers + +Start updaters. +--------------- + +**updtr_start** attr= + + **name** *name* + | + | The update policy name + + **[interval** *interval*\ **]** + | + | The update interval in micro-seconds. If this is not specified, + the previously configured value will be used. Optional. + + **[offset** *offset*\ **]** + | + | Offset for synchronized aggregation. Optional. + +Stop an updater. +---------------- + +The Updater must be stopped in order to change it's configuration. + +**updtr_stop** attr= + + **name** *name* + | + | The update policy name + +Query the updater status +------------------------ + +**updtr_status** attr= + + **[name** *name*\ **]** + | + | The updater name. If none is given, the statuses of all updaters + are reported. + +STORE COMMAND SYNTAX +================================= + +Create a Storage Policy and open/create the storage instance. +------------------------------------------------------------- + +**strgp_add** attr= + + **name** *name* + | + | The unique storage policy name. + + **plugin** *plugin* + | + | The name of the storage backend. + + **container** *container* + | + | The storage backend container name. + + **schema** *schema* + | + | The schema name of the metric set to store. 
+ + **[perm** *permission*\ **]** + | + | The permission to modify the storage in the future + +Remove a Storage Policy +----------------------- + +| All updaters must be stopped in order for a storage policy to be + deleted +| **strgp_del** attr= + + **name** *name* + | + | The storage policy name + +Add a regular expression used to identify the producers this storage policy will apply to. +------------------------------------------------------------------------------------------ + +| If no producers are added to the storage policy, the storage policy + will apply on all producers. +| **strgp_prdcr_add** attr= + + **name** *name* + | + | The storage policy name + + **regex** *name* + | + | A regular expression matching metric set producers. + +Remove a regular expression from the producer match list +-------------------------------------------------------- + +**strgp_prdcr_del** attr= + + | **name** *name* + | The storage policy name + + **regex** *regex* + | + | The regex of the producer to remove. + +Add the name of a metric to store +--------------------------------- + +**strgp_metric_add** attr= + + | **name** *name* + | The storage policy name + + **metric** *metric* + | + | The metric name. If the metric list is NULL, all metrics in the + metric set will be stored. + +Remove a metric from the set of stored metrics. +----------------------------------------------- + +**strgp_metric_del** attr= + + | **name** *name* + | The storage policy name + + **metric** *metric* + | + | The metric to remove + +Start a storage policy. +----------------------- + +**strgp_start** attr= + + | **name** *name* + | The storage policy name + +Stop a storage policy. +---------------------- + +A storage policy must be stopped in order to change its configuration. + +**strgp_stop** attr= + + | **name** *name* + | The storage policy name + +Query the storage policy status +------------------------------- + +**strgp_status** attr= + + **[name** *name*\ **]** + | + | The storage policy name. If none is given, the statuses of all + storage policies are reported. + +FAILOVER COMMAND SYNTAX +==================================== + +Please see **ldmsd_failover**\ (7). + +SETGROUP COMMAND SYNTAX +==================================== + +Please see **ldmsd_setgroup**\ (7). + +STREAM COMMAND SYNTAX +================================== + +Publish data to the named stream +-------------------------------- + +**plublish** attr= + + **name** *name* + | + | The stream name + + **data** *data* + | + | The data to publish + +Subscribe to a stream +--------------------- + +**subscribe** attr= + + **name** *name* + | + | The stream name + +LDMS DAEMON COMMAND SYNTAX +======================================= + +Changing the verbosity level of ldmsd +------------------------------------- + +**loglevel** attr= + + | **level** *level* + | Verbosity levels [DEBUG, INFO, ERROR, CRITICAL, QUIET] + +Exit the connected LDMS daemon gracefully +----------------------------------------- + +**daemon_exit** + +Query the connected LDMS daemon status +-------------------------------------- + +**daemon_status** + +Tell the daemon to dump it's internal state to the log file. +------------------------------------------------------------ + +**status** [name=] + + | **[**\ *type]* + | Reports only the specified objects. The choices are prdcr, updtr + and strgp. + + | prdcr: list the state of all producers. + | updtr: list the state of all update policies. + | strgp: list the state of all storage policies. 
+
+MISC COMMAND SYNTAX
+================================
+
+Display the list of available commands
+--------------------------------------
+
+|
+| **help**
+
+   | [*command*]
+   | If a command is given, the help of the command will be printed.
+     Otherwise, only the available command names are printed.
+
+Set the user data value for a metric in a metric set.
+-----------------------------------------------------
+
+|
+| **udata** attr=
+
+   **set** *set*
+   |
+   | The metric set name
+
+   **metric** *metric*
+   |
+   | The metric name
+
+   **udata** *udata*
+   |
+   | The desired user-data. This is a 64-bit unsigned integer.
+
+Set the user data of multiple metrics using regular expression.
+---------------------------------------------------------------
+
+| The user data of the first matched metric is set to the base value.
+  The base value is then incremented by the given 'incr' value and
+  assigned to the next matched metric, and so on.
+| **udata_regex** attr=
+
+   **set** *set*
+   |
+   | The metric set name.
+
+   **regex** *regex*
+   |
+   | A regular expression to match metric names to be set
+
+   **base** *base*
+   |
+   | The base value of user data (uint64)
+
+   **[incr** *incr*\ **]**
+   |
+   | Increment value (int). The default is 0. If incr is 0, the user
+     data of all matched metrics are set to the base value. Optional.
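+
+For example, the following (the set and metric names are hypothetical)
+assigns user data 10, 11, 12, ... to the metrics of ``node1/meminfo``
+whose names begin with ``Active``:
+
+::
+
+   udata_regex set=node1/meminfo regex=^Active base=10 incr=1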
+
+Get the LDMS version the running LDMSD is based on.
+---------------------------------------------------
+
+**version**
+
+Launch a subshell to do arbitrary commands
+------------------------------------------
+
+**!**\ shell-command
+
+Comment (a skipped line)
+------------------------
+
+**#**\ comment-string
+
+BUGS
+=================
+
+No known bugs.
+
+EXAMPLES
+=====================
+
+Example of a script to add producers to updaters
+------------------------------------------------
+
+::
+
+   > more add_prdcr.sh
+   #!/bin/bash
+
+   SOCKDIR=/XXX/run/ldmsd
+   portbase=61100
+   port1=`expr $portbase + 1`
+   port2=`expr $portbase + 2`
+   port3=`expr $portbase + 3`
+
+   echo "prdcr_add name=localhost2 host=localhost type=active xprt=sock port=$port2 interval=20000000"
+   echo "prdcr_start name=localhost2"
+   echo "prdcr_add name=localhost1 host=localhost type=active xprt=sock port=$port1 interval=20000000"
+   echo "prdcr_start name=localhost1"
+   echo "updtr_add name=policy5_h1 interval=2000000 offset=0"
+   echo "updtr_prdcr_add name=policy5_h1 regex=localhost1"
+   echo "updtr_start name=policy5_h1"
+   echo "updtr_add name=policy5_h2 interval=5000000 offset=0"
+   echo "updtr_prdcr_add name=policy5_h2 regex=localhost2"
+   echo "updtr_start name=policy5_h2"
+
+Example of a script to add and start stores
+-------------------------------------------
+
+::
+
+   > more add_store.sh
+   #!/bin/bash
+
+   # whole path must exist
+   STORE_PATH=/XXX/ldmstest/store
+   mkdir -p $STORE_PATH
+   sleep 1
+
+   # CSV
+   echo "load name=store_csv"
+   echo "config name=store_csv path=$STORE_PATH action=init altheader=0 rollover=30 rolltype=1"
+   echo "config name=store_csv action=custom container=csv schema=cray_aries_r altheader=1 userdata=0"
+
+   echo "strgp_add name=policy_mem plugin=store_csv container=csv schema=meminfo"
+   echo "strgp_start name=policy_mem"
+
+   #echo "strgp_add name=csv_memfoo_policy plugin=store_csv container=meminfo schema=meminfo_foo"
+   #echo "strgp_prdcr_add name=csv_memfoo_policy regex=localhost*"
+   #echo "strgp_start name=csv_memfoo_policy"
+
+Example to start an ldmsd and use ldmsd_controller to call a script
+-------------------------------------------------------------------
+
+::
+
+   > ldmsd -x sock:11111 -l log.txt
+   > ldmsd_controller --host localhost --port 11111 --xprt sock --script myscript.sh
+
+SEE ALSO
+=====================
+
+ldmsd(8), ldmsctl(8), ldms_quickstart(7), ldmsd_failover(7),
+ldmsd_setgroup(7)
diff --git a/rtd/man2rst/ldmsd_decomposition.rst b/rtd/man2rst/ldmsd_decomposition.rst
new file mode 100644
index 000000000..0204971f1
--- /dev/null
+++ b/rtd/man2rst/ldmsd_decomposition.rst
@@ -0,0 +1,317 @@
+===================
+ldmsd_decomposition
+===================
+
+:Date: 2 Jun 2022
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+====================
+
+ldmsd_decomposition - manual for LDMSD decomposition
+
+DESCRIPTION
+===========================
+
+A decomposition is a routine that converts an LDMS set into one or more
+rows before feeding them to the store. Currently, only **store_sos**,
+**store_csv**, and **store_kafka** support decomposition. To use
+decomposition, simply specify the
+**decomposition=**\ *DECOMP_CONFIG_JSON_FILE* option in the
+**strgp_add** command. There are three types of decompositions:
+**static**, **as_is**, and **flex**. A **static** decomposition
+statically and strictly decomposes an LDMS set according to the
+definitions in the *DECOMP_CONFIG_JSON_FILE*. An **as_is** decomposition
+on the other hand takes all metrics and converts them as-is into rows.
+A **flex** decomposition applies various decompositions by LDMS schema
+digest mapping from the configuration.
+
+Please see sections **STATIC DECOMPOSITION**, **AS_IS DECOMPOSITION**,
+and **FLEX DECOMPOSITION** for more information.
+
+More decomposition types may be added in the future. The decomposition
+mechanism is pluggable. Please see the **as_is**, **static**, and
+**flex** decomposition implementations in the \`ldms/src/decomp/\`
+directory in the source tree for more information.
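+
+As a concrete illustration of the **strgp_add** usage mentioned above
+(the policy name and JSON file path are hypothetical):
+
+::
+
+   strgp_add name=meminfo_store plugin=store_sos container=ldms_data \
+             schema=meminfo decomposition=/etc/ldms/meminfo_decomp.json
+   strgp_start name=meminfo_store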
+
+STATIC DECOMPOSITION
+====================================
+
+The **static** decomposition statically and strictly converts an LDMS
+set to one or more rows according to the *DECOMP_CONFIG_JSON_FILE*. The
+format of the JSON configuration file is as follows:
+
+::
+
+   {
+     "type": "static",
+     "rows": [
+       {
+         "schema": "OUTPUT_ROW_SCHEMA",
+         "cols": [
+           { "src":"LDMS_METRIC_NAME", "dst":"OUTPUT_COL_NAME", "type":"TYPE",
+             "array_len": ARRAY_LEN_IF_TYPE_IS_ARRAY,
+             "rec_member": "REC_MEMBER_NAME_IF_SRC_IS_RECORD",
+             "fill": "FILL_VALUE"
+           },
+           ...
+         ],
+         "indices": [
+           { "name":"INDEX_NAME", "cols":[ OUTPUT_COLUMNS, ... ] },
+           ...
+         ]
+       },
+       ...
+     ]
+   }
+
+The "rows" attribute is an array of row definition objects, each of
+which defines an output row. The "schema" attribute specifies the output
+schema name, which is the schema name used by the storage plugin to
+identify the row schema. Each row definition contains "cols", a list of
+column definitions, and "indices", a list of index definitions. Each
+column definition is an object with at least "src" describing the metric
+name, "dst" describing the output column name, and "type" describing the
+value type of the column. If the type is an array, "array_len" is
+required. If the "src" is a list of records, "rec_member" is required to
+specify the record member for the output column. The "fill" value is
+used to fill in the output column in the case that the "src" metric is
+not present in the LDMS set (e.g. the DirectMap metrics in meminfo,
+which vary by CPU architecture).
+
+Each index definition object contains "name" (the name of the index) and
+"cols", the names of the output columns comprising the index.
+
+The **"timestamp"**, **"producer"**, and **"instance"** are special
+"src" values that refer to the update timestamp, the producer name, and
+the instance name of the set, respectively.
+
+The following is an example of a static decomposition definition
+converting a meminfo set into two schemas, "meminfo_filter" (selecting a
+few metrics) and "meminfo_directmap" (selecting a few direct map metrics
+with "fill" since DirectMap varies by CPU architecture).
+
+::
+
+   {
+     "type": "static",
+     "rows": [
+       {
+         "schema": "meminfo_filter",
+         "cols": [
+           { "src":"timestamp", "dst":"ts", "type":"ts" },
+           { "src":"producer", "dst":"prdcr", "type":"char_array", "array_len":64 },
+           { "src":"instance", "dst":"inst", "type":"char_array", "array_len":64 },
+           { "src":"component_id", "dst":"comp_id", "type":"u64" },
+           { "src":"MemFree", "dst":"free", "type":"u64" },
+           { "src":"MemActive", "dst":"active", "type":"u64" }
+         ],
+         "indices": [
+           { "name":"time_comp", "cols":["ts", "comp_id"] },
+           { "name":"time", "cols":["ts"] }
+         ]
+       },
+       {
+         "schema": "meminfo_directmap",
+         "cols": [
+           { "src":"timestamp", "dst":"ts", "type":"ts" },
+           { "src":"component_id", "dst":"comp_id", "type":"u64" },
+           { "src":"DirectMap4k", "dst":"directmap4k", "type":"u64", "fill": 0 },
+           { "src":"DirectMap2M", "dst":"directmap2M", "type":"u64", "fill": 0 },
+           { "src":"DirectMap4M", "dst":"directmap4M", "type":"u64", "fill": 0 },
+           { "src":"DirectMap1G", "dst":"directmap1G", "type":"u64", "fill": 0 }
+         ],
+         "indices": [
+           { "name":"time_comp", "cols":["ts", "comp_id"] },
+           { "name":"time", "cols":["ts"] }
+         ]
+       }
+     ]
+   }
+
+The following is an example of a static decomposition with "rec_member"
+usage.
+
+::
+
+   {
+     "type": "static",
+     "rows": [
+       {
+         "schema": "netdev2_small",
+         "cols": [
+           { "src":"timestamp", "dst":"ts", "type":"ts" },
+           { "src":"producer", "dst":"prdcr", "type":"char_array", "array_len":64 },
+           { "src":"instance", "dst":"inst", "type":"char_array", "array_len":64 },
+           { "src":"component_id", "dst":"comp_id", "type":"u64" },
+           { "src":"netdev_list", "rec_member":"name",
+             "dst":"netdev.name", "type":"char_array", "array_len":16 },
+           { "src":"netdev_list", "rec_member":"rx_bytes",
+             "dst":"netdev.rx_bytes", "type":"u64" },
+           { "src":"netdev_list", "rec_member":"tx_bytes",
+             "dst":"netdev.tx_bytes", "type":"u64" }
+         ],
+         "indices": [
+           { "name":"time_comp", "cols":["ts", "comp_id"] },
+           { "name":"time", "cols":["ts"] }
+         ]
+       }
+     ]
+   }
+
+In this case, if the "netdev_list" has N members, the decomposition will
+expand the set into N rows.
+
+AS_IS DECOMPOSITION
+===================================
+
+The **as_is** decomposition generates rows as-is according to the
+metrics in the LDMS set. To avoid schema conflicts, such as meminfo
+collected from heterogeneous CPU architectures, the **as_is**
+decomposition appends the short LDMS schema digest (7 characters) to the
+row schema name before submitting the rows to the storage plugin. For
+example, the "meminfo" LDMS schema may turn into the "meminfo_8d2b8bd"
+row schema. The **as_is** decomposition configuration only takes an
+"indices" attribute, which defines indices for the output rows. When
+encountering a list of primitives, the as_is decomposition expands the
+set into multiple rows (the non-list metrics' values are repeated). When
+encountering a list of records, in addition to expanding rows, the
+decomposition also expands the record into multiple columns with the
+name formatted as "LIST_NAME.REC_MEMBER_NAME". The "timestamp" is not a
+metric in the set but it is used in all storage plugins. So, the
+"timestamp" column is prepended to each of the output rows.
+
+The format of the JSON configuration is as follows:
+
+::
+
+   {
+     "type": "as_is",
+     "indices": [
+       { "name": "INDEX_NAME", "cols": [ COLUMN_NAMES, ... ] },
+       ...
+     ]
+   }
+
+The following is an **as_is** decomposition configuration example with
+two indices:
+
+::
+
+   {
+     "type": "as_is",
+     "indices": [
+       { "name": "time", "cols": [ "timestamp" ] },
+       { "name": "time_comp", "cols": [ "timestamp", "component_id" ] }
+     ]
+   }
+
+FLEX DECOMPOSITION
+==================================
+
+The **flex** decomposition applies various decompositions by the LDMS
+schema digests specified in the configuration. The configurations of the
+applied decompositions are also specified in the **flex** decomposition
+file as follows (the DECOMP and DIGEST names are placeholders):
+
+::
+
+   {
+     "type": "flex",
+     /* defining decompositions to be applied */
+     "decomposition": {
+       "DECOMP_1": {
+         "type": "DECOMP_1_TYPE",
+         ...
+       },
+       ...
+     },
+     /* specifying digests and the decompositions to apply */
+     "digest": {
+       "DIGEST_1": "DECOMP_1",
+       "DIGEST_2": [ "DECOMP_2", "DECOMP_3" ],
+       ...
+       "*": "DECOMP_DEFAULT" /* optional : the unmatched */
+     }
+   }
+
+**Example:** In the following example, the "meminfo" LDMS sets have 2
+digests due to different metrics from different architectures. The
+configuration then maps those digests to the "meminfo" static
+decomposition (producing "meminfo_filter" rows). It also showcases the
+ability to apply multiple decompositions to a matching digest. The
+procnetdev2 sets with digest
+"E8B9CC8D83FB4E5B779071E801CA351B69DCB9E9CE2601A0B127A2977F11C62A" will
+have the "netdev2" static decomposition and "the_default" as-is
+decomposition applied to them. The sets that do not match any specific
+digest will match the "\*" digest. In this example, "the_default" as-is
+decomposition is applied.
+
+::
+
+   {
+     "type": "flex",
+     "decomposition": {
+       "meminfo": {
+         "type": "static",
+         "rows": [
+           {
+             "schema": "meminfo_filter",
+             "cols": [
+               { "src":"timestamp", "dst":"ts", "type":"ts" },
+               { "src":"producer", "dst":"prdcr", "type":"char_array", "array_len":64 },
+               { "src":"instance", "dst":"inst", "type":"char_array", "array_len":64 },
+               { "src":"component_id", "dst":"comp_id", "type":"u64" },
+               { "src":"MemFree", "dst":"free", "type":"u64" },
+               { "src":"MemActive", "dst":"active", "type":"u64" }
+             ],
+             "indices": [
+               { "name":"time_comp", "cols":["ts", "comp_id"] },
+               { "name":"time", "cols":["ts"] }
+             ]
+           }
+         ]
+       },
+       "netdev2" : {
+         "type" : "static",
+         "rows": [
+           {
+             "schema": "procnetdev2",
+             "cols": [
+               { "src":"timestamp", "dst":"ts", "type":"ts" },
+               { "src":"component_id", "dst":"comp_id", "type":"u64" },
+               { "src":"netdev_list", "rec_member":"name", "dst":"dev.name",
+                 "type":"char_array", "array_len": 16 },
+               { "src":"netdev_list", "rec_member":"rx_bytes", "dst":"dev.rx_bytes",
+                 "type":"u64" },
+               { "src":"netdev_list", "rec_member":"tx_bytes", "dst":"dev.tx_bytes",
+                 "type":"u64" }
+             ],
+             "indices": [
+               { "name":"time_comp", "cols":["ts", "comp_id"] }
+             ]
+           }
+         ]
+       },
+       "the_default": {
+         "type": "as_is",
+         "indices": [
+           { "name": "time", "cols": [ "timestamp" ] },
+           { "name": "time_comp", "cols": [ "timestamp", "component_id" ] }
+         ]
+       }
+     },
+     "digest": {
+       "71B03E47E7C9033E359DB5225BC6314A589D8772F4BC0866B6E79A698C8799C0": "meminfo",
+       "59DD05D768CFF8F175496848486275822A6A9795286FD9B534FDB9434EAF4D50": "meminfo",
+       "E8B9CC8D83FB4E5B779071E801CA351B69DCB9E9CE2601A0B127A2977F11C62A": [ "netdev2", "the_default" ],
+       "*": "the_default"
+     }
+   }
+
+SEE ALSO
+========================
+
+Plugin_store_sos(7), Plugin_store_csv(7), Plugin_store_kafka(7)
diff --git a/rtd/man2rst/ldmsd_failover.rst b/rtd/man2rst/ldmsd_failover.rst
new file mode 100644
index 000000000..650c08a3d
--- /dev/null
+++ b/rtd/man2rst/ldmsd_failover.rst
@@ -0,0 +1,289 @@
+==============
+ldmsd_failover
+==============
+
+:Date: 13 Aug 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+===============
+
+ldmsd_failover - explanation, configuration, and commands for ldmsd
+failover
+
+SYNOPSIS
+===================
+
+failover_config
+   host=\ *HOST* port=\ *PORT* xprt=\ *XPRT* [peer_name=\ *NAME*]
+   [interval=\ *USEC*] [timeout_factor=\ *FLOAT*] [auto_switch=\ *0|1*]
+
+failover_start
+
+failover_stop
+
+failover_status
+
+failover_peercfg_start
+
+failover_peercfg_stop
+
+DESCRIPTION
+======================
+
+**ldmsd** can be configured to form a failover pair with another
+**ldmsd**. In a nutshell, when a failover pair is formed, the two
+daemons exchange their updater and producer configurations so that when
+one goes down, the other will take over the LDMS set aggregation load
+(**failover**).
+
+A **ping-echo** mechanism is used to detect service unavailability.
+Each ldmsd in the pair sends ping requests to the other, and the peer
+echoes back along with its status. When the echo has not been received
+within the timeout period (see below), the peer configuration is
+automatically started (failover).
+
+The following paragraphs explain the ldmsd configuration commands
+relating to the ldmsd failover feature.
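+
+As a quick orientation before the per-command details, a minimal
+configuration on one peer might look like the following (the hostname,
+port, and peer name are illustrative; a complete multi-daemon setup
+appears in EXAMPLES below):
+
+::
+
+   failover_config host=peer.hostname port=411 xprt=sock \
+                   peer_name=peer interval=1000000
+   failover_start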
+
+**failover_config** configures the failover feature in an ldmsd. The
+failover service must be stopped before configuring it. The following
+list describes the command parameters.
+
+   host=HOST
+      The hostname of the failover partner. This is optional in
+      re-configuration.
+
+   port=PORT
+      The LDMS port of the failover partner. This is optional in
+      re-configuration.
+
+   xprt=XPRT
+      The LDMS transport type (sock, rdma, or ugni) of the failover
+      partner. This is optional in re-configuration.
+
+   peer_name=NAME
+      (Optional) The ldmsd name of the failover partner (please see
+      option **-n** in **ldmsd**\ (8)). If this is specified, the ldmsd
+      will only accept a pairing with another ldmsd with a matching
+      name. Otherwise, the ldmsd will pair with any ldmsd requesting a
+      failover pairing.
+
+   interval=USEC
+      (Optional) The interval (in micro-seconds) for ping and transport
+      re-connecting. The default is 1000000 (1 sec).
+
+   timeout_factor=FLOAT
+      (Optional) The echo timeout factor. The echo timeout is calculated
+      by **timeout_factor \* interval**. The default is 2.
+
+   auto_switch=0|1
+      (Optional) If this is on (1), ldmsd will start **peercfg** or stop
+      **peercfg** automatically. Otherwise, the user needs to issue
+      **failover_peercfg_start** or **failover_peercfg_stop** manually.
+      By default, this value is 1.
+
+**failover_start** is a command to start the (configured) failover
+service. After the failover service has started, it will pair with the
+peer, retrieving peer configurations and starting peer configurations
+when it believes that the peer is not in service (with
+\`auto_switch=1\`, otherwise it does nothing).
+
+Please also note that when the failover service is in use (after
+**failover_start**), prdcr, updtr, and strgp cannot be altered over the
+in-band configuration (start, stop, or reconfigure). The failover
+service must be stopped (**failover_stop**) before altering those
+configuration objects.
+
+**failover_stop** is a command to stop the failover service. When the
+service is stopped, the peer configurations will also be stopped and
+removed from the local memory. The peer also won't be able to pair with
+the local ldmsd when the failover service is stopped. Issuing
+**failover_stop** after the pairing process has succeeded will stop the
+failover service on both daemons in the pair.
+
+**failover_status** is a command to report (via **ldmsd_controller**)
+the failover statuses.
+
+**failover_peercfg_start** is a command to manually start the peer
+configuration. Please note that if **auto_switch** is 1, the ldmsd
+will automatically stop the peer configuration when it receives the echo
+from the peer.
+
+**failover_peercfg_stop** is a command to manually stop the peer
+configuration. Please note that if **auto_switch** is 1, the ldmsd
+will automatically start peercfg when the echo has timed out.
+
+FAILOVER: AUTOMATIC PEERCFG ACTIVATION
+=================================================
+
+The peer configuration is automatically activated when an echo-timeout
+event occurs (with \`auto_switch=1\`). The echo-timeout is calculated
+based on the ping interval, the ping-echo round-trip time,
+\`timeout_factor\`, and the moving standard deviation of the ping-echo
+round-trip time as follows:
+
+rt_time[N] is an array of the last N ping-echo round-trip times.
+
+| base = max( max(rt_time), ping_interval )
+| timeout1 = base + 4 \* SD(rt_time)
+| timeout2 = base \* timeout_factor
+| timeout = max( timeout1, timeout2 )
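+
+For example, assuming a ping interval of 1 second, a largest observed
+round-trip time of 0.2 seconds, SD(rt_time) = 0.05 seconds, and the
+default timeout_factor of 2 (all values illustrative):
+
+| base = max( 0.2, 1.0 ) = 1.0 s
+| timeout1 = 1.0 + 4 \* 0.05 = 1.2 s
+| timeout2 = 1.0 \* 2 = 2.0 s
+| timeout = max( 1.2, 2.0 ) = 2.0 s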
+
+EXAMPLES
+===================
+
+Let's consider the following setup:
+
+::
+
+                   .-------.
+                   |  a20  |
+                   |-------|
+                   | s00/a |
+                   | s00/b |
+                   | s01/a |
+                   | s01/b |
+                   | s02/a |
+                   | s02/b |
+                   | s03/a |
+                   | s03/b |
+                   '-------'
+                       ^
+                       |
+           .-----------'-----------.
+           |                       |
+       .-------.               .-------.
+       |  a10  |               |  a11  |
+       |-------|               |-------|
+       | s00/a |     pair      | s02/a |
+       | s00/b |...............| s02/b |
+       | s01/a |               | s03/a |
+       | s01/b |               | s03/b |
+       '-------'               '-------'
+           ^                       ^
+           |                       |
+      .----'---.               .-'------.
+      |        |               |        |
+   .-------..-------.     .-------..-------.
+   |  s00  ||  s01  |     |  s02  ||  s03  |
+   |-------||-------|     |-------||-------|
+   | s00/a || s01/a |     | s02/a || s03/a |
+   | s00/b || s01/b |     | s02/b || s03/b |
+   '-------''-------'     '-------''-------'
+
+In this setup, we have 4 sampler daemons (*s00* - *s03*), 2 level-1
+aggregators (*a10*, *a11*), and 1 level-2 aggregator (*a20*). Each
+sampler daemon contains set *a* and set *b*, which are prefixed by the
+sampler daemon name. The level-1 aggregators are configured to be a
+failover pair, aggregating sets from the sampler daemons as shown in the
+picture. The level-2 aggregator is configured to aggregate sets from
+the level-1 aggregators.
+
+The following is a list of configuration and CLI options to achieve the
+setup shown above:
+
+::
+
+   # a20.cfg
+   prdcr_add name=prdcr_a10 host=a10.hostname port=12345 xprt=sock \
+             type=active interval=1000000
+   prdcr_start name=prdcr_a10
+   prdcr_add name=prdcr_a11 host=a11.hostname port=12345 xprt=sock \
+             type=active interval=1000000
+   prdcr_start name=prdcr_a11
+   updtr_add name=upd interval=1000000 offset=0
+   updtr_prdcr_add name=upd regex=.*
+   updtr_start name=upd
+
+   # a10.cfg
+   prdcr_add name=prdcr_s00 host=s00.hostname port=12345 xprt=sock \
+             type=active interval=1000000
+   prdcr_start name=prdcr_s00
+   prdcr_add name=prdcr_s01 host=s01.hostname port=12345 xprt=sock \
+             type=active interval=1000000
+   prdcr_start name=prdcr_s01
+   updtr_add name=upd interval=1000000 offset=0
+   updtr_prdcr_add name=upd regex=.*
+   updtr_start name=upd
+   failover_config host=a11.hostname port=12345 xprt=sock \
+                   interval=1000000 peer_name=a11
+   failover_start
+   # a10 CLI
+   $ ldmsd -c a10.cfg -x sock:12345 -n a10
+   # name this daemon "a10"
+
+   # a11.cfg
+   prdcr_add name=prdcr_s02 host=s02.hostname port=12345 xprt=sock \
+             type=active interval=1000000
+   prdcr_start name=prdcr_s02
+   prdcr_add name=prdcr_s03 host=s03.hostname port=12345 xprt=sock \
+             type=active interval=1000000
+   prdcr_start name=prdcr_s03
+   updtr_add name=upd interval=1000000 offset=0
+   updtr_prdcr_add name=upd regex=.*
+   updtr_start name=upd
+   failover_config host=a10.hostname port=12345 xprt=sock \
+                   interval=1000000 peer_name=a10
+   failover_start
+   # a11 CLI
+   $ ldmsd -c a11.cfg -x sock:12345 -n a11
+   # name this daemon "a11"
+
+   # sampler configs are omitted (irrelevant).
+
+With this setup, when *a10* dies, *a11* will start aggregating sets from
+*s00* and *s01*. When this is done, *a20* will still get all of the sets
+through *a11*, as depicted in the following figure.
+
+::
+
+                   .-------.
+                   |  a20  |
+                   |-------|
+                   | s00/a |
+                   | s00/b |
+                   | s01/a |
+                   | s01/b |
+                   | s02/a |
+                   | s02/b |
+                   | s03/a |
+                   | s03/b |
+                   '-------'
+                       ^
+                       |
+                       '-----------.
+                                   |
+       xxxxxxxxx               .-------.
+       x  a10  x               |  a11  |
+       x-------x               |-------|
+       x s00/a x               | s00/a |
+       x s00/b x               | s00/b |
+       x s01/a x               | s01/a |
+       x s01/b x               | s01/b |
+       xxxxxxxxx               | s02/a |
+                               | s02/b |
+                               | s03/a |
+                               | s03/b |
+                               '-------'
+                                   ^
+                                   |
+      .--------.-----------------.-'------.
+      |        |                 |        |
+   .-------..-------.     .-------..-------.
+   |  s00  ||  s01  |     |  s02  ||  s03  |
+   |-------||-------|     |-------||-------|
+   | s00/a || s01/a |     | s02/a || s03/a |
+   | s00/b || s01/b |     | s02/b || s03/b |
+   '-------''-------'     '-------''-------'
+
+When *a10*'s heartbeat is back, *a11* will stop the producers/updaters
+that were working in place of *a10*. The LDMS network is then recovered
+back to the original state in the first figure.
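+
+While failover is active, the pairing and peer-configuration state can
+be inspected from either daemon with **failover_status** (an
+illustrative **ldmsd_controller** session):
+
+::
+
+   > ldmsd_controller --host a11.hostname --port 12345 --xprt sock
+   Welcome to the LDMSD control processor
+   sock:a11.hostname:12345> failover_status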
+
+SEE ALSO
+===================
+
+**ldmsd**\ (8), **ldms_quickstart**\ (7), **ldmsd_controller**\ (8)
diff --git a/rtd/man2rst/ldmsd_setgroup.rst b/rtd/man2rst/ldmsd_setgroup.rst
new file mode 100644
index 000000000..f9bdecbaf
--- /dev/null
+++ b/rtd/man2rst/ldmsd_setgroup.rst
@@ -0,0 +1,236 @@
+==============
+ldmsd_setgroup
+==============
+
+:Date: 5 Jul 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+===============
+
+ldmsd_setgroup - explanation, configuration, and commands for ldmsd set
+group
+
+SYNOPSIS
+===================
+
+setgroup_add
+   name=\ *GROUP_NAME* [producer=\ *PRODUCER*] [interval=\ *USEC*]
+   [offset=\ *USEC*]
+
+setgroup_mod
+   name=\ *GROUP_NAME* [interval=\ *USEC*] [offset=\ *USEC*]
+
+setgroup_del
+   name=\ *GROUP_NAME*
+
+setgroup_ins
+   name=\ *GROUP_NAME* instance=\ *COMMA_SEPARATED_LIST_OF_INSTANCES*
+
+setgroup_rm
+   name=\ *GROUP_NAME* instance=\ *COMMA_SEPARATED_LIST_OF_INSTANCES*
+
+DESCRIPTION
+======================
+
+An **ldmsd setgroup** (referred to as **setgroup** for short) is an
+**ldms_set** with special information for the LDMS daemon (**ldmsd**).
+The setgroup information contains a list of other sets so that the LDMSD
+**updtr** can update all the sets in the collection at once
+(iteratively). This helps administrators with configuration, and helps
+sampler plugin developers manage their collections of sets. For an
+example usage of \`ldmsd_group\_*\` APIs, please see **grptest.c**, and
+\`ldmsd_group\_*()\` declarations (with doxygen doc) in **ldmsd.h**. In
+this manual page, we will focus on the LDMSD commands that manage the
+setgroup from the configuration side. The description for each command
+and its parameters is as follows.
+
+**setgroup_add** adds (creates) a new setgroup. The following list
+describes the command parameters:
+
+   name=GROUP_NAME
+      The name of the setgroup.
+
+   [producer=PRODUCER]
+      (Optional) The producer name of the setgroup. If not set, the name
+      of the LDMSD (the **-n** option) is used.
+
+   [interval=USEC]
+      (Optional) The micro-second update interval hint.
+
+   [offset=USEC]
+      (Optional) The micro-second update offset hint.
+
+**setgroup_mod** modifies (mutable) attributes of the setgroup. The list
+of parameters is as follows:
+
+   name=GROUP_NAME
+      The name of the setgroup.
+
+   [interval=USEC]
+      (Optional) The micro-second update interval hint.
+
+   [offset=USEC]
+      (Optional) The micro-second update offset hint.
+
+**setgroup_ins** inserts a list of set instances into the setgroup.
+
+   name=GROUP_NAME
+      The name of the setgroup.
+
+   instance=COMMA_SEPARATED_LIST_OF_INSTANCES
+      A comma-separated list of set instances.
+
+**setgroup_rm** removes a list of set instances from the setgroup.
+
+   name=GROUP_NAME
+      The name of the setgroup.
+
+   instance=COMMA_SEPARATED_LIST_OF_INSTANCES
+      A comma-separated list of set instances.
+
+**setgroup_del** deletes the setgroup.
+
+   name=GROUP_NAME
+      The name of the setgroup.
+
+EXAMPLE
+==================
+
+In this example, we will have 2 **ldmsd**\ s, namely **sampler** and
+**aggregator**, for the sampler daemon and the aggregator daemon
+respectively. The sampler will have \`meminfo\`, \`set_0\`, \`set_1\`,
+\`set_2\`, \`set_3\` as its regular sets. \`thegroup\` will be the
+setgroup created in the sampler that contains \`meminfo\` and \`set_0\`.
+The aggregator will be set up to update only \`thegroup\`.
+
+::
+
+   ### sampler.conf
+   # It is OK to add the group first; please also note that our group has no
+   # update hint so that the updater in the aggregator can control its update
+   # interval.
+   setgroup_add name=thegroup
+
+   # Insert meminfo and set_0 into the group
+   setgroup_ins name=thegroup instance=meminfo,set_0
+
+   # test_sampler will generate a bunch of sets, with this config it will create
+   # set_0, set_1, set_2, set_3
+   load name=test_sampler
+   config name=test_sampler producer=sampler \
+          action=default \
+          base=set \
+          num_sets=4 \
+          push=0
+   start name=test_sampler interval=1000000 offset=0
+   # meminfo
+   load name=meminfo
+   config name=meminfo producer=sampler \
+          instance=meminfo
+   start name=meminfo interval=1000000 offset=0
+   ### END OF sampler.conf
+
+   ### aggregator.conf
+   # Normal producer setup
+   prdcr_add name=prdcr host=localhost port=10001 xprt=sock \
+             interval=1000000 \
+             type=active
+   prdcr_start name=prdcr
+   # Setup the `grp_updtr` so that it only updates `thegroup`.
+   updtr_add name=grp_updtr interval=1000000 offset=500000
+   updtr_match_add name=grp_updtr regex=thegroup
+   updtr_prdcr_add name=grp_updtr regex=prdcr
+   updtr_start name=grp_updtr
+   ### END OF aggregator.conf
+
+The daemons can be started with the following commands:
+
+::
+
+   # For sampler, foreground start
+   $ ldmsd -F -c sampler.conf -x sock:10001
+   # For aggregator, foreground start
+   $ ldmsd -F -c aggregator.conf -x sock:10000
+
+When listing the sets on the aggregator with the **-v** option, you'll
+see that only \`meminfo\` and \`set_0\` are recent. \`thegroup\` is only
+updated when its information changes. The rest of the sets have only
+been looked up, but not updated.
+
+::
+
+   $ ldms_ls -x sock -p 10000 -v | grep update
+   thegroup: consistent, last update: Thu Jul 05 16:22:08 2018 [303411us]
+   set_3: inconsistent, last update: Wed Dec 31 18:00:00 1969 [0us]
+   set_2: inconsistent, last update: Wed Dec 31 18:00:00 1969 [0us]
+   set_1: inconsistent, last update: Wed Dec 31 18:00:00 1969 [0us]
+   set_0: consistent, last update: Thu Jul 05 16:36:30 2018 [1793us]
+   meminfo: consistent, last update: Thu Jul 05 16:36:31 2018 [1946us]
+
+When listing the sets on the sampler, however, we will see all of them
+being updated (except \`thegroup\`).
+
+::
+
+   thegroup: consistent, last update: Thu Jul 05 16:22:08 2018 [303411us]
+   set_3: consistent, last update: Thu Jul 05 16:39:52 2018 [1915us]
+   set_2: consistent, last update: Thu Jul 05 16:39:52 2018 [1916us]
+   set_1: consistent, last update: Thu Jul 05 16:39:53 2018 [1948us]
+   set_0: consistent, last update: Thu Jul 05 16:39:53 2018 [1948us]
+   meminfo: consistent, last update: Thu Jul 05 16:39:53 2018 [2022us]
+
+**Removing/inserting** instances from/into the group can also be done
+interactively via **ldmsd_controller**. If we do the following on the
+**sampler**:
+
+::
+
+   $ ldmsd_controller --port 10001
+   Welcome to the LDMSD control processor
+   sock:localhost:10001> setgroup_rm name=thegroup instance=set_0
+   sock:localhost:10001> setgroup_ins name=thegroup instance=set_3
+
+\`set_0\` will be removed from \`thegroup\`, and \`set_3\` will be added
+into \`thegroup\`. Listing the sets on the **aggregator** will show that
+\`set_0\` has stopped being updated, and \`set_3\` has become recent.
+
+::
+
+   thegroup: consistent, last update: Thu Jul 05 16:42:12 2018 [378918us]
+   set_3: consistent, last update: Thu Jul 05 16:42:14 2018 [2070us]
+   set_2: inconsistent, last update: Wed Dec 31 18:00:00 1969 [0us]
+   set_1: inconsistent, last update: Wed Dec 31 18:00:00 1969 [0us]
+   set_0: consistent, last update: Thu Jul 05 16:41:25 2018 [1116us]
+   meminfo: consistent, last update: Thu Jul 05 16:42:15 2018 [1223us]
+
+The **members** of the group can be **listed** by the following:
+
+::
+
+   $ ldms_ls -x sock -p 10000 -v thegroup
+   thegroup: consistent, last update: Thu Jul 05 16:42:12 2018 [378918us]
+   APPLICATION SET INFORMATION ------
+     grp_member: set_3 : -
+     grp_member: meminfo : -
+     ldmsd_grp_gn : 8
+   METADATA --------
+     Producer Name : a:10001
+     Instance Name : thegroup
+     Schema Name : ldmsd_grp_schema
+     Size : 184
+     Metric Count : 1
+     GN : 1
+     User : root(0)
+     Group : root(0)
+     Permissions : -rwxrwxrwx
+   DATA ------------
+     Timestamp : Thu Jul 05 16:42:12 2018 [378918us]
+     Duration : [0.000017s]
+     Consistent : TRUE
+     Size : 64
+     GN : 8
+   -----------------
diff --git a/rtd/man2rst/ldmsd_stream_publish.rst b/rtd/man2rst/ldmsd_stream_publish.rst
new file mode 100644
index 000000000..71055ad6a
--- /dev/null
+++ b/rtd/man2rst/ldmsd_stream_publish.rst
@@ -0,0 +1,131 @@
+====================
+ldmsd_stream_publish
+====================
+
+:Date: 21 Aug 2021
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+=====================
+
+ldmsd_stream_publish - man page for the LDMS ldmsd_stream_publish
+executable utility
+
+SYNOPSIS
+=========================
+
+At the command line: ldmsd_stream_publish [args]
+
+DESCRIPTION
+============================
+
+The ldmsd_stream_publish executable publishes to the ldmsd_streams
+interface of a running ldms daemon. It takes a file as input and
+publishes it either in bulk or line by line, reusing the connection for
+all the messages.
+
+COMMAND LINE SYNTAX
+====================================
+
+ldmsd_stream_publish -x -h -p -s -a -A -t -f [-l]
+   |
+
+   -x
+      |
+      | transport of the ldmsd to which to connect.
+
+   -h
+      |
+      | host of the ldmsd to which to connect.
+
+   -p
+      |
+      | port of the ldmsd to which to connect.
+
+   -a
+      |
+      | auth to connect to the ldmsd
+
+   -A
+      |
+      | auth-opts to connect to the ldmsd
+
+   -s
+      |
+      | Name of the stream (this will be used for subscribing)
+
+   -t
+      |
+      | Optional data-format. Either 'string' or 'json'. Default is
+        string.
+
+   -l
+      |
+      | Optional line mode. Publishes file one line at a time as
+        separate publish calls
+
+   -f
+      |
+      | File that is published. If not specified, input is copied from
+        STDIN.
+
+   -r N
+      |
+      | Repeat the publication of the file N times, with a delay
+        interval specified by -i. Repeating is not supported unless the
+        input is a file. If the -l option is given, the file and
+        connection are opened once and the lines are replayed to
+        individual ldmsd_stream_publish calls. If -l is not given, the
+        ldmsd_stream_publish_file call is used, resulting in multiple
+        connection openings.
+
+   -i interval_in_microseconds
+      |
+      | Change the default delay (usleep(interval_in_microseconds)) used
+        if repeat is specified.
+
+BUGS
+=====================
+
+No known bugs.
+
+NOTES
+======================
+
+This executable is in development and may change at any time.
+
+The difference in repeat behavior if -l is present allows for testing
+two scenarios: repeating many messages to a single connection and
+repeating connection attempts to a daemon that may come and go during
+publication attempts.
Environment variables LDMSD_STREAM_CONN_TIMEOUT +and LDMSD_STREAM_ACK_TIMEOUT will affect the timing of the repeat loop +when -l is not given. + +EXAMPLES +========================= + +Within ldmsd_controller or a configuration file: + +:: + + load name=hello_sampler + config name=hello_sampler producer=host1 instance=host1/hello_sampler stream=foo component_id=1 + start name=hello_sampler interval=1000000 offset=0 + +:: + + > cat testdata.10.out + { "seq": 0, "job-id" : 10364, "rank" : 1, "kokkos-perf-data" : [ {"name" : "SPARTAFOO0", "count": 0, "time": 0.0000},{"name" : "SPARTAFOO1", "count": 1, "time": 0.0001},{"name" : "SPARTAFOO2", "count": 2, "time": 0.0002},{"name" : "SPARTAFOO3", "count": 3, "time": 0.0003},{"name" : "SPARTAFOO4", "count": 4, "time": 0.0004},{"name" : "SPARTAFOO5", "count": 5, "time": 0.0005},{"name" : "SPARTAFOO6", "count": 6, "time": 0.0006},{"name" : "SPARTAFOO7", "count": 7, "time": 0.0007},{"name" : "SPARTAFOO8", "count": 8, "time": 0.0008},{"name" : "SPARTAFOO9", "count": 9, "time": 0.0009}] } + +:: + + > ldmsd_stream_publish -x sock -h localhost -p 52001 -s foo -t json -f ./testdata.10.out -a none + + + In the log file of the ldmsd: + > cat log.txt + Sat Aug 21 18:15:27 2021: CRITICAL : stream_type: JSON, msg: "{ "seq": 0, "job-id" : 10364, "rank" : 1, "kokkos-perf-data" : [ {"name" : "SPARTAFOO0", "count": 0, "time": 0.0000},{"name" : "SPARTAFOO1", "count": 1, "time": 0.0001},{"name" : "SPARTAFOO2", "count": 2, "time": 0.0002},{"name" : "SPARTAFOO3", "count": 3, "time": 0.0003},{"name" : "SPARTAFOO4", "count": 4, "time": 0.0004},{"name" : "SPARTAFOO5", "count": 5, "time": 0.0005},{"name" : "SPARTAFOO6", "count": 6, "time": 0.0006},{"name" : "SPARTAFOO7", "count": 7, "time": 0.0007},{"name" : "SPARTAFOO8", "count": 8, "time": 0.0008},{"name" : "SPARTAFOO9", "count": 9, "time": 0.0009},{"name" : "SPARTAFOO10", "count": 10, "time": 0.00010}] }", msg_len: 589, entity: 0x2aaab8004680 + + Note that the hello_streams sampler does not do a sample, instead it subscribes to the stream with a callback and prints out what it got off the stream. + +SEE ALSO +========================= + +ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7), +Plugin_hello_sampler(7), Plugin_stream_csv_store(7) diff --git a/rtd/man2rst/ldmsd_stream_subscribe.rst b/rtd/man2rst/ldmsd_stream_subscribe.rst new file mode 100644 index 000000000..5829c2c99 --- /dev/null +++ b/rtd/man2rst/ldmsd_stream_subscribe.rst @@ -0,0 +1,125 @@ +====================== +ldmsd_stream_subscribe +====================== + +:Date: 21 Aug 2021 + +.. contents:: + :depth: 3 +.. + +NAME +======================= + +ldmsd_stream_subscribe - man page for the LDMS ldmsd_stream_subscribe +utility + +SYNOPSIS +=========================== + +At the command line: ldmsd_stream_subscribe [args] + +DESCRIPTION +============================== + +The ldmsd_stream_subscribe program subscribes to a stream in place of a +full ldmsd daemon, writing received messages to a file or to stdout. + +COMMAND LINE SYNTAX +====================================== + +ldmsd_stream_subscribe -x -h -p -s -a -A -f -D -i -R -q -E + | + + -x,--xprt + | + | transport type on which to listen. + + -p,--port + | + | port on which to listen. + + -h,--host + | + | hostname or IP address of interface on which to listen. + + -a,--auth + | + | authentication to expect from publishers. + + -A,--auth_arg + | + | auth options if needed (for e.g. ovis auth or munge on unusual + port) + + -s,--stream + | + | Name of the stream to subscribe. 
+
+   -f,--file
+      |
+      | File where messages delivered are written. If not specified,
+        STDOUT.
+
+   -E,--events-raw
+      |
+      | Suppress delivery envelope information in message output.
+
+   -q,--quiet
+      |
+      | Suppress message output to file or stdout entirely.
+
+   -D,--daemonize
+      |
+      | Put the process in the background as a daemon.
+
+   -R,--daemon-noroot
+      |
+      | Prevent file system root (/) change-directory when starting the
+        daemon. (Does nothing if -D is not present).
+
+   -i,--daemon-io
+      |
+      | Keep the input and output file descriptors attached to the
+        daemon instead of closing them. (Does nothing if -D is not
+        present).
+
+BUGS
+=======================
+
+No known bugs.
+
+NOTES
+========================
+
+This program is in development and may change at any time.
+
+Using "-a none" is insecure and should be used only with care.
+
+EXAMPLES
+===========================
+
+Running in user mode as a sink to test a stream publishing program
+writing to tag 'mystream':
+
+::
+
+   ldmsd_stream_subscribe -x sock -h 127.0.0.1 -p 20411 -s mystream -a none -f messages.out -D -R
+
+Running in root mode and testing on port 511:
+
+::
+
+   ldmsd_stream_subscribe -x sock -h 127.0.0.1 -p 511 -s mystream -a munge -f /var/log/ldms-stream/messages.out -D
+
+Sending data to the listening subscriber:
+
+::
+
+   echo '{ "a": "worthless message"}' | ./ldmsd_stream_publish -x sock -h 127.0.0.1 -p 20411 -s mystream -a none -t json
+
+SEE ALSO
+===========================
+
+ldmsd(8), ldms_quickstart(7), ldmsd_stream_publish(8),
+ldms_authentication(7)
diff --git a/rtd/man2rst/lsdate.rst b/rtd/man2rst/lsdate.rst
new file mode 100644
index 000000000..1d4f61fa3
--- /dev/null
+++ b/rtd/man2rst/lsdate.rst
@@ -0,0 +1,37 @@
+=========
+LSDATE
+=========
+
+:Date: June 2018
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+=======
+
+lsdate - list directory contents with UTC timestamp suffix translation
+
+SYNOPSIS
+===========
+
+**lsdate** [*OPTION*]... [*FILE*]...
+
+DESCRIPTION
+==============
+
+Execute ls(1) and apply an output filter to reveal the calendar date of
+timestamp suffixed files, such as produced by LDMS CSV stores.
+Timestamps are assumed to be seconds since the epoch. Fractional seconds
+are not supported.
+
+SEE ALSO
+===========
+
+ls(1), Plugin_store_csv(7)
+
+NOTES
+========
+
+The output of lsdate -s and the output of lsdate -l may be surprising.
diff --git a/rtd/man2rst/netlink-notifier.rst b/rtd/man2rst/netlink-notifier.rst
new file mode 100644
index 000000000..11c708367
--- /dev/null
+++ b/rtd/man2rst/netlink-notifier.rst
@@ -0,0 +1,205 @@
+================
+netlink-notifier
+================
+
+:Date: 25 June 2021
+
+.. contents::
+   :depth: 3
+..
+
+NAME
+=================
+
+ldms-netlink-notifier - Transmit Linux kernel netlink process life
+messages to ldmsd streams.
+
+ldms-notify - systemd service
+
+SYNOPSIS
+=====================
+
+ldms-netlink-notifier [OPTION...]
+
+DESCRIPTION
+========================
+
+The netlink-notifier generates JSON messages for ldmsd and JSON aware
+LDMS samplers. Its messages are mostly compatible with those from the
+slurm spank based notifier.
+
+OPTIONS
+====================
+
+::
+
+   -c use task comm field for process name.
+   -d strip off directory path from process name.
+   -D specify run duration in seconds. If unspecified, run forever.
+   -e select which events to monitor.
+   -E equivalent to -e all.
+   -g show glyphs for event types in debug mode.
+   -h show this help.
+   -i seconds time (float) to sleep between checks for processes exceeding the short dir filter time.
+ If the -i value > the -m value, -i may effectively filter out additional processes. + -j file file to log json messages and transmission status. + -l force stdout line buffering. + -L file log to file instead of stdout. + -r run with real time FIFO scheduler (available on some kernels). + -s show short process name in debugging. + -S suppress stream message publication. + -t show debugging trace messages. + -u umin ignore processes with uid < umin + -v lvl log level for stream library messages. Higher is quieter. Error messages are >= 3. + -q run quietly + -x show extra process information. + -X equivalent to -Egrx. + The ldmsd connection and commonly uninteresting or short-lived processes may be specified with the options or environment variables below. + The 'short' options do not override the exclude entirely options. + --exclude-programs[=] change the default value of exclude-programs + When repeated, all values are concatenated. + If given with no value, the default (nullexe): is removed. + If not given, the default is used unless + the environment variable NOTIFIER_EXCLUDE_PROGRAMS is set. + --exclude-dir-path[=] change the default value of exclude-dir-path + When repeated, all values are concatenated. + If given with no value, the default /sbin is removed. + If not given, the default is used unless + the environment variable NOTIFIER_EXCLUDE_DIR_PATH is set. + --exclude-short-path[=] change the default value of exclude-short-path + When repeated, all values are concatenated. + If given with no value, the default /bin:/usr is removed. + If not given, the default is used unless + the environment variable NOTIFIER_EXCLUDE_SHORT_PATH is set. + --exclude-short-time[=][val] change the default value of exclude-short-time. + If repeated, the last value given wins. + If given with no value, the default 1 becomes 0 unless + the environment variable NOTIFIER_EXCLUDE_SHORT_TIME is set. + --stream[=] change the default value of stream. + If repeated, the last value given wins. + The default slurm is used if env NOTIFIER_LDMS_STREAM is not set. + --xprt[=] change the default value of xprt. + If repeated, the last value given wins. + The default sock is used if env NOTIFIER_LDMS_XPRT is not set. + --host[=] change the default value of host. + If repeated, the last value given wins. + The default localhost is used if env NOTIFIER_LDMS_HOST is not set. + --port[=] change the default value of port. + If repeated, the last value given wins. + The default 411 is used if env NOTIFIER_LDMS_PORT is not set. + --auth[=] change the default value of auth. + If repeated, the last value given wins. + The default munge is used if env NOTIFIER_LDMS_AUTH is not set. + --reconnect[=] change the default value of reconnect. + If repeated, the last value given wins. + The default 600 is used if env NOTIFIER_LDMS_RECONNECT is not set. + --timeout[=] change the default value of timeout. + If repeated, the last value given wins. + The default 1 is used if env NOTIFIER_LDMS_TIMEOUT is not set. + --track-dir[=] change the pids published directory. + The default is used if env NOTIFIER_TRACK_DIR is not set. + The path given should be on a RAM-based file system for efficiency, + and it should not contain any files except those created by + this daemon. When enabled, track-dir will be populated even if + -S is used to suppress the stream output. + --component_id= set the value of component_id. + If not set, the component_id field is not included in the stream formats produced. 
+
+   --ProducerName= set the value of ProducerName
+       If not set, the ProducerName field is not included in the stream formats produced.
+
+ENVIRONMENT
+========================
+
+The following variables override defaults if a command line option is
+not present, as described in the options section.
+
+::
+
+   NOTIFIER_EXCLUDE_PROGRAMS="(nullexe):"
+   NOTIFIER_EXCLUDE_DIR_PATH=/sbin
+   NOTIFIER_EXCLUDE_SHORT_PATH=/bin:/usr
+   NOTIFIER_EXCLUDE_SHORT_TIME=1
+   NOTIFIER_TRACK_DIR=/var/run/ldms-netlink-tracked
+   NOTIFIER_LDMS_RECONNECT=600
+   NOTIFIER_LDMS_TIMEOUT=1
+   NOTIFIER_LDMS_STREAM=slurm
+   NOTIFIER_LDMS_XPRT=sock
+   NOTIFIER_LDMS_HOST=localhost
+   NOTIFIER_LDMS_PORT=411
+   NOTIFIER_LDMS_AUTH=munge
+
+Omitting (nullexe): from NOTIFIER_EXCLUDE_PROGRAMS may cause
+incomplete output related to processes no longer present. In exotic
+circumstances, this may be desirable anyway.
+
+FILES
+==================
+
+Users or other processes may discover which processes are the subject of
+notifications by examining the files in
+
+/NOTIFIER_TRACK_DIR/\*
+
+For each pid started event which would be emitted to an LDMS stream, a
+temporary file with the name of the pid is created in
+NOTIFIER_TRACK_DIR. The file will contain the json event attempted. The
+temporary file will be removed when the corresponding pid stopped event
+is sent. These files are not removed when the notifier daemon exits.
+Client applications may validate a file by checking the contents against
+the /proc/$pid/stat content, if it exists. Invalid files should be
+removed by clients or system scripts.
+
+NOTES
+==================
+
+The core of this utility is derived from forkstat(8).
+
+The output of this utility, if used to drive a sampler, usually needs to
+be consumed on the same node.
+
+If not used with a sampler, the --component_id or --ProducerName options
+are needed to add a node identifier to the messages. Normally a
+process-following sampler that creates sets will add the node identifier
+automatically.
+
+Options are still in development. Several options affect only the trace
+output.
+
+EXAMPLES
+=====================
+
+Run for 30 seconds with screen and json.log test output, connecting to
+the ldmsd from the 'ldms-static-test.sh blobwriter' test:
+
+::
+
+   netlink-notifier -t -D 30 -g -u 1 -x -e exec,clone,exit \
+   -j json.log --exclude-dir-path=/bin:/sbin:/usr \
+   --port=61061 --auth=none --reconnect=1
+
+Run in a typical deployment (sock, munge, port 411, localhost, forever,
+10 minute reconnect):
+
+::
+
+   netlink-notifier
+
+Run in a systemd .service wrapper, excluding root owned processes:
+
+::
+
+   EnvironmentFile=-/etc/sysconfig/ldms-netlink-notifier.conf
+   ExecStart=/usr/sbin/ldms-netlink-notifier -u 1 -x -e exec,clone,exit
+
+Run in a systemd .service wrapper, excluding root owned processes, with
+debugging files:
+
+::
+
+   EnvironmentFile=-/etc/sysconfig/ldms-netlink-notifier.conf
+   ExecStart=/usr/sbin/ldms-netlink-notifier -u 1 -x -e exec,clone,exit -j /home/user/nl.json -L /home/user/nl.log -t --ProducerName=%H
+
+SEE ALSO
+=====================
+
+forkstat(8), ldmsd(8), ldms-static-test(8)
diff --git a/rtd/pandoc_man_2_rst.py b/rtd/pandoc_man_2_rst.py
new file mode 100644
index 000000000..99f84df3c
--- /dev/null
+++ b/rtd/pandoc_man_2_rst.py
@@ -0,0 +1,22 @@
+#!/usr/bin/python3
+# Requires pandoc v3 for the man input format.
+# Point OVIS_ROOT to the OVIS build directory.
+
+import os
+import glob
+
+OVIS_ROOT = '/opt/ovis/build/ovis'
+# Man-page source directories (relative to OVIS_ROOT/ldms) and the
+# docs/source subdirectory each one is copied into; the two lists
+# correspond index by index.
+source = ['man', 'src/contrib/sampler/*', 'src/contrib/store/*', 'src/ldmsd/test', 'src/sampler/*', 'src/store/*']
+dest = ['ldms_man', 'sampler_man', 'store_man', 'ldms_man', 'sampler_man', 'store_man']
+for c, s in enumerate(source):
+    files = glob.glob(f'{OVIS_ROOT}/ldms/{s}/*man')
+    for i in files:
+        fname = i.split('/')[-1].replace('.man', '.rst')
+        os.system('mkdir -p man2rst/')
+        # Convert the man page to RST with a table of contents.
+        os.system(f'/usr/local/bin/pandoc -f man -s -t rst --toc {i} -o man2rst/{fname}')
+        plugin = fname.replace('.rst', '')
+        plugin_title = '=' * len(plugin)
+        # Replace the first "man" occurrence (the generic title pandoc
+        # produces) with the plugin name, then widen the "===" title
+        # rule to match the new title length.
+        os.system('sed -i -e "0,/man/{s/man/' + plugin + '/}" man2rst/' + fname)
+        os.system('sed -i -e "s/===/' + plugin_title + '/" man2rst/' + fname)
+        os.system(f'cp man2rst/{fname} docs/source/{dest[c]}')
diff --git a/rtd/paper.lock b/rtd/paper.lock
new file mode 100644
index 000000000..eb8b6d1e1
--- /dev/null
+++ b/rtd/paper.lock
@@ -0,0 +1 @@
+Name | Date | File(s)
diff --git a/rtd/pyproject.toml b/rtd/pyproject.toml
new file mode 100644
index 000000000..6abaa9ca0
--- /dev/null
+++ b/rtd/pyproject.toml
@@ -0,0 +1,8 @@
+[build-system]
+requires = ["flit_core >=3.2,<4"]
+build-backend = "flit_core.buildapi"
+
+[project]
+name = "LDMS"
+authors = [{name = "ldms", email = "ldms@sandia.gov"}]
+dynamic = ["version", "description"]