Skip to content

Commit

Permalink
DAOS-16837 container: Add client-side DFS metrics
Browse files Browse the repository at this point in the history
If metrics are enabled for a POSIX container, create
a new container/$UUID/dfs metrics root in the client
telemetry to provide DFS-oriented metrics (POSIX ops,
file I/Os, etc.).

Required-githooks: true

Signed-off-by: Michael MacDonald <[email protected]>
  • Loading branch information
mjmac committed Nov 27, 2024
1 parent ab60993 commit 3691fb0
Show file tree
Hide file tree
Showing 13 changed files with 320 additions and 10 deletions.
2 changes: 1 addition & 1 deletion src/client/dfs/SConscript
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def scons():
libraries = ['daos_common', 'daos', 'uuid', 'gurt']

dfs_src = ['common.c', 'cont.c', 'dir.c', 'file.c', 'io.c', 'lookup.c', 'mnt.c', 'obj.c',
'pipeline.c', 'readdir.c', 'rename.c', 'xattr.c', 'dfs_sys.c']
'pipeline.c', 'readdir.c', 'rename.c', 'xattr.c', 'dfs_sys.c', 'metrics.c']
dfs = denv.d_library('dfs', dfs_src, LIBS=libraries)
denv.Install('$PREFIX/lib64/', dfs)

Expand Down
2 changes: 2 additions & 0 deletions src/client/dfs/common.c
Original file line number Diff line number Diff line change
Expand Up @@ -625,6 +625,8 @@ entry_stat(dfs_t *dfs, daos_handle_t th, daos_handle_t oh, const char *name, siz
stbuf->st_atim.tv_sec = stbuf->st_mtim.tv_sec;
stbuf->st_atim.tv_nsec = stbuf->st_mtim.tv_nsec;
}

DFS_OP_STAT_INCR(dfs, DOS_STAT);
return 0;
}

Expand Down
4 changes: 4 additions & 0 deletions src/client/dfs/dfs_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
#include <daos.h>
#include <daos_fs.h>

#include "metrics.h"

/** D-key name of SB metadata */
#define SB_DKEY "DFS_SB_METADATA"

Expand Down Expand Up @@ -190,6 +192,8 @@ struct dfs {
struct dfs_mnt_hdls *cont_hdl;
/** the root dir stat buf */
struct stat root_stbuf;
/** DFS top-level metrics */
struct dfs_metrics *metrics;
};

struct dfs_entry {
Expand Down
2 changes: 2 additions & 0 deletions src/client/dfs/dir.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ dfs_mkdir(dfs_t *dfs, dfs_obj_t *parent, const char *name, mode_t mode, daos_ocl
if (rc != 0)
return daos_der2errno(rc);

DFS_OP_STAT_INCR(dfs, DOS_MKDIR);
return rc;
}

Expand Down Expand Up @@ -220,6 +221,7 @@ dfs_remove(dfs_t *dfs, dfs_obj_t *parent, const char *name, bool force, daos_obj
if (oid)
oid_cp(oid, entry.oid);

DFS_OP_STAT_INCR(dfs, DOS_REMOVE);
out:
rc = check_tx(th, rc);
if (rc == ERESTART)
Expand Down
34 changes: 32 additions & 2 deletions src/client/dfs/io.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,18 @@ read_cb(tse_task_t *task, void *data)
return rc;
}

/**
 * Account the bytes moved by a file I/O in the mount's read/write gauges.
 *
 * Nothing is recorded when \a dfs is NULL or the mount has no metrics attached.
 * A zero byte count leaves the corresponding gauge untouched.
 *
 * \param[in]	dfs		DFS mount whose metrics are updated.
 * \param[in]	read_bytes	Bytes read by the operation (0 if it was not a read).
 * \param[in]	write_bytes	Bytes written by the operation (0 if it was not a write).
 */
static void
dfs_update_file_metrics(dfs_t *dfs, daos_size_t read_bytes, daos_size_t write_bytes)
{
	struct dfs_metrics *stats;

	/* Resolve the metrics handle once; a missing mount or handle both mean "no metrics". */
	stats = (dfs != NULL) ? dfs->metrics : NULL;
	if (stats == NULL)
		return;

	if (read_bytes > 0) {
		d_tm_inc_gauge(stats->dm_read_bytes, read_bytes);
	}

	if (write_bytes > 0) {
		d_tm_inc_gauge(stats->dm_write_bytes, write_bytes);
	}
}

static int
dfs_read_int(dfs_t *dfs, dfs_obj_t *obj, daos_off_t off, dfs_iod_t *iod, d_sg_list_t *sgl,
daos_size_t buf_size, daos_size_t *read_size, daos_event_t *ev)
Expand Down Expand Up @@ -85,11 +97,14 @@ dfs_read_int(dfs_t *dfs, dfs_obj_t *obj, daos_off_t off, dfs_iod_t *iod, d_sg_li
if (rc)
D_GOTO(err_params, rc);

DFS_OP_STAT_INCR(dfs, DOS_READ);
/*
* dc_task_schedule() calls tse_task_complete() even on error (which also calls the
* completion cb that frees params in this case), so we can just ignore the rc here.
*/
dc_task_schedule(task, true);

dfs_update_file_metrics(dfs, *params->read_size, 0);
return 0;

err_params:
Expand Down Expand Up @@ -125,6 +140,7 @@ dfs_read(dfs_t *dfs, dfs_obj_t *obj, d_sg_list_t *sgl, daos_off_t off, daos_size
daos_event_launch(ev);
daos_event_complete(ev, 0);
}
DFS_OP_STAT_INCR(dfs, DOS_READ);
return 0;
}

Expand All @@ -146,7 +162,9 @@ dfs_read(dfs_t *dfs, dfs_obj_t *obj, d_sg_list_t *sgl, daos_off_t off, daos_size
return daos_der2errno(rc);
}

DFS_OP_STAT_INCR(dfs, DOS_READ);
*read_size = iod.arr_nr_read;
dfs_update_file_metrics(dfs, iod.arr_nr_read, 0);
return 0;
}

Expand All @@ -173,6 +191,7 @@ dfs_readx(dfs_t *dfs, dfs_obj_t *obj, dfs_iod_t *iod, d_sg_list_t *sgl, daos_siz
daos_event_launch(ev);
daos_event_complete(ev, 0);
}
DFS_OP_STAT_INCR(dfs, DOS_READX);
return 0;
}

Expand All @@ -189,7 +208,9 @@ dfs_readx(dfs_t *dfs, dfs_obj_t *obj, dfs_iod_t *iod, d_sg_list_t *sgl, daos_siz
return daos_der2errno(rc);
}

DFS_OP_STAT_INCR(dfs, DOS_READX);
*read_size = arr_iod.arr_nr_read;
dfs_update_file_metrics(dfs, arr_iod.arr_nr_read, 0);
return 0;
}

Expand Down Expand Up @@ -223,6 +244,7 @@ dfs_write(dfs_t *dfs, dfs_obj_t *obj, d_sg_list_t *sgl, daos_off_t off, daos_eve
daos_event_launch(ev);
daos_event_complete(ev, 0);
}
DFS_OP_STAT_INCR(dfs, DOS_WRITE);
return 0;
}

Expand All @@ -238,8 +260,12 @@ dfs_write(dfs_t *dfs, dfs_obj_t *obj, d_sg_list_t *sgl, daos_off_t off, daos_eve
daos_event_errno_rc(ev);

rc = daos_array_write(obj->oh, DAOS_TX_NONE, &iod, sgl, ev);
if (rc)
if (rc == 0) {
DFS_OP_STAT_INCR(dfs, DOS_WRITE);
dfs_update_file_metrics(dfs, 0, buf_size);
} else {
D_ERROR("daos_array_write() failed, " DF_RC "\n", DP_RC(rc));
}

return daos_der2errno(rc);
}
Expand All @@ -266,6 +292,7 @@ dfs_writex(dfs_t *dfs, dfs_obj_t *obj, dfs_iod_t *iod, d_sg_list_t *sgl, daos_ev
daos_event_launch(ev);
daos_event_complete(ev, 0);
}
DFS_OP_STAT_INCR(dfs, DOS_WRITEX);
return 0;
}

Expand All @@ -277,8 +304,11 @@ dfs_writex(dfs_t *dfs, dfs_obj_t *obj, dfs_iod_t *iod, d_sg_list_t *sgl, daos_ev
daos_event_errno_rc(ev);

rc = daos_array_write(obj->oh, DAOS_TX_NONE, &arr_iod, sgl, ev);
if (rc)
if (rc == 0) {
DFS_OP_STAT_INCR(dfs, DOS_WRITEX);
} else {
D_ERROR("daos_array_write() failed (%d)\n", rc);
}

return daos_der2errno(rc);
}
158 changes: 158 additions & 0 deletions src/client/dfs/metrics.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
/**
* (C) Copyright 2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
#define D_LOGFAC DD_FAC(dfs)

#include <uuid/uuid.h>
#include <fcntl.h>

#include <daos.h>
#include <daos_fs.h>
#include <daos_fs_sys.h>
#include <daos/common.h>
#include <daos/container.h>
#include <daos/metrics.h>
#include <daos/job.h>
#include <gurt/telemetry_common.h>
#include <gurt/telemetry_producer.h>
#include <gurt/telemetry_consumer.h>

#include "metrics.h"
#include "dfs_internal.h"

/** Name of the per-container telemetry directory that holds the DFS metrics */
#define DFS_METRICS_ROOT "dfs"

/** Space for the op counters: one metric per slot up to DOS_LIMIT (presumably the op count) */
#define STAT_METRICS_SIZE (D_TM_METRIC_SIZE * DOS_LIMIT)
/** Space for the file I/O metrics: NR_SIZE_BUCKETS histogram buckets and one gauge, times two */
#define FILE_METRICS_SIZE (((D_TM_METRIC_SIZE * NR_SIZE_BUCKETS) * 2) + D_TM_METRIC_SIZE * 2)
/** Total size requested when the DFS metrics directory is created */
#define DFS_METRICS_SIZE (STAT_METRICS_SIZE + FILE_METRICS_SIZE)

/**
 * Format the telemetry path "container/<uuid>/<path>" into \a buf.
 *
 * \a buf must be a char array rather than a pointer, because its capacity is taken
 * with sizeof(buf). The expansion is a single snprintf() expression with no trailing
 * semicolon; the call site supplies the terminator, which keeps the macro usable in
 * unbraced if/else bodies and avoids stray empty statements.
 */
#define SPRINTF_CONT_PATH(buf, cont_uuid, path)                                                    \
	snprintf(buf, sizeof(buf), "container/" DF_UUIDF "/%s", DP_UUID(cont_uuid), path)

/*
 * Register one call counter for the DFS operation "name" at the telemetry path
 * "container/<uuid>/dfs/ops/<name>".
 *
 * This expands to several statements (it is not wrapped in do/while) and uses names
 * from the scope it is expanded in: tmp_path, cont_uuid, metrics, rc and i. The new
 * counter is stored in metrics->dm_op_stats[i] and i is then advanced, so the slot
 * order follows the expansion order. If the counter cannot be created, the error is
 * logged and the enclosing function returns immediately. The variadic arguments are
 * accepted but not used.
 */
#define ADD_STAT_METRIC(name, ...) \
SPRINTF_CONT_PATH(tmp_path, cont_uuid, DFS_METRICS_ROOT "/ops/" #name); \
rc = d_tm_add_metric(&metrics->dm_op_stats[i], D_TM_COUNTER, "Count of " #name " calls", \
"calls", tmp_path); \
if (rc != 0) { \
DL_ERROR(rc, "failed to create " #name " counter"); \
return; \
} \
i++;

/**
 * Create the per-operation call counters for a container's DFS metrics.
 *
 * Does nothing when \a metrics is NULL. Creation stops at the first counter that
 * fails to register, because ADD_STAT_METRIC returns from this function on error.
 *
 * \param[in,out]	metrics		Metrics structure whose dm_op_stats slots are filled.
 * \param[in]		cont_uuid	Container UUID used to build each counter's path.
 */
static void
op_stats_init(struct dfs_metrics *metrics, uuid_t cont_uuid)
{
/* These locals are referenced by name inside ADD_STAT_METRIC; do not rename them. */
char tmp_path[D_TM_MAX_NAME_LEN] = {0};
int i = 0;
int rc;

if (metrics == NULL)
return;

/*
 * D_FOREACH_DFS_OP_STAT is defined outside this file; presumably it applies
 * ADD_STAT_METRIC once per DFS operation -- confirm against metrics.h.
 */
D_FOREACH_DFS_OP_STAT(ADD_STAT_METRIC);
}

/**
 * Register the container-level metrics, currently a single "mount_time" timestamp
 * at "container/<uuid>/mount_time".
 *
 * A NULL \a metrics is ignored. A registration failure is logged and otherwise
 * not reported to the caller.
 *
 * \param[in,out]	metrics		Metrics structure receiving dm_mount_time.
 * \param[in]		cont_uuid	Container UUID used to build the metric path.
 */
static void
cont_stats_init(struct dfs_metrics *metrics, uuid_t cont_uuid)
{
	char metric_path[D_TM_MAX_NAME_LEN] = {0};
	int  ret;

	if (metrics != NULL) {
		SPRINTF_CONT_PATH(metric_path, cont_uuid, "mount_time");

		ret = d_tm_add_metric(&metrics->dm_mount_time, D_TM_TIMESTAMP,
				      "container mount time", NULL, metric_path);
		if (ret != 0) {
			DL_ERROR(ret, "failed to create mount_time timestamp");
		}
	}
}

/**
 * Register the file I/O metrics for a container: a "read_bytes" and a "write_bytes"
 * stats gauge under "container/<uuid>/dfs/", each with a size histogram of
 * NR_SIZE_BUCKETS buckets (initial width 256, multiplier 2).
 *
 * A NULL \a metrics is ignored. Failures are logged and otherwise not reported.
 * The histogram is only initialized for a gauge that was created successfully,
 * so a failed gauge is never handed to d_tm_init_histogram(). The read and write
 * metrics are set up independently; a failure of one does not skip the other.
 *
 * \param[in,out]	metrics		Metrics structure receiving dm_read_bytes and
 *					dm_write_bytes.
 * \param[in]		cont_uuid	Container UUID used to build the metric paths.
 */
static void
file_stats_init(struct dfs_metrics *metrics, uuid_t cont_uuid)
{
	char tmp_path[D_TM_MAX_NAME_LEN] = {0};
	int  rc;

	if (metrics == NULL)
		return;

	SPRINTF_CONT_PATH(tmp_path, cont_uuid, DFS_METRICS_ROOT "/read_bytes");
	rc = d_tm_add_metric(&metrics->dm_read_bytes, D_TM_STATS_GAUGE, "dfs read bytes", "bytes",
			     tmp_path);
	if (rc != 0) {
		DL_ERROR(rc, "failed to create dfs read_bytes counter");
	} else {
		/* Only attach a histogram to a gauge that actually exists. */
		rc = d_tm_init_histogram(metrics->dm_read_bytes, tmp_path, NR_SIZE_BUCKETS, 256, 2,
					 "bytes");
		if (rc != 0)
			DL_ERROR(rc, "Failed to init dfs read size histogram");
	}

	SPRINTF_CONT_PATH(tmp_path, cont_uuid, DFS_METRICS_ROOT "/write_bytes");
	rc = d_tm_add_metric(&metrics->dm_write_bytes, D_TM_STATS_GAUGE, "dfs write bytes", "bytes",
			     tmp_path);
	if (rc != 0) {
		DL_ERROR(rc, "failed to create dfs write_bytes counter");
	} else {
		/* Only attach a histogram to a gauge that actually exists. */
		rc = d_tm_init_histogram(metrics->dm_write_bytes, tmp_path, NR_SIZE_BUCKETS, 256, 2,
					 "bytes");
		if (rc != 0)
			DL_ERROR(rc, "Failed to init dfs write size histogram");
	}
}

/**
 * Set up the client-side DFS metrics for a mount.
 *
 * Steps, in order:
 *  - look up the container UUID from the mount's container handle;
 *  - open or create the per-process telemetry root, named after the PID
 *    (-DER_ALREADY from this step is tolerated);
 *  - allocate dfs->metrics;
 *  - add the ephemeral "container/<uuid>/dfs" directory;
 *  - register the container, per-op and file I/O metrics and record the mount time.
 *
 * A NULL \a dfs is ignored. If any of the first four steps fails, the error is
 * logged and dfs->metrics is released with D_FREE. Failures while registering
 * individual metrics are only logged by the helpers.
 *
 * \param[in,out]	dfs	DFS mount to attach metrics to.
 */
void
dfs_metrics_init(dfs_t *dfs)
{
	uuid_t cont_uuid;
	char   root_name[D_TM_MAX_NAME_LEN];
	pid_t  pid;
	size_t root_size = DFS_METRICS_SIZE + (D_TM_METRIC_SIZE * 3);
	int    rc;

	if (dfs == NULL)
		return;

	rc = dc_cont_hdl2uuid(dfs->coh, NULL, &cont_uuid);
	if (rc != 0) {
		DL_ERROR(rc, "failed to get container UUID");
		goto error;
	}

	pid = getpid();
	snprintf(root_name, sizeof(root_name), "%d", pid);
	/* if only container-level metrics are enabled; this will init a root for them */
	rc = d_tm_init_with_name(d_tm_cli_pid_key(pid), root_size, D_TM_OPEN_OR_CREATE, root_name);
	if (rc != 0 && rc != -DER_ALREADY) {
		DL_ERROR(rc, "failed to init DFS metrics");
		goto error;
	}

	D_ALLOC_PTR(dfs->metrics);
	if (dfs->metrics == NULL) {
		D_ERROR("failed to alloc DFS metrics\n");
		goto error;
	}

	/* root_name is reused here to hold the DFS metrics directory path. */
	SPRINTF_CONT_PATH(root_name, cont_uuid, DFS_METRICS_ROOT);
	rc = d_tm_add_ephemeral_dir(NULL, DFS_METRICS_SIZE, root_name);
	if (rc != 0) {
		DL_ERROR(rc, "failed to add DFS metrics dir");
		goto error;
	}

	cont_stats_init(dfs->metrics, cont_uuid);
	op_stats_init(dfs->metrics, cont_uuid);
	file_stats_init(dfs->metrics, cont_uuid);

	d_tm_record_timestamp(dfs->metrics->dm_mount_time);
	return;

error:
	/* No NULL guard: dfs_metrics_fini() already calls D_FREE on this field unconditionally. */
	D_FREE(dfs->metrics);
}

/**
 * Release the metrics structure attached to a DFS mount by dfs_metrics_init().
 *
 * A NULL \a dfs is ignored, matching dfs_metrics_init().
 *
 * \param[in,out]	dfs	DFS mount whose metrics are freed.
 */
void
dfs_metrics_fini(dfs_t *dfs)
{
	if (dfs == NULL)
		return;

	D_FREE(dfs->metrics);
}
Loading

0 comments on commit 3691fb0

Please sign in to comment.