From b0f80eec238822223387ec1364732ac4ff2b14c7 Mon Sep 17 00:00:00 2001
From: hamistao
Date: Mon, 21 Oct 2024 11:34:11 -0300
Subject: [PATCH 1/7] lxd/api_metrics: Filter Operation query by node

Signed-off-by: hamistao
---
 lxd/api_metrics.go | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lxd/api_metrics.go b/lxd/api_metrics.go
index 863220a487d6..3fe4087efa13 100644
--- a/lxd/api_metrics.go
+++ b/lxd/api_metrics.go
@@ -389,7 +389,9 @@ func internalMetrics(ctx context.Context, daemonStartTime time.Time, tx *db.Clus
 		out.AddSamples(metrics.WarningsTotal, metrics.Sample{Value: float64(len(warnings))})
 	}
 
-	operations, err := dbCluster.GetOperations(ctx, tx.Tx())
+	// Create local variable to get a pointer.
+	nodeID := tx.GetNodeID()
+	operations, err := dbCluster.GetOperations(ctx, tx.Tx(), dbCluster.OperationFilter{NodeID: &nodeID})
 	if err != nil {
 		logger.Warn("Failed to get operations", logger.Ctx{"err": err})
 	} else {

From 077c887e69e1b6680f91bf54ee7568642fa70bc3 Mon Sep 17 00:00:00 2001
From: hamistao
Date: Tue, 22 Oct 2024 02:18:38 -0300
Subject: [PATCH 2/7] lxd/db/cluster/warnings: Allow filtering by Node and
 Status

Signed-off-by: hamistao
(cherry picked from commit 24f150cf451eef796d879f1a191004ef0504b301)
Signed-off-by: hamistao
---
 lxd/db/cluster/warnings.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lxd/db/cluster/warnings.go b/lxd/db/cluster/warnings.go
index 79adbda3d9e0..47123ed3fbd5 100644
--- a/lxd/db/cluster/warnings.go
+++ b/lxd/db/cluster/warnings.go
@@ -18,6 +18,7 @@ import (
 //go:generate mapper stmt -e warning objects-by-UUID
 //go:generate mapper stmt -e warning objects-by-Project
 //go:generate mapper stmt -e warning objects-by-Status
+//go:generate mapper stmt -e warning objects-by-Node-and-Status
 //go:generate mapper stmt -e warning objects-by-Node-and-TypeCode
 //go:generate mapper stmt -e warning objects-by-Node-and-TypeCode-and-Project
 //go:generate mapper stmt -e warning objects-by-Node-and-TypeCode-and-Project-and-EntityType-and-EntityID

From ff136e7e82ac665f450b4ac8540e53f61744f350 Mon Sep 17 00:00:00 2001
From: hamistao
Date: Tue, 22 Oct 2024 02:19:13 -0300
Subject: [PATCH 3/7] lxd/db/cluster: Run `make update-schema`

Signed-off-by: hamistao
(cherry picked from commit decddf5adfbd0c34c1f189c4eff16c19b3527abd)
Signed-off-by: hamistao
---
 lxd/db/cluster/warnings.mapper.go | 33 +++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/lxd/db/cluster/warnings.mapper.go b/lxd/db/cluster/warnings.mapper.go
index 93d7b4634cd2..beac24e5f334 100644
--- a/lxd/db/cluster/warnings.mapper.go
+++ b/lxd/db/cluster/warnings.mapper.go
@@ -53,6 +53,15 @@ SELECT warnings.id, coalesce(nodes.name, '') AS node, coalesce(projects.name, ''
   ORDER BY warnings.uuid
 `)
 
+var warningObjectsByNodeAndStatus = RegisterStmt(`
+SELECT warnings.id, coalesce(nodes.name, '') AS node, coalesce(projects.name, '') AS project, coalesce(warnings.entity_type_code, -1), coalesce(warnings.entity_id, -1), warnings.uuid, warnings.type_code, warnings.status, warnings.first_seen_date, warnings.last_seen_date, warnings.updated_date, warnings.last_message, warnings.count
+  FROM warnings
+  LEFT JOIN nodes ON warnings.node_id = nodes.id
+  LEFT JOIN projects ON warnings.project_id = projects.id
+  WHERE ( coalesce(node, '') = ? AND warnings.status = ? )
+  ORDER BY warnings.uuid
+`)
+
 var warningObjectsByNodeAndTypeCode = RegisterStmt(`
 SELECT warnings.id, coalesce(nodes.name, '') AS node, coalesce(projects.name, '') AS project, coalesce(warnings.entity_type_code, -1), coalesce(warnings.entity_id, -1), warnings.uuid, warnings.type_code, warnings.status, warnings.first_seen_date, warnings.last_seen_date, warnings.updated_date, warnings.last_message, warnings.count
   FROM warnings
@@ -238,6 +247,30 @@ func GetWarnings(ctx context.Context, tx *sql.Tx, filters ...WarningFilter) ([]W
 				continue
 			}
 
+			_, where, _ := strings.Cut(parts[0], "WHERE")
+			queryParts[0] += "OR" + where
+		} else if filter.Node != nil && filter.Status != nil && filter.ID == nil && filter.UUID == nil && filter.Project == nil && filter.TypeCode == nil && filter.EntityType == nil && filter.EntityID == nil {
+			args = append(args, []any{filter.Node, filter.Status}...)
+			if len(filters) == 1 {
+				sqlStmt, err = Stmt(tx, warningObjectsByNodeAndStatus)
+				if err != nil {
+					return nil, fmt.Errorf("Failed to get \"warningObjectsByNodeAndStatus\" prepared statement: %w", err)
+				}
+
+				break
+			}
+
+			query, err := StmtString(warningObjectsByNodeAndStatus)
+			if err != nil {
+				return nil, fmt.Errorf("Failed to get \"warningObjects\" prepared statement: %w", err)
+			}
+
+			parts := strings.SplitN(query, "ORDER BY", 2)
+			if i == 0 {
+				copy(queryParts[:], parts)
+				continue
+			}
+
 			_, where, _ := strings.Cut(parts[0], "WHERE")
 			queryParts[0] += "OR" + where
 		} else if filter.UUID != nil && filter.ID == nil && filter.Project == nil && filter.Node == nil && filter.TypeCode == nil && filter.EntityType == nil && filter.EntityID == nil && filter.Status == nil {

From a22d214a00a891643cc2baf1730d4347f6d032e5 Mon Sep 17 00:00:00 2001
From: hamistao
Date: Wed, 13 Nov 2024 10:02:19 -0300
Subject: [PATCH 4/7] lxd/api_metrics: Use `*state.State` instead of
 `time.Time` on `internalMetrics`

Signed-off-by: hamistao
---
 lxd/api_metrics.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lxd/api_metrics.go b/lxd/api_metrics.go
index 3fe4087efa13..4d8237ffebf6 100644
--- a/lxd/api_metrics.go
+++ b/lxd/api_metrics.go
@@ -129,7 +129,7 @@ func metricsGet(d *Daemon, r *http.Request) response.Response {
 		}
 
 		// Register internal metrics.
-		intMetrics = internalMetrics(ctx, s.StartTime, tx)
+		intMetrics = internalMetrics(ctx, s, tx)
 		return nil
 	})
 	if err != nil {
@@ -378,7 +378,7 @@ func getFilteredMetrics(s *state.State, r *http.Request, compress bool, metricSe
 	return response.SyncResponsePlain(true, compress, metricSet.String())
 }
 
-func internalMetrics(ctx context.Context, daemonStartTime time.Time, tx *db.ClusterTx) *metrics.MetricSet {
+func internalMetrics(ctx context.Context, s *state.State, tx *db.ClusterTx) *metrics.MetricSet {
 	out := metrics.NewMetricSet(nil)
 
 	warnings, err := dbCluster.GetWarnings(ctx, tx.Tx())
@@ -421,7 +421,7 @@ func internalMetrics(ctx context.Context, daemonStartTime time.Time, tx *db.Clus
 	}
 
 	// Daemon uptime
-	out.AddSamples(metrics.UptimeSeconds, metrics.Sample{Value: time.Since(daemonStartTime).Seconds()})
+	out.AddSamples(metrics.UptimeSeconds, metrics.Sample{Value: time.Since(s.StartTime).Seconds()})
 
 	// Number of goroutines
 	out.AddSamples(metrics.GoGoroutines, metrics.Sample{Value: float64(runtime.NumGoroutine())})

From 8e07bd0dd8b86dd73ff35812091ac85b2827337e Mon Sep 17 00:00:00 2001
From: hamistao
Date: Tue, 22 Oct 2024 18:55:13 -0300
Subject: [PATCH 5/7] lxd/api_metrics: Filter query for Warnings appropriately

Filters the Warnings query in the metrics handler by Node.
Since some Warnings do not have a node, nodeless Warnings are only
included when the metrics are queried from the leader node.

Signed-off-by: hamistao
---
 lxd/api_metrics.go | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/lxd/api_metrics.go b/lxd/api_metrics.go
index 4d8237ffebf6..3a3bb8b0108b 100644
--- a/lxd/api_metrics.go
+++ b/lxd/api_metrics.go
@@ -14,6 +14,7 @@ import (
 	"github.com/canonical/lxd/lxd/auth"
 	"github.com/canonical/lxd/lxd/db"
 	dbCluster "github.com/canonical/lxd/lxd/db/cluster"
+	"github.com/canonical/lxd/lxd/db/warningtype"
 	"github.com/canonical/lxd/lxd/instance"
 	instanceDrivers "github.com/canonical/lxd/lxd/instance/drivers"
 	"github.com/canonical/lxd/lxd/instance/instancetype"
@@ -378,10 +379,39 @@ func getFilteredMetrics(s *state.State, r *http.Request, compress bool, metricSe
 	return response.SyncResponsePlain(true, compress, metricSet.String())
 }
 
+// clusterMemberWarnings returns the list of unresolved and unacknowledged warnings related to this cluster member.
+// If this member is the leader, it also includes nodeless warnings.
+// This way nodeless warnings are included without being counted redundantly across cluster members.
+func clusterMemberWarnings(ctx context.Context, s *state.State, tx *db.ClusterTx) ([]dbCluster.Warning, error) {
+	var filters []dbCluster.WarningFilter
+
+	leaderInfo, err := s.LeaderInfo()
+	if err != nil {
+		return nil, err
+	}
+
+	// Use a local variable to get a pointer.
+	emptyNode := ""
+
+	for status := range warningtype.Statuses {
+		// Do not include warnings that are resolved but not yet pruned, nor those that were acknowledged.
+		if status != warningtype.StatusResolved && status != warningtype.StatusAcknowledged {
+			filters = append(filters, dbCluster.WarningFilter{Node: &s.ServerName, Status: &status})
+			if leaderInfo.Leader {
+				// Count the nodeless warnings as belonging to the leader node.
+				filters = append(filters, dbCluster.WarningFilter{Node: &emptyNode, Status: &status})
+			}
+		}
+	}
+
+	return dbCluster.GetWarnings(ctx, tx.Tx(), filters...)
+}
+
 func internalMetrics(ctx context.Context, s *state.State, tx *db.ClusterTx) *metrics.MetricSet {
 	out := metrics.NewMetricSet(nil)
 
-	warnings, err := dbCluster.GetWarnings(ctx, tx.Tx())
+	warnings, err := clusterMemberWarnings(ctx, s, tx)
+
 	if err != nil {
 		logger.Warn("Failed to get warnings", logger.Ctx{"err": err})
 	} else {

From 2d4dcd0364b9bf2d6b78d806e202e9642571f70e Mon Sep 17 00:00:00 2001
From: hamistao
Date: Fri, 15 Nov 2024 15:07:01 -0300
Subject: [PATCH 6/7] test: Test for querying metrics on a cluster

Signed-off-by: hamistao
---
 test/suites/clustering.sh | 80 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/test/suites/clustering.sh b/test/suites/clustering.sh
index 4e2aeb7f8ff6..b74cec589217 100644
--- a/test/suites/clustering.sh
+++ b/test/suites/clustering.sh
@@ -1963,6 +1963,86 @@ test_clustering_projects() {
   kill_lxd "${LXD_TWO_DIR}"
 }
 
+test_clustering_metrics() {
+  local LXD_DIR
+
+  setup_clustering_bridge
+  prefix="lxd$$"
+  bridge="${prefix}"
+
+  setup_clustering_netns 1
+  LXD_ONE_DIR=$(mktemp -d -p "${TEST_DIR}" XXX)
+  chmod +x "${LXD_ONE_DIR}"
+  ns1="${prefix}1"
+  spawn_lxd_and_bootstrap_cluster "${ns1}" "${bridge}" "${LXD_ONE_DIR}"
+
+  # Add a newline at the end of each line. YAML has weird rules.
+  cert=$(sed ':a;N;$!ba;s/\n/\n\n/g' "${LXD_ONE_DIR}/cluster.crt")
+
+  # Spawn a second node
+  setup_clustering_netns 2
+  LXD_TWO_DIR=$(mktemp -d -p "${TEST_DIR}" XXX)
+  chmod +x "${LXD_TWO_DIR}"
+  ns2="${prefix}2"
+  spawn_lxd_and_join_cluster "${ns2}" "${bridge}" "${cert}" 2 1 "${LXD_TWO_DIR}" "${LXD_ONE_DIR}"
+
+  # Create one running container on each node and a stopped one on the leader.
+  LXD_DIR="${LXD_ONE_DIR}" deps/import-busybox --project default --alias testimage
+  LXD_DIR="${LXD_ONE_DIR}" lxc launch --target node1 testimage c1
+  LXD_DIR="${LXD_ONE_DIR}" lxc init --target node1 testimage stopped
+  LXD_DIR="${LXD_ONE_DIR}" lxc launch --target node2 testimage c2
+
+  # Check that scraping metrics on each node only includes started instances on that node.
+  LXD_DIR="${LXD_ONE_DIR}" lxc query "/1.0/metrics" | grep 'name="c1"'
+  ! LXD_DIR="${LXD_ONE_DIR}" lxc query "/1.0/metrics" | grep 'name="stopped"' || false
+  ! LXD_DIR="${LXD_ONE_DIR}" lxc query "/1.0/metrics" | grep 'name="c2"' || false
+  ! LXD_DIR="${LXD_TWO_DIR}" lxc query "/1.0/metrics" | grep 'name="c1"' || false
+  LXD_DIR="${LXD_TWO_DIR}" lxc query "/1.0/metrics" | grep 'name="c2"'
+
+  # The stopped container is counted in lxd_instances.
+  LXD_DIR="${LXD_ONE_DIR}" lxc query /1.0/metrics | grep -xF 'lxd_instances{project="default",type="container"} 2'
+  LXD_DIR="${LXD_TWO_DIR}" lxc query /1.0/metrics | grep -xF 'lxd_instances{project="default",type="container"} 1'
+
+  # Remove previously existing warnings so they don't interfere with tests.
+  LXD_DIR="${LXD_ONE_DIR}" lxc warning delete --all
+
+  # Populate the database with dummy warnings and check that each node only counts its own warnings.
+  LXD_DIR="${LXD_ONE_DIR}" lxc query --wait -X POST -d '{\"location\": \"node1\", \"type_code\": 0, \"message\": \"node1 is in a bad mood\"}' /internal/testing/warnings
+  LXD_DIR="${LXD_ONE_DIR}" lxc query --wait -X POST -d '{\"location\": \"node1\", \"type_code\": 1, \"message\": \"node1 is bored\"}' /internal/testing/warnings
+  LXD_DIR="${LXD_ONE_DIR}" lxc query --wait -X POST -d '{\"location\": \"node2\", \"type_code\": 0, \"message\": \"node2 is too cool for this\"}' /internal/testing/warnings
+
+  LXD_DIR="${LXD_ONE_DIR}" lxc query "/1.0/metrics" | grep -xF "lxd_warnings_total 2"
+  LXD_DIR="${LXD_TWO_DIR}" lxc query "/1.0/metrics" | grep -xF "lxd_warnings_total 1"
+
+  # Add a nodeless warning and check that the count increments only on the leader node.
+  LXD_DIR="${LXD_ONE_DIR}" lxc query --wait -X POST -d '{\"type_code\": 0, \"message\": \"nodeless warning\"}' /internal/testing/warnings
+
+  LXD_DIR="${LXD_ONE_DIR}" lxc query "/1.0/metrics" | grep -xF "lxd_warnings_total 3"
+  LXD_DIR="${LXD_TWO_DIR}" lxc query "/1.0/metrics" | grep -xF "lxd_warnings_total 1"
+
+  # Acknowledge a warning and check that the count decrements on the node the warning belongs to.
+  uuid=$(LXD_DIR="${LXD_ONE_DIR}" lxc warning list --format json | jq -r '.[] | select(.last_message=="node1 is bored") | .uuid')
+  LXD_DIR="${LXD_ONE_DIR}" lxc warning ack "${uuid}"
+
+  LXD_DIR="${LXD_ONE_DIR}" lxc query "/1.0/metrics" | grep -xF "lxd_warnings_total 2"
+  LXD_DIR="${LXD_TWO_DIR}" lxc query "/1.0/metrics" | grep -xF "lxd_warnings_total 1"
+
+  LXD_DIR="${LXD_ONE_DIR}" lxc delete -f c1 stopped c2
+  LXD_DIR="${LXD_ONE_DIR}" lxc image delete testimage
+
+  LXD_DIR="${LXD_TWO_DIR}" lxd shutdown
+  LXD_DIR="${LXD_ONE_DIR}" lxd shutdown
+  sleep 0.5
+  rm -f "${LXD_TWO_DIR}/unix.socket"
+  rm -f "${LXD_ONE_DIR}/unix.socket"
+
+  teardown_clustering_netns
+  teardown_clustering_bridge
+
+  kill_lxd "${LXD_ONE_DIR}"
+  kill_lxd "${LXD_TWO_DIR}"
+}
+
 test_clustering_address() {
   local LXD_DIR
 

From c5677203b810702c68aed94112542e4c6cc9029d Mon Sep 17 00:00:00 2001
From: hamistao
Date: Fri, 15 Nov 2024 15:07:41 -0300
Subject: [PATCH 7/7] test: Run `test_clustering_metrics` with cluster tests

Signed-off-by: hamistao
---
 test/main.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/main.sh b/test/main.sh
index d213c9d4ef73..1378d5ed849a 100755
--- a/test/main.sh
+++ b/test/main.sh
@@ -265,6 +265,7 @@ if [ "${1:-"all"}" != "standalone" ]; then
   run_test test_clustering_join_api "clustering join api"
   run_test test_clustering_shutdown_nodes "clustering shutdown"
   run_test test_clustering_projects "clustering projects"
+  run_test test_clustering_metrics "clustering metrics"
   run_test test_clustering_update_cert "clustering update cert"
   run_test test_clustering_update_cert_reversion "clustering update cert reversion"
  run_test test_clustering_update_cert_token "clustering update cert token"
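
For context, the sketch below is not part of the patch series. It illustrates how the Node-and-Status filter added in patches 2/7 and 3/7 is consumed by the clusterMemberWarnings helper from patch 5/7, and what the generated GetWarnings does with several filters (their WHERE clauses are ORed together). The activeWarningFilters function and the standalone main are illustrative only, not code proposed for merge, and the sketch assumes Go 1.22+ per-iteration loop variables, which is what makes taking &status inside the loop safe.

package main

import (
	"fmt"

	dbCluster "github.com/canonical/lxd/lxd/db/cluster"
	"github.com/canonical/lxd/lxd/db/warningtype"
)

// activeWarningFilters mirrors the filter construction in clusterMemberWarnings:
// one filter per non-resolved, non-acknowledged status for this member, plus the
// same statuses with an empty node name when this member is the leader.
func activeWarningFilters(serverName string, isLeader bool) []dbCluster.WarningFilter {
	var filters []dbCluster.WarningFilter

	// Local variable so its address can be taken for the filter.
	emptyNode := ""

	for status := range warningtype.Statuses {
		if status == warningtype.StatusResolved || status == warningtype.StatusAcknowledged {
			continue
		}

		filters = append(filters, dbCluster.WarningFilter{Node: &serverName, Status: &status})
		if isLeader {
			// Nodeless warnings are claimed by the leader so they are counted exactly once.
			filters = append(filters, dbCluster.WarningFilter{Node: &emptyNode, Status: &status})
		}
	}

	return filters
}

func main() {
	// On the leader, GetWarnings receives two filters per active status; the
	// generated mapper combines them roughly as:
	//   WHERE ( coalesce(node, '') = 'node1' AND warnings.status = ? )
	//     OR ( coalesce(node, '') = '' AND warnings.status = ? )
	fmt.Println(len(activeWarningFilters("node1", true)))
}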