From 99d6bb07abc7d9f50002a4a6dac3afb2b6fccb00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacob=20Baung=C3=A5rd=20Hansen?= Date: Wed, 9 Oct 2024 13:39:18 +0200 Subject: [PATCH] ACM-14639: Dashboard loader: retry adding folder on errors (#1646) * Dashboard loader: retry adding folder on errors Grafana sometimes fails to correctly set (default) permissions for folders added via the API. This seems to mostly affect the "Custom" folder. When this happens, while the folder is added, it's not possible for ACM users to view the folder, and its dashboards. This commit tries to detect that case, and retry adding the folder if it fails. Additional minor changes: - Configure grafana to retry queries - Log messages from the component all starts in lower-case. Signed-off-by: Jacob Baungard Hansen * Grafana-dev test: Use default dashboard Previously, the grafana-dev test would create a new custom dashboard, and test the exporting to configmap using that. However since adding custom dashboards are not so reliable after Grafana 11 update, we change the test to use of the default dashboards. This should improve test reliability, and there are no difference whether we export a custom dashboard or a default one. Signed-off-by: Jacob Baungard Hansen * Tests: increase timeout for custom dashboard tests Since these are unreliable, and requires a few retries increase the test timeout, to give a better chance of success. Signed-off-by: Jacob Baungard Hansen * Grafana-dev test: don't delete custom dashboard .. since we no longer add it. Signed-off-by: Jacob Baungard Hansen * Dashboard loader: use exit code 1 instead of 3 1 is slightly better than one, since 1 is usually used as a generic "the program had an error" exit code. Signed-off-by: Jacob Baungard Hansen * Dashboard loader: no forceful exit, increase retry - Don't forcefully exit - bump retries - naming nits Signed-off-by: Jacob Baungard Hansen --------- Signed-off-by: Jacob Baungard Hansen --- .../pkg/controller/dashboard_controller.go | 119 ++++++++++++------ .../controller/dashboard_controller_test.go | 8 +- .../manifests/base/grafana/config.yaml | 2 + tests/grafana-dev-test.sh | 8 +- .../pkg/tests/observability_dashboard_test.go | 6 +- 5 files changed, 92 insertions(+), 51 deletions(-) diff --git a/loaders/dashboards/pkg/controller/dashboard_controller.go b/loaders/dashboards/pkg/controller/dashboard_controller.go index 3d3367ef7..1cd8ef04e 100644 --- a/loaders/dashboards/pkg/controller/dashboard_controller.go +++ b/loaders/dashboards/pkg/controller/dashboard_controller.go @@ -8,6 +8,7 @@ import ( "bytes" "context" "encoding/json" + "errors" "fmt" "net/http" "os" @@ -40,24 +41,25 @@ const ( var ( grafanaURI = "http://127.0.0.1:3001" // Retry on errors. - retry = 10 + maxHttpRetry = 10 + maxDashboardRetry = 40 ) // RunGrafanaDashboardController ... func RunGrafanaDashboardController(stop <-chan struct{}) { config, err := clientcmd.BuildConfigFromFlags("", "") if err != nil { - klog.Error("Failed to get cluster config", "error", err) + klog.Error("failed to get cluster config", "error", err) } // Build kubeclient client and informer for managed cluster kubeClient, err := kubernetes.NewForConfig(config) if err != nil { - klog.Fatal("Failed to build kubeclient", "error", err) + klog.Fatal("failed to build kubeclient", "error", err) } informer, err := newKubeInformer(kubeClient.CoreV1()) if err != nil { - klog.Fatal("Failed to get informer", "error", err) + klog.Fatal("failed to get informer", "error", err) } go informer.Run(stop) @@ -109,7 +111,23 @@ func newKubeInformer(coreClient corev1client.CoreV1Interface) (cache.SharedIndex return } klog.Infof("detect there is a new dashboard %v created", obj.(*corev1.ConfigMap).Name) - updateDashboard(nil, obj, false) + err := updateDashboard(nil, obj, false) + + times := 0 + for { + if err == nil { + break + } else if times == maxDashboardRetry { + klog.Errorf("dashboard: %s could not be created after retrying %v times", obj.(*corev1.ConfigMap).Name, maxDashboardRetry) + break + } + + klog.Warningf("creation of dashboard: %v failed. Retrying in 10s. Error: %v", obj.(*corev1.ConfigMap).Name, err) + time.Sleep(time.Second * 10) + err = updateDashboard(nil, obj, false) + + times++ + } }, UpdateFunc: func(old, new interface{}) { if old.(*corev1.ConfigMap).ObjectMeta.ResourceVersion == new.(*corev1.ConfigMap).ObjectMeta.ResourceVersion { @@ -119,7 +137,24 @@ func newKubeInformer(coreClient corev1client.CoreV1Interface) (cache.SharedIndex return } klog.Infof("detect there is a dashboard %v updated", new.(*corev1.ConfigMap).Name) - updateDashboard(old, new, false) + err := updateDashboard(old, new, false) + + times := 0 + for { + if err == nil { + break + } else if times == maxDashboardRetry { + klog.Errorf("dashboard: %s could not be created after retrying %v times", new.(*corev1.ConfigMap).Name, maxDashboardRetry) + break + } + + klog.Warningf("updating of dashboard: %v failed. Retrying in 10s. Error: %v", new.(*corev1.ConfigMap).Name, err) + time.Sleep(time.Second * 10) + + err = updateDashboard(old, new, false) + + times++ + } }, DeleteFunc: func(obj interface{}) { if !isDesiredDashboardConfigmap(obj) { @@ -138,7 +173,7 @@ func newKubeInformer(coreClient corev1client.CoreV1Interface) (cache.SharedIndex func hasCustomFolder(folderTitle string) float64 { grafanaURL := grafanaURI + "/api/folders" - body, _ := util.SetRequest("GET", grafanaURL, nil, retry) + body, _ := util.SetRequest("GET", grafanaURL, nil, maxHttpRetry) folders := []map[string]interface{}{} err := json.Unmarshal(body, &folders) @@ -158,14 +193,28 @@ func hasCustomFolder(folderTitle string) float64 { func createCustomFolder(folderTitle string) float64 { folderID := hasCustomFolder(folderTitle) if folderID == 0 { + // Create the folder grafanaURL := grafanaURI + "/api/folders" - body, _ := util.SetRequest("POST", grafanaURL, strings.NewReader("{\"title\":\""+folderTitle+"\"}"), retry) + body, _ := util.SetRequest("POST", grafanaURL, strings.NewReader("{\"title\":\""+folderTitle+"\"}"), maxHttpRetry) folder := map[string]interface{}{} err := json.Unmarshal(body, &folder) if err != nil { klog.Error(unmarshallErrMsg, "error", err) return 0 } + + time.Sleep(time.Second * 1) + // check if permissions were set correctly as sometimes this silently fails in Grafana + grafanaURL = grafanaURI + "/api/folders/" + folder["uid"].(string) + "/permissions" + body, _ = util.SetRequest("GET", grafanaURL, nil, maxHttpRetry) + if string(body) == "[]" { + // if this fails no permissions are set. In which case we want to delete the folder and try again... + klog.Warningf("failed to set permissions for folder: %v. Deleting folder and retrying later.", folderTitle) + + time.Sleep(time.Second * 5) + deleteCustomFolder(folder["id"].(float64)) + return 0 + } return folder["id"].(float64) } return folderID @@ -173,7 +222,7 @@ func createCustomFolder(folderTitle string) float64 { func getCustomFolderUID(folderID float64) string { grafanaURL := grafanaURI + "/api/folders/id/" + fmt.Sprint(folderID) - body, _ := util.SetRequest("GET", grafanaURL, nil, retry) + body, _ := util.SetRequest("GET", grafanaURL, nil, maxHttpRetry) folder := map[string]interface{}{} err := json.Unmarshal(body, &folder) if err != nil { @@ -194,7 +243,7 @@ func isEmptyFolder(folderID float64) bool { } grafanaURL := grafanaURI + "/api/search?folderIds=" + fmt.Sprint(folderID) - body, _ := util.SetRequest("GET", grafanaURL, nil, retry) + body, _ := util.SetRequest("GET", grafanaURL, nil, maxHttpRetry) dashboards := []map[string]interface{}{} err := json.Unmarshal(body, &dashboards) if err != nil { @@ -217,12 +266,12 @@ func deleteCustomFolder(folderID float64) bool { uid := getCustomFolderUID(folderID) if uid == "" { - klog.Error("Failed to get custom folder UID") + klog.Error("failed to get custom folder UID") return false } grafanaURL := grafanaURI + "/api/folders/" + uid - _, respStatusCode := util.SetRequest("DELETE", grafanaURL, nil, retry) + _, respStatusCode := util.SetRequest("DELETE", grafanaURL, nil, maxHttpRetry) if respStatusCode != http.StatusOK { klog.Errorf("failed to delete custom folder %v with %v", folderID, respStatusCode) return false @@ -251,14 +300,13 @@ func getDashboardCustomFolderTitle(obj interface{}) string { } // updateDashboard is used to update the customized dashboards via calling grafana api. -func updateDashboard(old, new interface{}, overwrite bool) { +func updateDashboard(old, new interface{}, overwrite bool) error { folderID := 0.0 folderTitle := getDashboardCustomFolderTitle(new) if folderTitle != "" { folderID = createCustomFolder(folderTitle) if folderID == 0 { - klog.Error("Failed to get custom folder id") - return + return errors.New("failed to get folder id") } } @@ -267,8 +315,7 @@ func updateDashboard(old, new interface{}, overwrite bool) { dashboard := map[string]interface{}{} err := json.Unmarshal([]byte(value), &dashboard) if err != nil { - klog.Error("Failed to unmarshall data", "error", err) - return + return fmt.Errorf("failed to unmarshall data: %v", err) } if dashboard["uid"] == nil { dashboard["uid"], _ = util.GenerateUID(new.(*corev1.ConfigMap).GetName(), @@ -283,26 +330,24 @@ func updateDashboard(old, new interface{}, overwrite bool) { b, err := json.Marshal(data) if err != nil { - klog.Error("failed to marshal body", "error", err) - return + return fmt.Errorf("failed to marshal body: %v", err) } grafanaURL := grafanaURI + "/api/dashboards/db" - body, respStatusCode := util.SetRequest("POST", grafanaURL, bytes.NewBuffer(b), retry) + body, respStatusCode := util.SetRequest("POST", grafanaURL, bytes.NewBuffer(b), maxHttpRetry) if respStatusCode != http.StatusOK { if respStatusCode == http.StatusPreconditionFailed { if strings.Contains(string(body), "version-mismatch") { - updateDashboard(nil, new, true) + return updateDashboard(nil, new, true) } else if strings.Contains(string(body), "name-exists") { - klog.Info("the dashboard name already existed") + return fmt.Errorf("the dashboard name already existed") } else { - klog.Infof("failed to create/update: %v", respStatusCode) + return fmt.Errorf("failed to create/update dashboard: %v", respStatusCode) } } else { - klog.Infof("failed to create/update: %v", respStatusCode) + return fmt.Errorf("failed to create/update dashboard: %v", respStatusCode) } - return } if dashboard["title"] == homeDashboardTitle { @@ -314,23 +359,23 @@ func updateDashboard(old, new interface{}, overwrite bool) { } else { id, err := strconv.Atoi(strings.Trim(string(result[1]), " ")) if err != nil { - klog.Error(err, "failed to parse dashboard id") + return fmt.Errorf("failed to parse dashboard id: %v", err) } else { setHomeDashboard(id) } } } - klog.Info("Dashboard created/updated") + klog.Infof("dashboard: %v created/updated successfully", new.(*corev1.ConfigMap).Name) } folderTitle = getDashboardCustomFolderTitle(old) folderID = hasCustomFolder(folderTitle) if isEmptyFolder(folderID) { - if deleteCustomFolder(folderID) { - klog.Errorf("Failed to delete custom folder") - return + if !deleteCustomFolder(folderID) { + return errors.New("failed to delete custom folder") } } + return nil } // DeleteDashboard ... @@ -340,7 +385,7 @@ func deleteDashboard(obj interface{}) { dashboard := map[string]interface{}{} err := json.Unmarshal([]byte(value), &dashboard) if err != nil { - klog.Error("Failed to unmarshall data", "error", err) + klog.Error("failed to unmarshall data", "error", err) return } @@ -351,18 +396,18 @@ func deleteDashboard(obj interface{}) { grafanaURL := grafanaURI + "/api/dashboards/uid/" + uid - _, respStatusCode := util.SetRequest("DELETE", grafanaURL, nil, retry) + _, respStatusCode := util.SetRequest("DELETE", grafanaURL, nil, maxHttpRetry) if respStatusCode != http.StatusOK { klog.Errorf("failed to delete dashboard %v with %v", obj.(*corev1.ConfigMap).Name, respStatusCode) } else { - klog.Info("Dashboard deleted") + klog.Info("dashboard deleted") } folderTitle := getDashboardCustomFolderTitle(obj) folderID := hasCustomFolder(folderTitle) if isEmptyFolder(folderID) { - if deleteCustomFolder(folderID) { - klog.Errorf("Failed to delete custom folder") + if !deleteCustomFolder(folderID) { + klog.Errorf("failed to delete custom folder") return } } @@ -380,11 +425,11 @@ func setHomeDashboard(id int) { return } grafanaURL := grafanaURI + "/api/org/preferences" - _, respStatusCode := util.SetRequest("PUT", grafanaURL, bytes.NewBuffer(b), retry) + _, respStatusCode := util.SetRequest("PUT", grafanaURL, bytes.NewBuffer(b), maxHttpRetry) if respStatusCode != http.StatusOK { klog.Infof("failed to set home dashboard: %v", respStatusCode) } else { - klog.Info("Home dashboard is set") + klog.Info("home dashboard is set") } } diff --git a/loaders/dashboards/pkg/controller/dashboard_controller_test.go b/loaders/dashboards/pkg/controller/dashboard_controller_test.go index e12b66f6a..9ef58fc43 100644 --- a/loaders/dashboards/pkg/controller/dashboard_controller_test.go +++ b/loaders/dashboards/pkg/controller/dashboard_controller_test.go @@ -96,7 +96,7 @@ func TestGrafanaDashboardController(t *testing.T) { stop := make(chan struct{}) go createFakeServer(t) - retry = 1 + maxHttpRetry = 1 os.Setenv("POD_NAMESPACE", "ns2") @@ -237,7 +237,7 @@ func TestIsDesiredDashboardConfigmap(t *testing.T) { func TestGetCustomFolderUID(t *testing.T) { if !hasFakeServer { go createFakeServer(t) - retry = 1 + maxHttpRetry = 1 } testCaseList := []struct { @@ -273,7 +273,7 @@ func TestGetCustomFolderUID(t *testing.T) { func TestIsEmptyFolder(t *testing.T) { if !hasFakeServer { go createFakeServer(t) - retry = 1 + maxHttpRetry = 1 } testCaseList := []struct { @@ -351,7 +351,7 @@ func TestGetDashboardCustomFolderTitle(t *testing.T) { func TestDeleteCustomFolder(t *testing.T) { if !hasFakeServer { go createFakeServer(t) - retry = 1 + maxHttpRetry = 1 } testCaseList := []struct { diff --git a/operators/multiclusterobservability/manifests/base/grafana/config.yaml b/operators/multiclusterobservability/manifests/base/grafana/config.yaml index b5ce49689..0ca1ef339 100644 --- a/operators/multiclusterobservability/manifests/base/grafana/config.yaml +++ b/operators/multiclusterobservability/manifests/base/grafana/config.yaml @@ -39,3 +39,5 @@ stringData: enabled = false [unified_alerting] enabled = false + [database] + query_retries = 5 diff --git a/tests/grafana-dev-test.sh b/tests/grafana-dev-test.sh index 17629f080..2045febf4 100755 --- a/tests/grafana-dev-test.sh +++ b/tests/grafana-dev-test.sh @@ -10,9 +10,6 @@ base_dir="$( cd "$base_dir" obs_namespace=open-cluster-management-observability -# create a dashboard for test export grafana dashboard -kubectl apply -n "$obs_namespace" -f "$base_dir"/examples/dashboards/sample_custom_dashboard/custom-sample-dashboard.yaml - # test deploy grafana-dev cd $base_dir/tools ./setup-grafana-dev.sh --deploy @@ -63,7 +60,7 @@ fi n=0 until [ "$n" -ge 10 ]; do # test export grafana dashboard - ./generate-dashboard-configmap-yaml.sh "Sample Dashboard for E2E" + ./generate-dashboard-configmap-yaml.sh "ACM - Clusters Overview" if [ $? -eq 0 ]; then break fi @@ -81,6 +78,3 @@ if [ $? -ne 0 ]; then echo "Failed run setup-grafana-dev.sh --clean" exit 1 fi - -# clean test env -kubectl delete -n "$obs_namespace" -f "$base_dir"/examples/dashboards/sample_custom_dashboard/custom-sample-dashboard.yaml diff --git a/tests/pkg/tests/observability_dashboard_test.go b/tests/pkg/tests/observability_dashboard_test.go index e43ee580a..29ada8df3 100644 --- a/tests/pkg/tests/observability_dashboard_test.go +++ b/tests/pkg/tests/observability_dashboard_test.go @@ -47,7 +47,7 @@ var _ = Describe("Observability:", func() { Eventually(func() bool { _, result := utils.ContainDashboard(testOptions, dashboardTitle) return result - }, EventuallyTimeoutMinute*3, EventuallyIntervalSecond*5).Should(BeTrue()) + }, EventuallyTimeoutMinute*5, EventuallyIntervalSecond*5).Should(BeTrue()) }) It("[P2][Sev2][observability][Stable] Should have update custom dashboard after configmap updated (dashboard/g0)", func() { @@ -64,11 +64,11 @@ var _ = Describe("Observability:", func() { Eventually(func() bool { _, result := utils.ContainDashboard(testOptions, dashboardTitle) return result - }, EventuallyTimeoutMinute*3, EventuallyIntervalSecond*5).Should(BeFalse()) + }, EventuallyTimeoutMinute*5, EventuallyIntervalSecond*5).Should(BeFalse()) Eventually(func() bool { _, result := utils.ContainDashboard(testOptions, updateDashboardTitle) return result - }, EventuallyTimeoutMinute*3, EventuallyIntervalSecond*5).Should(BeTrue()) + }, EventuallyTimeoutMinute*5, EventuallyIntervalSecond*5).Should(BeTrue()) }) It("[P2][Sev2][observability][Stable] Should have no custom dashboard in grafana after related configmap removed (dashboard/g0)", func() {