From 72348ed52fadf3fa0f94eaf0e43aa5456ae82a51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Corentin=20N=C3=A9au?= Date: Mon, 1 Jul 2024 14:55:31 +0200 Subject: [PATCH] Clean up content resources through finalizers Content resources may be used by one or more bundle deployments, which now each add their own finalizer on a content resource to indicate their reliance on it, and delete that finalizer when that reliance no longer applies. This eliminates the need for batch cleanup of content resources. --- charts/fleet/templates/rbac_gitjob.yaml | 1 + e2e/single-cluster/gitrepo_test.go | 48 ++++++-- e2e/single-cluster/oci_registry_test.go | 6 +- .../cmd/controller/cleanup/content/purge.go | 110 ------------------ internal/cmd/controller/cleanup/start.go | 5 - internal/cmd/controller/finalize/finalize.go | 43 +++++++ .../reconciler/bundle_controller.go | 49 +++++++- 7 files changed, 133 insertions(+), 129 deletions(-) delete mode 100644 internal/cmd/controller/cleanup/content/purge.go diff --git a/charts/fleet/templates/rbac_gitjob.yaml b/charts/fleet/templates/rbac_gitjob.yaml index 35d2072af2..e2c93169de 100644 --- a/charts/fleet/templates/rbac_gitjob.yaml +++ b/charts/fleet/templates/rbac_gitjob.yaml @@ -51,6 +51,7 @@ rules: - "bundles" - "bundledeployments" - "imagescans" + - "contents" verbs: - list - delete diff --git a/e2e/single-cluster/gitrepo_test.go b/e2e/single-cluster/gitrepo_test.go index 5c9792c257..d930f1dc9c 100644 --- a/e2e/single-cluster/gitrepo_test.go +++ b/e2e/single-cluster/gitrepo_test.go @@ -23,6 +23,7 @@ import ( "github.com/rancher/fleet/e2e/testenv/kubectl" fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" "github.com/rancher/wrangler/v2/pkg/genericcondition" + "github.com/rancher/wrangler/v2/pkg/kv" ) const ( @@ -69,17 +70,48 @@ var _ = Describe("Monitoring Git repos via HTTP for change", Label("infra-setup" AfterEach(func() { _ = os.RemoveAll(tmpDir) - // Check that the bundle deployment is properly deleted - out, _ := k.Get("bundledeployments", "-A") - Expect(out).To(ContainSubstring(gitrepoName)) + deployID, err := k.Get( + "bundledeployments", + "-A", + "-l", + fmt.Sprintf("fleet.cattle.io/repo-name=%s", gitrepoName), + "-o=jsonpath={.items[*].spec.deploymentID}", + ) + Expect(err).ToNot(HaveOccurred()) + + // Check existence of content resource + contentID, _ := kv.Split(deployID, ":") + + out, err := k.Get("content", contentID) + Expect(err).ToNot(HaveOccurred(), out) - _, err := k.Delete("gitrepo", gitrepoName) + _, err = k.Delete("gitrepo", gitrepoName) Expect(err).ToNot(HaveOccurred()) - Eventually(func() string { - out, _ := k.Get("bundledeployments", "-A") - return out - }).ShouldNot(ContainSubstring(gitrepoName)) + // Check that the bundle deployment and content resources have been deleted + Eventually(func(g Gomega) { + out, _ := k.Get( + "bundledeployments", + "-A", + "-l", + fmt.Sprintf("fleet.cattle.io/repo-name=%s", gitrepoName), + ) + g.Expect(out).To(ContainSubstring("No resources found")) + + // Check that the last known content resource for the gitrepo has been deleted + out, _ = k.Get("content", contentID) + g.Expect(out).To(ContainSubstring("not found")) + + // Check that no content resource is left for the gitrepo (or rather its bundle(s)) + out, err := k.Get( + "contents", + `-o=jsonpath=jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.finalizers}`+ + `{"\n"}{end}'`, + ) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(out).ToNot(ContainSubstring(gitrepoName)) + + }).Should(Succeed()) out, err = k.Namespace("cattle-fleet-system").Logs( "-l", diff --git a/e2e/single-cluster/oci_registry_test.go b/e2e/single-cluster/oci_registry_test.go index c78a23af64..c52fb71ad5 100644 --- a/e2e/single-cluster/oci_registry_test.go +++ b/e2e/single-cluster/oci_registry_test.go @@ -198,11 +198,11 @@ var _ = Describe("Single Cluster Deployments using OCI registry", Label("oci-reg updateExperimentalFlagValue(k, experimentalFlagBefore) } - // purge content id if needed + // check that contents have been purged when deleting the gitrepo if contentIDToPurge != "" { out, err := k.Delete("contents", contentIDToPurge) - Expect(out).To(ContainSubstring("deleted")) - Expect(err).ToNot(HaveOccurred()) + Expect(out).To(ContainSubstring("not found")) + Expect(err).To(HaveOccurred()) } }) diff --git a/internal/cmd/controller/cleanup/content/purge.go b/internal/cmd/controller/cleanup/content/purge.go deleted file mode 100644 index dac972e1a8..0000000000 --- a/internal/cmd/controller/cleanup/content/purge.go +++ /dev/null @@ -1,110 +0,0 @@ -// Package content purges orphaned content objects by inspecting bundledeployments in all namespaces. Runs every 5 minutes. -package content - -import ( - "context" - - "github.com/sirupsen/logrus" - - fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" - "github.com/rancher/fleet/pkg/durations" - fleetcontrollers "github.com/rancher/fleet/pkg/generated/controllers/fleet.cattle.io/v1alpha1" - - corecontrollers "github.com/rancher/wrangler/v2/pkg/generated/controllers/core/v1" - "github.com/rancher/wrangler/v2/pkg/kv" - "github.com/rancher/wrangler/v2/pkg/ticker" - - "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -type contentRef struct { - safeToDelete bool - markForDeletion bool - bundleCount int -} - -// PurgeOrphanedInBackground cleans up all orphan contents in a different goroutine. It checks all contents every 5 minutes. -func PurgeOrphanedInBackground(ctx context.Context, content fleetcontrollers.ContentController, bundleDeployment fleetcontrollers.BundleDeploymentController, namespaceClient corecontrollers.NamespaceClient) { - go purgeOrphaned(ctx, content, bundleDeployment, namespaceClient) -} - -func purgeOrphaned(ctx context.Context, content fleetcontrollers.ContentController, bundleDeployment fleetcontrollers.BundleDeploymentController, namespaceClient corecontrollers.NamespaceClient) { - deleteRefs := make(map[string]*contentRef) - - for range ticker.Context(ctx, durations.ContentPurgeInterval) { - logrus.Debugf("Checking for orphaned content objects") - namespaces, err := namespaceClient.List(metav1.ListOptions{}) - if err != nil { - logrus.Warnf("Error reading namespaceClient %v", err) - continue - } - var bundleDeployments []fleet.BundleDeployment - for _, ns := range namespaces.Items { - nsBundleDeployments, err := bundleDeployment.List(ns.Name, metav1.ListOptions{}) - if err != nil { - logrus.Warnf("Error listing bundle deployments %v", err) - continue - } - bundleDeployments = append(bundleDeployments, nsBundleDeployments.Items...) - } - - contentRefs := make(map[string]*contentRef) - - contents, err := content.List(metav1.ListOptions{}) - if err != nil { - logrus.Warnf("Error reading contents %v", err) - continue - } - - for _, content := range contents.Items { - contentRefs[content.Name] = &contentRef{ - safeToDelete: false, - bundleCount: 0, - } - } - - for _, bd := range bundleDeployments { - deployManifestID, _ := kv.Split(bd.Spec.DeploymentID, ":") - if val, ok := contentRefs[deployManifestID]; ok { - val.bundleCount++ - } - - stagedManifestID, _ := kv.Split(bd.Spec.StagedDeploymentID, ":") - if val, ok := contentRefs[stagedManifestID]; ok && stagedManifestID != deployManifestID { - val.bundleCount++ - } - } - - for contentName, cr := range contentRefs { - _, deleteCandidate := deleteRefs[contentName] - if cr.bundleCount > 0 { - if deleteCandidate { - delete(deleteRefs, contentName) - } - } else { - if deleteCandidate { - deleteRefs[contentName].safeToDelete = true - } else { - logrus.Infof("Marking orphaned content[%s] for deletion", contentName) - deleteRefs[contentName] = &contentRef{ - bundleCount: 0, - markForDeletion: true, - safeToDelete: false, - } - } - } - } - - for contentName, dr := range deleteRefs { - if dr.safeToDelete { - logrus.Infof("Deleting orphaned content[%s]", contentName) - if err := content.Delete(contentName, &metav1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) { - logrus.Warnf("Error deleting contentbundle %v", err) - } else { - delete(deleteRefs, contentName) - } - } - } - } -} diff --git a/internal/cmd/controller/cleanup/start.go b/internal/cmd/controller/cleanup/start.go index e6c8850e6d..722e227cca 100644 --- a/internal/cmd/controller/cleanup/start.go +++ b/internal/cmd/controller/cleanup/start.go @@ -3,7 +3,6 @@ package cleanup import ( "context" - "github.com/rancher/fleet/internal/cmd/controller/cleanup/content" "github.com/rancher/fleet/internal/cmd/controller/cleanup/controllers" "github.com/rancher/wrangler/v2/pkg/kubeconfig" "github.com/rancher/wrangler/v2/pkg/leader" @@ -36,10 +35,6 @@ func start(ctx context.Context, kubeConfig, namespace string) error { if err := controllers.Register(ctx, appCtx); err != nil { logrus.Fatal(err) } - content.PurgeOrphanedInBackground(ctx, appCtx.Content(), appCtx.BundleDeployment(), appCtx.Core.Namespace()) - if err := appCtx.Start(ctx); err != nil { - logrus.Fatal(err) - } }) return nil diff --git a/internal/cmd/controller/finalize/finalize.go b/internal/cmd/controller/finalize/finalize.go index 9af3a24879..eb59903dd6 100644 --- a/internal/cmd/controller/finalize/finalize.go +++ b/internal/cmd/controller/finalize/finalize.go @@ -6,6 +6,8 @@ import ( "strings" "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" + "github.com/rancher/wrangler/v2/pkg/kv" + "github.com/sirupsen/logrus" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -100,6 +102,47 @@ func PurgeBundleDeployments(ctx context.Context, c client.Client, bundle types.N if err != nil { return err } + + if err = PurgeContent(ctx, c, bd.Name, bd.Spec.DeploymentID); err != nil { + return err + } + } + + return nil +} + +// PurgeContent tries to delete the content resource related with the given bundle deployment. +func PurgeContent(ctx context.Context, c client.Client, name, deplID string) error { + contentID, _ := kv.Split(deplID, ":") + content := &v1alpha1.Content{} + if err := c.Get(ctx, types.NamespacedName{Name: contentID}, content); err != nil { + return client.IgnoreNotFound(err) + } + + nn := types.NamespacedName{Name: content.Name} + if controllerutil.ContainsFinalizer(content, name) { + err := retry.RetryOnConflict(retry.DefaultRetry, func() error { + if err := c.Get(ctx, nn, content); err != nil { + return err + } + + logrus.Infof("Removing finalizer %s from content resource %s", name, contentID) + controllerutil.RemoveFinalizer(content, name) + + return c.Update(ctx, content) + }) + if err != nil { + return err + } + } + + hasOtherFinalizers := (len(content.Finalizers) > 1) || + ((len(content.Finalizers) == 1) && content.Finalizers[0] != name) + if !hasOtherFinalizers { + logrus.Infof("Deleting content resource %s", contentID) + if err := c.Delete(ctx, content); err != nil { + return err + } } return nil diff --git a/internal/cmd/controller/reconciler/bundle_controller.go b/internal/cmd/controller/reconciler/bundle_controller.go index 34b0692c70..041bb915a0 100644 --- a/internal/cmd/controller/reconciler/bundle_controller.go +++ b/internal/cmd/controller/reconciler/bundle_controller.go @@ -207,8 +207,7 @@ func (r *BundleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr if !contentsInOCI && len(matchedTargets) > 0 { // when not using the OCI registry we need to create a contents resource // so the BundleDeployments are able to access the contents to be deployed. - // Otherwise, do not create a content resource if there are no targets, it will - // only create work for `PurgeOrphanedInBackground`. + // Otherwise, do not create a content resource if there are no targets. // `fleet apply` puts all resources into `bundle.Spec.Resources`. // `Store` copies all the resources into the content resource. // There is no pruning of unused resources. Therefore we write @@ -253,7 +252,12 @@ func (r *BundleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr continue } if target.Deployment.Namespace == "" { - logger.V(1).Info("Skipping bundledeployment with empty namespace, waiting for agentmanagement to set cluster.status.namespace", "bundledeployment", target.Deployment) + logger.V(1).Info( + "Skipping bundledeployment with empty namespace, "+ + "waiting for agentmanagement to set cluster.status.namespace", + "bundledeployment", + target.Deployment, + ) continue } @@ -267,8 +271,47 @@ func (r *BundleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr bd.Spec.OCIContents = contentsInOCI + err := retry.RetryOnConflict(retry.DefaultRetry, func() error { + if contentsInOCI { + return nil // no contents resources stored in etcd, no finalizers to add here. + } + + content := &fleet.Content{} + if err := r.Get(ctx, types.NamespacedName{Name: manifestID}, content); err != nil { + return client.IgnoreNotFound(err) + } + + controllerutil.AddFinalizer(content, bd.Name) + + return r.Update(ctx, content) + }) + if err != nil { + logger.Error(err, "Reconcile failed to add content finalizer", "content ID", manifestID) + } + updated := bd.DeepCopy() op, err := controllerutil.CreateOrUpdate(ctx, r.Client, bd, func() error { + // When this mutation function is called by CreateOrUpdate, bd contains the + // _old_ bundle deployment, if any. + // The corresponding Content resource must only be deleted if it is no longer in use, ie if the + // latest version of the bundle points to a different deployment ID. + // An empty value for bd.Spec.DeploymentID means that we are deploying the first version of this + // bundle, hence there are no Contents left over to purge. + if !bd.Spec.OCIContents && + bd.Spec.DeploymentID != "" && + bd.Spec.DeploymentID != updated.Spec.DeploymentID { + if err := finalize.PurgeContent(ctx, r.Client, bd.Name, bd.Spec.DeploymentID); err != nil { + logger.Error( + err, + "Reconcile failed to purge old content resource", + "bundledeployment", + bd, + "deploymentID", + bd.Spec.DeploymentID, + ) + } + } + bd.Spec = updated.Spec bd.Labels = updated.GetLabels() return nil