Skip to content

Commit

Permalink
Merge pull request crossplane#428 from mergenci/async-panic-handler
Browse files Browse the repository at this point in the history
Recover from panics in async external clients
  • Loading branch information
mergenci authored Aug 22, 2024
2 parents 1644827 + 3dc4f0f commit 2e361ad
Show file tree
Hide file tree
Showing 2 changed files with 135 additions and 52 deletions.
107 changes: 81 additions & 26 deletions pkg/controller/external_async_tfpluginfw.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@ package controller

import (
"context"
"fmt"

xpv1 "github.com/crossplane/crossplane-runtime/apis/common/v1"
"github.com/crossplane/crossplane-runtime/pkg/logging"
"github.com/crossplane/crossplane-runtime/pkg/meta"
"github.com/crossplane/crossplane-runtime/pkg/reconciler/managed"
xpresource "github.com/crossplane/crossplane-runtime/pkg/resource"
"github.com/pkg/errors"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/crossplane/upjet/pkg/config"
Expand Down Expand Up @@ -131,25 +133,59 @@ func (n *terraformPluginFrameworkAsyncExternalClient) Observe(ctx context.Contex
return o, err
}

// panicHandler wraps an error, so that deferred functions that will
// be executed on a panic can access the error more conveniently.
type panicHandler struct {
err error
}

// recoverIfPanic recovers from panics, if any. Calls to this function
// should be defferred directly: `defer ph.recoverIfPanic()`. Panic
// recovery won't work if the call is wrapped in another function
// call, such as `defer func() { ph.recoverIfPanic() }()`. On
// recovery, API machinery panic handlers run. The implementation
// follows the outline of panic recovery mechanism in
// controller-runtime:
// https://github.com/kubernetes-sigs/controller-runtime/blob/v0.17.3/pkg/internal/controller/controller.go#L105-L112
func (ph *panicHandler) recoverIfPanic() {
ph.err = nil
if r := recover(); r != nil {
for _, fn := range utilruntime.PanicHandlers {
fn(r)
}

ph.err = fmt.Errorf("recovered from panic: %v", r)
}
}

func (n *terraformPluginFrameworkAsyncExternalClient) Create(_ context.Context, mg xpresource.Managed) (managed.ExternalCreation, error) {
if !n.opTracker.LastOperation.MarkStart("create") {
return managed.ExternalCreation{}, errors.Errorf("%s operation that started at %s is still running", n.opTracker.LastOperation.Type, n.opTracker.LastOperation.StartTime().String())
}

ctx, cancel := context.WithDeadline(context.Background(), n.opTracker.LastOperation.StartTime().Add(defaultAsyncTimeout))
go func() {
// The order of deferred functions, executed last-in-first-out, is
// significant. The context should be canceled last, because it is
// used by the finishing operations. Panic recovery should execute
// first, because the finishing operations report the panic error,
// if any.
var ph panicHandler
defer cancel()
defer func() { // Finishing operations
err := tferrors.NewAsyncCreateFailed(ph.err)
n.opTracker.LastOperation.SetError(err)
n.opTracker.logger.Debug("Async create ended.", "error", err)

n.opTracker.LastOperation.MarkEnd()
if cErr := n.callback.Create(mg.GetName())(err, ctx); cErr != nil {
n.opTracker.logger.Info("Async create callback failed", "error", cErr.Error())
}
}()
defer ph.recoverIfPanic()

n.opTracker.logger.Debug("Async create starting...")
_, err := n.terraformPluginFrameworkExternalClient.Create(ctx, mg)
err = tferrors.NewAsyncCreateFailed(err)
n.opTracker.LastOperation.SetError(err)
n.opTracker.logger.Debug("Async create ended.", "error", err)

n.opTracker.LastOperation.MarkEnd()
if cErr := n.callback.Create(mg.GetName())(err, ctx); cErr != nil {
n.opTracker.logger.Info("Async create callback failed", "error", cErr.Error())
}
_, ph.err = n.terraformPluginFrameworkExternalClient.Create(ctx, mg)
}()

return managed.ExternalCreation{}, n.opTracker.LastOperation.Error()
Expand All @@ -162,18 +198,27 @@ func (n *terraformPluginFrameworkAsyncExternalClient) Update(_ context.Context,

ctx, cancel := context.WithDeadline(context.Background(), n.opTracker.LastOperation.StartTime().Add(defaultAsyncTimeout))
go func() {
// The order of deferred functions, executed last-in-first-out, is
// significant. The context should be canceled last, because it is
// used by the finishing operations. Panic recovery should execute
// first, because the finishing operations report the panic error,
// if any.
var ph panicHandler
defer cancel()
defer func() { // Finishing operations
err := tferrors.NewAsyncUpdateFailed(ph.err)
n.opTracker.LastOperation.SetError(err)
n.opTracker.logger.Debug("Async update ended.", "error", err)

n.opTracker.LastOperation.MarkEnd()
if cErr := n.callback.Update(mg.GetName())(err, ctx); cErr != nil {
n.opTracker.logger.Info("Async update callback failed", "error", cErr.Error())
}
}()
defer ph.recoverIfPanic()

n.opTracker.logger.Debug("Async update starting...")
_, err := n.terraformPluginFrameworkExternalClient.Update(ctx, mg)
err = tferrors.NewAsyncUpdateFailed(err)
n.opTracker.LastOperation.SetError(err)
n.opTracker.logger.Debug("Async update ended.", "error", err)

n.opTracker.LastOperation.MarkEnd()
if cErr := n.callback.Update(mg.GetName())(err, ctx); cErr != nil {
n.opTracker.logger.Info("Async update callback failed", "error", cErr.Error())
}
_, ph.err = n.terraformPluginFrameworkExternalClient.Update(ctx, mg)
}()

return managed.ExternalUpdate{}, n.opTracker.LastOperation.Error()
Expand All @@ -190,17 +235,27 @@ func (n *terraformPluginFrameworkAsyncExternalClient) Delete(_ context.Context,

ctx, cancel := context.WithDeadline(context.Background(), n.opTracker.LastOperation.StartTime().Add(defaultAsyncTimeout))
go func() {
// The order of deferred functions, executed last-in-first-out, is
// significant. The context should be canceled last, because it is
// used by the finishing operations. Panic recovery should execute
// first, because the finishing operations report the panic error,
// if any.
var ph panicHandler
defer cancel()
defer func() { // Finishing operations
err := tferrors.NewAsyncDeleteFailed(ph.err)
n.opTracker.LastOperation.SetError(err)
n.opTracker.logger.Debug("Async delete ended.", "error", err)

n.opTracker.LastOperation.MarkEnd()
if cErr := n.callback.Destroy(mg.GetName())(err, ctx); cErr != nil {
n.opTracker.logger.Info("Async delete callback failed", "error", cErr.Error())
}
}()
defer ph.recoverIfPanic()

n.opTracker.logger.Debug("Async delete starting...")
err := tferrors.NewAsyncDeleteFailed(n.terraformPluginFrameworkExternalClient.Delete(ctx, mg))
n.opTracker.LastOperation.SetError(err)
n.opTracker.logger.Debug("Async delete ended.", "error", err)

n.opTracker.LastOperation.MarkEnd()
if cErr := n.callback.Destroy(mg.GetName())(err, ctx); cErr != nil {
n.opTracker.logger.Info("Async delete callback failed", "error", cErr.Error())
}
ph.err = n.terraformPluginFrameworkExternalClient.Delete(ctx, mg)
}()

return n.opTracker.LastOperation.Error()
Expand Down
80 changes: 54 additions & 26 deletions pkg/controller/external_async_tfpluginsdk.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,18 +143,27 @@ func (n *terraformPluginSDKAsyncExternal) Create(_ context.Context, mg xpresourc

ctx, cancel := context.WithDeadline(context.Background(), n.opTracker.LastOperation.StartTime().Add(defaultAsyncTimeout))
go func() {
// The order of deferred functions, executed last-in-first-out, is
// significant. The context should be canceled last, because it is
// used by the finishing operations. Panic recovery should execute
// first, because the finishing operations report the panic error,
// if any.
var ph panicHandler
defer cancel()
defer func() { // Finishing operations
err := tferrors.NewAsyncCreateFailed(ph.err)
n.opTracker.LastOperation.SetError(err)
n.opTracker.logger.Debug("Async create ended.", "error", err, "tfID", n.opTracker.GetTfID())

n.opTracker.LastOperation.MarkEnd()
if cErr := n.callback.Create(mg.GetName())(err, ctx); cErr != nil {
n.opTracker.logger.Info("Async create callback failed", "error", cErr.Error())
}
}()
defer ph.recoverIfPanic()

n.opTracker.logger.Debug("Async create starting...", "tfID", n.opTracker.GetTfID())
_, err := n.terraformPluginSDKExternal.Create(ctx, mg)
err = tferrors.NewAsyncCreateFailed(err)
n.opTracker.LastOperation.SetError(err)
n.opTracker.logger.Debug("Async create ended.", "error", err, "tfID", n.opTracker.GetTfID())

n.opTracker.LastOperation.MarkEnd()
if cErr := n.callback.Create(mg.GetName())(err, ctx); cErr != nil {
n.opTracker.logger.Info("Async create callback failed", "error", cErr.Error())
}
_, ph.err = n.terraformPluginSDKExternal.Create(ctx, mg)
}()

return managed.ExternalCreation{}, n.opTracker.LastOperation.Error()
Expand All @@ -167,18 +176,27 @@ func (n *terraformPluginSDKAsyncExternal) Update(_ context.Context, mg xpresourc

ctx, cancel := context.WithDeadline(context.Background(), n.opTracker.LastOperation.StartTime().Add(defaultAsyncTimeout))
go func() {
// The order of deferred functions, executed last-in-first-out, is
// significant. The context should be canceled last, because it is
// used by the finishing operations. Panic recovery should execute
// first, because the finishing operations report the panic error,
// if any.
var ph panicHandler
defer cancel()
defer func() { // Finishing operations
err := tferrors.NewAsyncUpdateFailed(ph.err)
n.opTracker.LastOperation.SetError(err)
n.opTracker.logger.Debug("Async update ended.", "error", err, "tfID", n.opTracker.GetTfID())

n.opTracker.LastOperation.MarkEnd()
if cErr := n.callback.Update(mg.GetName())(err, ctx); cErr != nil {
n.opTracker.logger.Info("Async update callback failed", "error", cErr.Error())
}
}()
defer ph.recoverIfPanic()

n.opTracker.logger.Debug("Async update starting...", "tfID", n.opTracker.GetTfID())
_, err := n.terraformPluginSDKExternal.Update(ctx, mg)
err = tferrors.NewAsyncUpdateFailed(err)
n.opTracker.LastOperation.SetError(err)
n.opTracker.logger.Debug("Async update ended.", "error", err, "tfID", n.opTracker.GetTfID())

n.opTracker.LastOperation.MarkEnd()
if cErr := n.callback.Update(mg.GetName())(err, ctx); cErr != nil {
n.opTracker.logger.Info("Async update callback failed", "error", cErr.Error())
}
_, ph.err = n.terraformPluginSDKExternal.Update(ctx, mg)
}()

return managed.ExternalUpdate{}, n.opTracker.LastOperation.Error()
Expand All @@ -195,17 +213,27 @@ func (n *terraformPluginSDKAsyncExternal) Delete(_ context.Context, mg xpresourc

ctx, cancel := context.WithDeadline(context.Background(), n.opTracker.LastOperation.StartTime().Add(defaultAsyncTimeout))
go func() {
// The order of deferred functions, executed last-in-first-out, is
// significant. The context should be canceled last, because it is
// used by the finishing operations. Panic recovery should execute
// first, because the finishing operations report the panic error,
// if any.
var ph panicHandler
defer cancel()
defer func() { // Finishing operations
err := tferrors.NewAsyncDeleteFailed(ph.err)
n.opTracker.LastOperation.SetError(err)
n.opTracker.logger.Debug("Async delete ended.", "error", err, "tfID", n.opTracker.GetTfID())

n.opTracker.LastOperation.MarkEnd()
if cErr := n.callback.Destroy(mg.GetName())(err, ctx); cErr != nil {
n.opTracker.logger.Info("Async delete callback failed", "error", cErr.Error())
}
}()
defer ph.recoverIfPanic()

n.opTracker.logger.Debug("Async delete starting...", "tfID", n.opTracker.GetTfID())
err := tferrors.NewAsyncDeleteFailed(n.terraformPluginSDKExternal.Delete(ctx, mg))
n.opTracker.LastOperation.SetError(err)
n.opTracker.logger.Debug("Async delete ended.", "error", err, "tfID", n.opTracker.GetTfID())

n.opTracker.LastOperation.MarkEnd()
if cErr := n.callback.Destroy(mg.GetName())(err, ctx); cErr != nil {
n.opTracker.logger.Info("Async delete callback failed", "error", cErr.Error())
}
ph.err = n.terraformPluginSDKExternal.Delete(ctx, mg)
}()

return n.opTracker.LastOperation.Error()
Expand Down

0 comments on commit 2e361ad

Please sign in to comment.