Skip to content

Commit

Permalink
Improve Upgrade-related logging (#3382)
Browse files Browse the repository at this point in the history
* Log all upgrade-related messages at INFO level

* Print Agent PID and version at startup

* Print Upgrade Watcher PID and Agent version at startup

* Use ECS keys

* Log invoked upgrade watcher PID along with invoking Agent PID

* Add CHANGELOG entry

* Revert to DEBUG level logging on file extractions

* Removing redundant log line

* Running mage fmt
  • Loading branch information
ycombinator authored Sep 14, 2023
1 parent fe0d427 commit 66fa5a7
Show file tree
Hide file tree
Showing 10 changed files with 69 additions and 26 deletions.
32 changes: 32 additions & 0 deletions changelog/fragments/1694190260-improve-upgrade-logging.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: enhancement

# Change summary; a 80ish characters long description of the change.
summary: Improve logging during Agent upgrades

# Long description; in case the summary is not enough to describe the change
# this field accommodate a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
#description:

# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
component: elastic-agent

# PR URL; optional; the PR number that added the changeset.
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
pr: https://github.com/elastic/elastic-agent/pull/3382

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present is automatically filled by the tooling with the issue linked to the PR number.
#issue: https://github.com/owner/repo/1234
2 changes: 1 addition & 1 deletion internal/pkg/agent/application/upgrade/cleanup.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import (
// cleanNonMatchingVersionsFromDownloads will remove files that do not have the passed version number from the downloads directory.
func cleanNonMatchingVersionsFromDownloads(log *logger.Logger, version string) error {
downloadsPath := paths.Downloads()
log.Debugw("Cleaning up non-matching downloaded versions", "version", version, "downloads.path", downloadsPath)
log.Infow("Cleaning up non-matching downloaded versions", "version", version, "downloads.path", downloadsPath)

files, err := os.ReadDir(downloadsPath)
if os.IsNotExist(err) {
Expand Down
8 changes: 3 additions & 5 deletions internal/pkg/agent/application/upgrade/crash_checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ package upgrade
import (
"context"
"fmt"
"os"
"sync"
"time"

Expand Down Expand Up @@ -53,7 +52,7 @@ func NewCrashChecker(ctx context.Context, ch chan error, log *logger.Logger, che
return nil, err
}

log.Debugf("running checks using '%s' controller", c.sc.Name())
log.Infof("running checks using '%s' controller", c.sc.Name())

return c, nil
}
Expand All @@ -62,9 +61,8 @@ func NewCrashChecker(ctx context.Context, ch chan error, log *logger.Logger, che
func (ch *CrashChecker) Run(ctx context.Context) {
defer ch.sc.Close()

ch.log.Debug("Crash checker started")
ch.log.Info("Crash checker started")
for {
ch.log.Debugf("watcher having PID: %d", os.Getpid())
t := time.NewTimer(ch.checkInterval)

select {
Expand All @@ -77,7 +75,7 @@ func (ch *CrashChecker) Run(ctx context.Context) {
ch.log.Error(err)
}

ch.log.Debugf("retrieved service PID [%d]", pid)
ch.log.Infof("retrieved service PID [%d]", pid)
ch.q.Push(pid)

// We decide if the Agent process has crashed in either of
Expand Down
2 changes: 1 addition & 1 deletion internal/pkg/agent/application/upgrade/error_checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ func NewErrorChecker(ch chan error, log *logger.Logger, checkInterval time.Durat

// Run runs the checking loop.
func (ch *ErrorChecker) Run(ctx context.Context) {
ch.log.Debug("Error checker started")
ch.log.Info("Error checker started")
for {
t := time.NewTimer(ch.checkInterval)
select {
Expand Down
22 changes: 15 additions & 7 deletions internal/pkg/agent/application/upgrade/rollback.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ func Rollback(ctx context.Context, log *logger.Logger, prevHash string, currentH

// Cleanup removes all artifacts and files related to a specified version.
func Cleanup(log *logger.Logger, currentHash string, removeMarker bool, keepLogs bool) error {
log.Debugw("Cleaning up upgrade", "hash", currentHash, "remove_marker", removeMarker)
log.Infow("Cleaning up upgrade", "hash", currentHash, "remove_marker", removeMarker)
<-time.After(afterRestartDelay)

// remove upgrade marker
Expand All @@ -78,7 +78,7 @@ func Cleanup(log *logger.Logger, currentHash string, removeMarker bool, keepLogs

// remove symlink to avoid upgrade failures, ignore error
prevSymlink := prevSymlinkPath()
log.Debugw("Removing previous symlink path", "file.path", prevSymlinkPath())
log.Infow("Removing previous symlink path", "file.path", prevSymlinkPath())
_ = os.Remove(prevSymlink)

dirPrefix := fmt.Sprintf("%s-", agentName)
Expand All @@ -93,7 +93,7 @@ func Cleanup(log *logger.Logger, currentHash string, removeMarker bool, keepLogs
}

hashedDir := filepath.Join(paths.Data(), dir)
log.Debugw("Removing hashed data directory", "file.path", hashedDir)
log.Infow("Removing hashed data directory", "file.path", hashedDir)
var ignoredDirs []string
if keepLogs {
ignoredDirs = append(ignoredDirs, "logs")
Expand All @@ -110,20 +110,28 @@ func Cleanup(log *logger.Logger, currentHash string, removeMarker bool, keepLogs
// agent during upgrade period.
func InvokeWatcher(log *logger.Logger) error {
if !IsUpgradeable() {
log.Debug("agent is not upgradable, not starting watcher")
log.Info("agent is not upgradable, not starting watcher")
return nil
}

cmd := invokeCmd()
defer func() {
if cmd.Process != nil {
log.Debugf("releasing watcher %v", cmd.Process.Pid)
log.Infof("releasing watcher %v", cmd.Process.Pid)
_ = cmd.Process.Release()
}
}()

log.Debugw("Starting upgrade watcher", "path", cmd.Path, "args", cmd.Args, "env", cmd.Env, "dir", cmd.Dir)
return cmd.Start()
log.Infow("Starting upgrade watcher", "path", cmd.Path, "args", cmd.Args, "env", cmd.Env, "dir", cmd.Dir)
if err := cmd.Start(); err != nil {
return fmt.Errorf("failed to start Upgrade Watcher: %w", err)
}

upgradeWatcherPID := cmd.Process.Pid
agentPID := os.Getpid()
log.Infow("Upgrade Watcher invoked", "agent.upgrade.watcher.process.pid", upgradeWatcherPID, "agent.process.pid", agentPID)

return nil
}

func restartAgent(ctx context.Context, log *logger.Logger) error {
Expand Down
4 changes: 2 additions & 2 deletions internal/pkg/agent/application/upgrade/step_download.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ func (u *Upgrader) downloadArtifact(ctx context.Context, version, sourceURI stri
}
}

u.log.Debugw("Downloading upgrade artifact", "version", version,
u.log.Infow("Downloading upgrade artifact", "version", version,
"source_uri", settings.SourceURI, "drop_path", settings.DropPath,
"target_path", settings.TargetDirectory, "install_path", settings.InstallPath)

Expand Down Expand Up @@ -162,7 +162,7 @@ func (u *Upgrader) downloadWithRetries(

opFn := func() error {
attempt++
u.log.Debugf("download attempt %d", attempt)
u.log.Infof("download attempt %d", attempt)

downloader, err := downloaderCtor(version, u.log, settings)
if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion internal/pkg/agent/application/upgrade/step_mark.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ func UpdateActiveCommit(log *logger.Logger, hash string) error {
// CleanMarker removes a marker from disk.
func CleanMarker(log *logger.Logger) error {
markerFile := markerFilePath()
log.Debugw("Removing marker file", "file.path", markerFile)
log.Infow("Removing marker file", "file.path", markerFile)
if err := os.Remove(markerFile); !os.IsNotExist(err) {
return err
}
Expand Down
12 changes: 6 additions & 6 deletions internal/pkg/agent/application/upgrade/upgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ import (
"path/filepath"
"strings"

"github.com/elastic/elastic-agent/internal/pkg/config"

"github.com/otiai10/copy"
"go.elastic.co/apm"

"github.com/elastic/elastic-agent/internal/pkg/config"

"github.com/elastic/elastic-agent/internal/pkg/agent/application/info"
"github.com/elastic/elastic-agent/internal/pkg/agent/application/paths"
"github.com/elastic/elastic-agent/internal/pkg/agent/application/reexec"
Expand Down Expand Up @@ -172,7 +172,7 @@ func (u *Upgrader) Upgrade(ctx context.Context, version string, sourceURI string
cb := shutdownCallback(u.log, paths.Home(), release.Version(), version, release.TrimCommit(newHash))

// Clean everything from the downloads dir
u.log.Debugw("Removing downloads directory", "file.path", paths.Downloads())
u.log.Infow("Removing downloads directory", "file.path", paths.Downloads())
err = os.RemoveAll(paths.Downloads())
if err != nil {
u.log.Errorw("Unable to clean downloads after update", "error.message", err, "file.path", paths.Downloads())
Expand Down Expand Up @@ -231,11 +231,11 @@ func copyActionStore(log *logger.Logger, newHash string) error {
// copies legacy action_store.yml, state.yml and state.enc encrypted file if exists
storePaths := []string{paths.AgentActionStoreFile(), paths.AgentStateStoreYmlFile(), paths.AgentStateStoreFile()}
newHome := filepath.Join(filepath.Dir(paths.Home()), fmt.Sprintf("%s-%s", agentName, newHash))
log.Debugw("Copying action store", "new_home_path", newHome)
log.Infow("Copying action store", "new_home_path", newHome)

for _, currentActionStorePath := range storePaths {
newActionStorePath := filepath.Join(newHome, filepath.Base(currentActionStorePath))
log.Debugw("Copying action store path", "from", currentActionStorePath, "to", newActionStorePath)
log.Infow("Copying action store path", "from", currentActionStorePath, "to", newActionStorePath)
currentActionStore, err := os.ReadFile(currentActionStorePath)
if os.IsNotExist(err) {
// nothing to copy
Expand Down Expand Up @@ -266,7 +266,7 @@ func copyRunDirectory(log *logger.Logger, newHash string) error {
err := copyDir(log, oldRunPath, newRunPath, true)
if os.IsNotExist(err) {
// nothing to copy, operation ok
log.Debugw("Run directory not present", "old_run_path", oldRunPath)
log.Infow("Run directory not present", "old_run_path", oldRunPath)
return nil
}
if err != nil {
Expand Down
2 changes: 2 additions & 0 deletions internal/pkg/agent/cmd/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ func run(override cfgOverrider, testingMode bool, fleetInitTimeout time.Duration
"source": agentName,
})

l.Infow("Elastic Agent started", "process.pid", os.Getpid(), "agent.version", version.GetAgentPackageVersion())

cfg, err = tryDelayEnroll(ctx, l, cfg, override)
if err != nil {
err = errors.New(err, "failed to perform delayed enrollment")
Expand Down
9 changes: 6 additions & 3 deletions internal/pkg/agent/cmd/watch.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ import (
"syscall"
"time"

"github.com/elastic/elastic-agent/version"

"github.com/elastic/elastic-agent/internal/pkg/config"

"github.com/spf13/cobra"
Expand Down Expand Up @@ -62,21 +64,22 @@ func newWatchCommandWithArgs(_ []string, streams *cli.IOStreams) *cobra.Command
}

func watchCmd(log *logp.Logger, cfg *configuration.Configuration) error {
log.Infow("Upgrade Watcher started", "process.pid", os.Getpid(), "agent.version", version.GetAgentPackageVersion())
marker, err := upgrade.LoadMarker()
if err != nil {
log.Error("failed to load marker", err)
return err
}
if marker == nil {
// no marker found we're not in upgrade process
log.Debugf("update marker not present at '%s'", paths.Data())
log.Infof("update marker not present at '%s'", paths.Data())
return nil
}

locker := filelock.NewAppLocker(paths.Top(), watcherLockFile)
if err := locker.TryLock(); err != nil {
if errors.Is(err, filelock.ErrAppAlreadyRunning) {
log.Debugf("exiting, lock already exists")
log.Info("exiting, lock already exists")
return nil
}

Expand All @@ -89,7 +92,7 @@ func watchCmd(log *logp.Logger, cfg *configuration.Configuration) error {

isWithinGrace, tilGrace := gracePeriod(marker, cfg.Settings.Upgrade.Watcher.GracePeriod)
if !isWithinGrace {
log.Debugf("not within grace [updatedOn %v] %v", marker.UpdatedOn.String(), time.Since(marker.UpdatedOn).String())
log.Infof("not within grace [updatedOn %v] %v", marker.UpdatedOn.String(), time.Since(marker.UpdatedOn).String())
// if it is started outside of upgrade loop
// if we're not within grace and marker is still there it might mean
// that cleanup was not performed ok, cleanup everything except current version
Expand Down

0 comments on commit 66fa5a7

Please sign in to comment.