Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[7.17](backport #39392) [Bug] fix high IO after sudden filebeat stop (#35893) #39795

Merged
merged 10 commits into from
Jun 24, 2024
2 changes: 2 additions & 0 deletions CHANGELOG.next.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d

*Filebeat*

- Fix high IO and handling of a corrupted registry log file. {pull}35893[35893]

*Heartbeat*

*Metricbeat*
Expand Down
35 changes: 22 additions & 13 deletions libbeat/statestore/backend/memlog/diskstore.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ package memlog
import (
"bufio"
"encoding/json"
"errors"
"fmt"
"io/ioutil"
"io"
"os"
"path/filepath"
"sort"
Expand Down Expand Up @@ -102,11 +103,11 @@ const (
checkpointTmpFileName = "checkpoint.new"

storeVersion = "1"

keyField = "_key"
)

// newDiskStore initializes the disk store stucture only. The store must have
var ErrCorruptStore = errors.New("corrupted data file")

// newDiskStore initializes the disk store structure only. The store must have
// been opened already. It tries to open the update log file for append
// operations. If opening the update log file fails, it is marked as
// 'corrupted', triggering a checkpoint operation on the first update to the store.
Expand Down Expand Up @@ -180,7 +181,7 @@ func (s *diskstore) tryOpenLog() error {
f.Close()
})

_, err = f.Seek(0, os.SEEK_END)
_, err = f.Seek(0, io.SeekEnd)
if err != nil {
return err
}
Expand Down Expand Up @@ -261,12 +262,16 @@ func (s *diskstore) LogOperation(op op) error {
if err := enc.Encode(logAction{Op: op.name(), ID: s.nextTxID}); err != nil {
return err
}
writer.WriteByte('\n')
if err := writer.WriteByte('\n'); err != nil {
s.log.Errorf("could not write to registry log file: %s", err)
}

if err := enc.Encode(op); err != nil {
return err
}
writer.WriteByte('\n')
if err := writer.WriteByte('\n'); err != nil {
s.log.Errorf("could not write to registry log file: %s", err)
}

if err := writer.Flush(); err != nil {
return err
Expand Down Expand Up @@ -327,7 +332,10 @@ func (s *diskstore) WriteCheckpoint(state map[string]entry) error {
}

// delete old transaction files
updateActiveMarker(s.log, s.home, s.activeDataFile.path)
if err := updateActiveMarker(s.log, s.home, s.activeDataFile.path); err != nil {
s.log.Warnf("could not update active marker: %s", err)
}

s.removeOldDataFiles()

trySyncPath(s.home)
Expand Down Expand Up @@ -399,7 +407,8 @@ func (s *diskstore) checkpointClearLog() {

err := s.logFile.Truncate(0)
if err == nil {
_, err = s.logFile.Seek(0, os.SEEK_SET)
_, err = s.logFile.Seek(0, io.SeekStart)
s.logInvalid = false
}

if err != nil {
Expand Down Expand Up @@ -436,7 +445,7 @@ func updateActiveMarker(log *logp.Logger, homePath, checkpointFilePath string) e
log.Errorf("Failed to remove old temporary active.dat.tmp file: %v", err)
return err
}
if err := ioutil.WriteFile(tmpLink, []byte(checkpointFilePath), 0600); err != nil {
if err := os.WriteFile(tmpLink, []byte(checkpointFilePath), 0600); err != nil {
log.Errorf("Failed to write temporary pointer file: %v", err)
return err
}
Expand Down Expand Up @@ -534,7 +543,7 @@ func readDataFile(path string, fn func(string, common.MapStr)) error {
var states []map[string]interface{}
dec := json.NewDecoder(f)
if err := dec.Decode(&states); err != nil {
return fmt.Errorf("corrupted data file: %v", err)
return fmt.Errorf("%w: %w", ErrCorruptStore, err)
}

for _, state := range states {
Expand All @@ -555,7 +564,7 @@ func readDataFile(path string, fn func(string, common.MapStr)) error {
// memStore.
// The txid is the transaction ID of the last known valid data file.
// Transactions older then txid will be ignored.
// loadLogFile returns the last commited txid in logTxid and the total number
// loadLogFile returns the last committed txid in logTxid and the total number
// of operations in logCount.
func loadLogFile(
store *memstore,
Expand Down Expand Up @@ -679,7 +688,7 @@ func readMetaFile(home string) (storeMeta, error) {

dec := json.NewDecoder(f)
if err := dec.Decode(&meta); err != nil {
return meta, fmt.Errorf("can not read store meta file: %v", err)
return meta, fmt.Errorf("can not read store meta file: %w", err)
}

return meta, nil
Expand Down
55 changes: 55 additions & 0 deletions libbeat/statestore/backend/memlog/store_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// Licensed to Elasticsearch B.V. under one or more contributor
// license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright
// ownership. Elasticsearch B.V. licenses this file to you under
// the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package memlog

import (
"os"
"path/filepath"
"testing"

"github.com/stretchr/testify/require"

"github.com/elastic/beats/v7/libbeat/common"
"github.com/elastic/beats/v7/libbeat/logp"
)

func TestRecoverFromCorruption(t *testing.T) {
path := t.TempDir()
logp.DevelopmentSetup() //nolint: errcheck // it's a test and we don't rely on the logs

if err := copyPath(path, "testdata/1/logfile_incomplete/"); err != nil {
t.Fatalf("Failed to copy test file to the temporary directory: %v", err)
}

store, err := openStore(logp.NewLogger("test"), path, 0660, 4096, false, func(_ uint64) bool {
return false
})
require.NoError(t, err, "openStore must succeed")
require.True(t, store.disk.logInvalid, "expecting the log file to be invalid")

err = store.logOperation(&opSet{K: "key", V: common.MapStr{
"field": 42,
}})
require.NoError(t, err, "logOperation must succeed")
require.False(t, store.disk.logInvalid, "log file must be valid")
require.FileExistsf(t, filepath.Join(path, "7.json"), "expecting the checkpoint file to have been created")

file, err := os.Stat(filepath.Join(path, "log.json"))
require.NoError(t, err, "Stat on the log file must succeed")
require.Equal(t, int64(0), file.Size(), "expecting the log file to be truncated")
}
Loading