From e345f285634d1478874a9a0ca8e4aab3611d33d4 Mon Sep 17 00:00:00 2001
From: ShourieG <105607378+ShourieG@users.noreply.github.com>
Date: Wed, 2 Oct 2024 08:40:08 +0530
Subject: [PATCH] [filebeat][azure-blob-storage] - Simplified state checkpoint
 calculation (#40936)

---
 CHANGELOG-developer.next.asciidoc             |  1 +
 .../input/azureblobstorage/scheduler.go       | 48 ++++++-------------
 2 files changed, 15 insertions(+), 34 deletions(-)

diff --git a/CHANGELOG-developer.next.asciidoc b/CHANGELOG-developer.next.asciidoc
index abe0164fe70d..685d641ad0c6 100644
--- a/CHANGELOG-developer.next.asciidoc
+++ b/CHANGELOG-developer.next.asciidoc
@@ -205,6 +205,7 @@ The list below covers the major changes between 7.0.0-rc2 and main only.
 - Add a configuration option for TCP/UDP network type. {issue}40407[40407] {pull}40623[40623]
 - Added debug logging to parquet reader in x-pack/libbeat/reader. {pull}40651[40651]
 - Added filebeat debug histograms for s3 object size and events per processed s3 object. {pull}40775[40775]
+- Simplified Azure Blob Storage input state checkpoint calculation logic. {issue}40674[40674] {pull}40936[40936]
 
 ==== Deprecated
 
diff --git a/x-pack/filebeat/input/azureblobstorage/scheduler.go b/x-pack/filebeat/input/azureblobstorage/scheduler.go
index 781514d52712..335d4e80320f 100644
--- a/x-pack/filebeat/input/azureblobstorage/scheduler.go
+++ b/x-pack/filebeat/input/azureblobstorage/scheduler.go
@@ -7,6 +7,8 @@ package azureblobstorage
 import (
 	"context"
 	"fmt"
+	"slices"
+	"sort"
 	"sync"
 
 	azruntime "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime"
@@ -190,41 +192,19 @@ func (s *scheduler) fetchBlobPager(batchSize int32) *azruntime.Pager[azblob.List
 // moveToLastSeenJob, moves to the latest job position past the last seen job
 // Jobs are stored in lexicographical order always, hence the latest position can be found either on the basis of job name or timestamp
 func (s *scheduler) moveToLastSeenJob(jobs []*job) []*job {
-	var latestJobs []*job
-	jobsToReturn := make([]*job, 0)
-	counter := 0
-	flag := false
-	ignore := false
-
-	for _, job := range jobs {
-		switch {
-		case job.timestamp().After(s.state.checkpoint().LatestEntryTime):
-			latestJobs = append(latestJobs, job)
-		case job.name() == s.state.checkpoint().BlobName:
-			flag = true
-		case job.name() > s.state.checkpoint().BlobName:
-			flag = true
-			counter--
-		case job.name() <= s.state.checkpoint().BlobName && (!ignore):
-			ignore = true
-		}
-		counter++
-	}
-
-	if flag && (counter < len(jobs)-1) {
-		jobsToReturn = jobs[counter+1:]
-	} else if !flag && !ignore {
-		jobsToReturn = jobs
-	}
-
-	// in a senario where there are some jobs which have a greater timestamp
-	// but lesser alphanumeric order and some jobs have greater alphanumeric order
-	// than the current checkpoint blob name, then we append the latest jobs
-	if len(jobsToReturn) != len(jobs) && len(latestJobs) > 0 {
-		jobsToReturn = append(latestJobs, jobsToReturn...)
-	}
+	cp := s.state.checkpoint()
+	jobs = slices.DeleteFunc(jobs, func(j *job) bool {
+		return !(j.timestamp().After(cp.LatestEntryTime) || j.name() > cp.BlobName)
+	})
 
-	return jobsToReturn
+	// Jobs newer than the checkpoint timestamp are moved to the front, even when
+	// their names sort below the checkpoint blob name. The comparator looks at both
+	// elements so it is a valid strict weak ordering: this is a stable partition and
+	// the existing lexicographic order inside each of the two groups is preserved.
+	sort.SliceStable(jobs, func(i, j int) bool {
+		return jobs[i].timestamp().After(cp.LatestEntryTime) && !jobs[j].timestamp().After(cp.LatestEntryTime)
+	})
+	return jobs
 }
 
 func (s *scheduler) isFileSelected(name string) bool {