Skip to content

Commit

Permalink
shipper: Be strict about upload order unless it's specified so & cut …
Browse files Browse the repository at this point in the history
…v0.13.0-rc.2 (#2765)

* shipper: Be strict about upload order unless it's specified so.

Signed-off-by: Bartlomiej Plotka <[email protected]>

* Cut 0.13.0-rc.2

Signed-off-by: Bartlomiej Plotka <[email protected]>
  • Loading branch information
bwplotka authored Jun 15, 2020
1 parent f06c13a commit 3bf1397
Show file tree
Hide file tree
Showing 13 changed files with 125 additions and 101 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel

## Unreleased

## [v0.13.0](https://github.com/thanos-io/thanos/releases/tag/v0.13.0) - 2020.06.15
## [v0.13.0-rc.2](https://github.com/thanos-io/thanos/releases/tag/v0.13.0-rc.2) - 2020.06.15

### Fixed

Expand All @@ -26,6 +26,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel
- [#2416](https://github.com/thanos-io/thanos/pull/2416) Bucket: Fixed issue #2416 where `inspect --sort-by` didn't work correctly in all cases.
- [#2719](https://github.com/thanos-io/thanos/pull/2719) Query: `irate` and `resets` now use counter downsampling aggregations.
- [#2705](https://github.com/thanos-io/thanos/pull/2705) minio-go: Added support for `af-south-1` and `eu-south-1` regions.
- [#2753](https://github.com/thanos-io/thanos/issues/2753) Sidecar, Receive, Rule: Fixed the possibility of out-of-order uploads in error cases. This could potentially cause Compactor to create overlapping blocks.

### Added

Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.13.0
0.13.0-rc.2
14 changes: 10 additions & 4 deletions cmd/thanos/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,17 +112,23 @@ func (rc *reloaderConfig) registerFlag(cmd *kingpin.CmdClause) *reloaderConfig {
}

// shipperConfig holds the values of the shipper.* command-line flags that
// registerFlag binds to its fields.
type shipperConfig struct {
uploadCompacted bool
ignoreBlockSize bool
uploadCompacted bool // Bound to --shipper.upload-compacted.
ignoreBlockSize bool // Bound to --shipper.ignore-unequal-block-size.
allowOutOfOrderUpload bool // Bound to --shipper.allow-out-of-order-uploads.
}

// registerFlag registers the shipper.* flags on cmd, binding each flag to the
// matching field of sc, and returns sc so the call can be chained.
// All flags default to "false"; shipper.ignore-unequal-block-size and
// shipper.allow-out-of-order-uploads are additionally marked Hidden().
func (sc *shipperConfig) registerFlag(cmd *kingpin.CmdClause) *shipperConfig {
cmd.Flag("shipper.upload-compacted",
"If true sidecar will try to upload compacted blocks as well. Useful for migration purposes. Works only if compaction is disabled on Prometheus. Do it once and then disable the flag when done.").
"If true shipper will try to upload compacted blocks as well. Useful for migration purposes. Works only if compaction is disabled on Prometheus. Do it once and then disable the flag when done.").
Default("false").BoolVar(&sc.uploadCompacted)
cmd.Flag("shipper.ignore-unequal-block-size",
"If true sidecar will not require prometheus min and max block size flags to be set to the same value. Only use this if you want to keep long retention and compaction enabled on your Prometheus instance, as in the worst case it can result in ~2h data loss for your Thanos bucket storage.").
"If true shipper will not require prometheus min and max block size flags to be set to the same value. Only use this if you want to keep long retention and compaction enabled on your Prometheus instance, as in the worst case it can result in ~2h data loss for your Thanos bucket storage.").
Default("false").Hidden().BoolVar(&sc.ignoreBlockSize)
// Each fragment ends with a trailing space so the concatenated help text
// keeps its word boundaries ("blocks. This", "caring about").
cmd.Flag("shipper.allow-out-of-order-uploads",
"If true, shipper will skip failed block uploads in the given iteration and retry later. This means that some newer blocks might be uploaded sooner than older blocks. "+
"This can trigger compaction without those blocks and as a result will create an overlap situation. Set it to true if you have vertical compaction enabled and wish to upload blocks as soon as possible without caring "+
"about order.").
Default("false").Hidden().BoolVar(&sc.allowOutOfOrderUpload)
return sc
}

Expand Down
9 changes: 9 additions & 0 deletions cmd/thanos/receive.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,12 @@ func registerReceive(m map[string]setupFunc, app *kingpin.Application) {

walCompression := cmd.Flag("tsdb.wal-compression", "Compress the tsdb WAL.").Default("true").Bool()

// Each help-text fragment ends with a trailing space so the concatenated
// string keeps its word boundaries ("blocks. This", "caring about").
allowOutOfOrderUpload := cmd.Flag("shipper.allow-out-of-order-uploads",
"If true, shipper will skip failed block uploads in the given iteration and retry later. This means that some newer blocks might be uploaded sooner than older blocks. "+
"This can trigger compaction without those blocks and as a result will create an overlap situation. Set it to true if you have vertical compaction enabled and wish to upload blocks as soon as possible without caring "+
"about order.").
Default("false").Hidden().Bool()

m[comp.String()] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ <-chan struct{}, _ bool) error {
lset, err := parseFlagLabels(*labelStrs)
if err != nil {
Expand Down Expand Up @@ -157,6 +163,7 @@ func registerReceive(m map[string]setupFunc, app *kingpin.Application) {
*replicationFactor,
time.Duration(*forwardTimeout),
comp,
*allowOutOfOrderUpload,
)
}
}
Expand Down Expand Up @@ -195,6 +202,7 @@ func runReceive(
replicationFactor uint64,
forwardTimeout time.Duration,
comp component.SourceStoreAPI,
allowOutOfOrderUpload bool,
) error {
logger = log.With(logger, "component", "receive")
level.Warn(logger).Log("msg", "setting up receive; the Thanos receive component is EXPERIMENTAL, it may break significantly without notice")
Expand Down Expand Up @@ -246,6 +254,7 @@ func runReceive(
lset,
tenantLabelName,
bkt,
allowOutOfOrderUpload,
)
writer := receive.NewWriter(log.With(logger, "component", "receive-writer"), dbs)
webHandler := receive.NewHandler(log.With(logger, "component", "receive-handler"), &receive.Options{
Expand Down
10 changes: 9 additions & 1 deletion cmd/thanos/rule.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,12 @@ func registerRule(m map[string]setupFunc, app *kingpin.Application) {
dnsSDResolver := cmd.Flag("query.sd-dns-resolver", "Resolver to use. Possible options: [golang, miekgdns]").
Default("golang").Hidden().String()

// Each help-text fragment ends with a trailing space so the concatenated
// string keeps its word boundaries ("blocks. This", "caring about").
allowOutOfOrderUpload := cmd.Flag("shipper.allow-out-of-order-uploads",
"If true, shipper will skip failed block uploads in the given iteration and retry later. This means that some newer blocks might be uploaded sooner than older blocks. "+
"This can trigger compaction without those blocks and as a result will create an overlap situation. Set it to true if you have vertical compaction enabled and wish to upload blocks as soon as possible without caring "+
"about order.").
Default("false").Hidden().Bool()

m[comp.String()] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, reload <-chan struct{}, _ bool) error {
lset, err := parseFlagLabels(*labelStrs)
if err != nil {
Expand Down Expand Up @@ -197,6 +203,7 @@ func registerRule(m map[string]setupFunc, app *kingpin.Application) {
time.Duration(*dnsSDInterval),
*dnsSDResolver,
comp,
*allowOutOfOrderUpload,
)
}
}
Expand Down Expand Up @@ -283,6 +290,7 @@ func runRule(
dnsSDInterval time.Duration,
dnsSDResolver string,
comp component.Component,
allowOutOfOrderUpload bool,
) error {
metrics := newRuleMetrics(reg)

Expand Down Expand Up @@ -615,7 +623,7 @@ func runRule(
}
}()

s := shipper.New(logger, reg, dataDir, bkt, func() labels.Labels { return lset }, metadata.RulerSource)
s := shipper.New(logger, reg, dataDir, bkt, func() labels.Labels { return lset }, metadata.RulerSource, allowOutOfOrderUpload)

ctx, cancel := context.WithCancel(context.Background())

Expand Down
4 changes: 2 additions & 2 deletions cmd/thanos/sidecar.go
Original file line number Diff line number Diff line change
Expand Up @@ -273,9 +273,9 @@ func runSidecar(

var s *shipper.Shipper
if conf.shipper.uploadCompacted {
s = shipper.NewWithCompacted(logger, reg, conf.tsdb.path, bkt, m.Labels, metadata.SidecarSource)
s = shipper.NewWithCompacted(logger, reg, conf.tsdb.path, bkt, m.Labels, metadata.SidecarSource, conf.shipper.allowOutOfOrderUpload)
} else {
s = shipper.New(logger, reg, conf.tsdb.path, bkt, m.Labels, metadata.SidecarSource)
s = shipper.New(logger, reg, conf.tsdb.path, bkt, m.Labels, metadata.SidecarSource, conf.shipper.allowOutOfOrderUpload)
}

return runutil.Repeat(30*time.Second, ctx.Done(), func() error {
Expand Down
2 changes: 1 addition & 1 deletion docs/components/sidecar.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ Flags:
details:
https://thanos.io/storage.md/#configuration
--shipper.upload-compacted
If true sidecar will try to upload compacted
If true shipper will try to upload compacted
blocks as well. Useful for migration purposes.
Works only if compaction is disabled on
Prometheus. Do it once and then disable the
Expand Down
3 changes: 2 additions & 1 deletion docs/operating/troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ slug: /troubleshooting.md

# Troubleshooting; Common cases


## Overlaps

**Block overlap**: Set of blocks with exactly the same external labels in meta.json and for the same time or overlapping time period.
Expand All @@ -29,13 +28,15 @@ Checking producers log for such ULID, and checking meta.json (e.g if sample stat

### Reasons

- You are running Thanos (sidecar, ruler or receive) older than 0.13.0. During transient upload errors there is a possibility of overlaps caused by the compactor not being aware of all blocks. See [this issue](https://github.com/thanos-io/thanos/issues/2753).
- Misconfiguration of sidecar/ruler: Same external labels or no external labels across many block producers.
- Running multiple compactors for single block "stream", even for short duration.
- Manually uploading blocks to the bucket.
- Eventually consistent block storage until we fully implement [RW for bucket](https://thanos.io/proposals/201901-read-write-operations-bucket.md)

### Solutions

- Upgrade sidecar, ruler and receive to 0.13.0+
- Compactor can be blocked for some time, but if it is urgent, mitigate by removing the overlapping block or, better, by backing it up somewhere else (you can rename the block ULID to a non-ULID name).
- Who uploaded the block? Search for logs with this ULID across all sidecars/rulers. Check access logs to object storage. Check debug/metas or meta.json of the problematic block to see what the block looks like and what its `source` is.
- Determine what you misconfigured.
Expand Down
26 changes: 15 additions & 11 deletions pkg/receive/multitsdb.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,9 @@ type MultiTSDB struct {
labels labels.Labels
bucket objstore.Bucket

mtx *sync.RWMutex
tenants map[string]*tenant
mtx *sync.RWMutex
tenants map[string]*tenant
allowOutOfOrderUpload bool
}

func NewMultiTSDB(
Expand All @@ -50,21 +51,23 @@ func NewMultiTSDB(
labels labels.Labels,
tenantLabelName string,
bucket objstore.Bucket,
allowOutOfOrderUpload bool,
) *MultiTSDB {
if l == nil {
l = log.NewNopLogger()
}

return &MultiTSDB{
dataDir: dataDir,
logger: l,
reg: reg,
tsdbOpts: tsdbOpts,
mtx: &sync.RWMutex{},
tenants: map[string]*tenant{},
labels: labels,
tenantLabelName: tenantLabelName,
bucket: bucket,
dataDir: dataDir,
logger: l,
reg: reg,
tsdbOpts: tsdbOpts,
mtx: &sync.RWMutex{},
tenants: map[string]*tenant{},
labels: labels,
tenantLabelName: tenantLabelName,
bucket: bucket,
allowOutOfOrderUpload: allowOutOfOrderUpload,
}
}

Expand Down Expand Up @@ -256,6 +259,7 @@ func (t *MultiTSDB) getOrLoadTenant(tenantID string, blockingStart bool) (*tenan
t.bucket,
func() labels.Labels { return lbls },
metadata.ReceiveSource,
t.allowOutOfOrderUpload,
)
}

Expand Down
2 changes: 2 additions & 0 deletions pkg/receive/multitsdb_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ func TestMultiTSDB(t *testing.T) {
labels.FromStrings("replica", "01"),
"tenant_id",
nil,
false,
)
defer testutil.Ok(t, m.Flush())

Expand Down Expand Up @@ -109,6 +110,7 @@ func TestMultiTSDB(t *testing.T) {
labels.FromStrings("replica", "01"),
"tenant_id",
nil,
false,
)
defer testutil.Ok(t, m.Flush())

Expand Down
Loading

0 comments on commit 3bf1397

Please sign in to comment.