Skip to content

Commit

Permalink
Prevent checkpoints during snapshots
Browse files Browse the repository at this point in the history
Use a mutex that is held during snapshots and only try locking it during
checkpoints.

If the database is under high write pressure and taking a snapshot takes
a long time, it is possible for the automatic checkpointing to escalate
from PASSIVE to RESTART.

When a RESTART checkpoint is issued, SQLite will block new read
transactions and wait for the existing ones to finish. However, during a
snapshot Litestream keeps a persistent read transaction open until the
snapshot finishes. This in turn creates a deadlock-like situation for the
checkpointer: the RESTART checkpoint keeps blocking writers for as long
as the snapshot is being written out.

This failure condition doesn't break anything permanently, but it does
leave the application unable to acquire the write lock until the
snapshot finishes.
  • Loading branch information
hifi committed May 7, 2023
1 parent d161d14 commit 18ef271
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 0 deletions.
20 changes: 20 additions & 0 deletions db.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ type DB struct {
rtx *sql.Tx // long running read transaction
pageSize int // page size, in bytes
notify chan struct{} // closes on WAL change
chkMu sync.Mutex // checkpoint lock

fileInfo os.FileInfo // db info cached during init
dirInfo os.FileInfo // parent dir info cached during init
Expand Down Expand Up @@ -1288,6 +1289,12 @@ func (db *DB) Checkpoint(ctx context.Context, mode string) (err error) {
// checkpoint performs a checkpoint on the WAL file and initializes a
// new shadow WAL file.
func (db *DB) checkpoint(ctx context.Context, generation, mode string) error {
// Try getting a checkpoint lock, will fail during snapshots.
if !db.chkMu.TryLock() {
return nil
}
defer db.chkMu.Unlock()

shadowWALPath, err := db.CurrentShadowWALPath(generation)
if err != nil {
return err
Expand Down Expand Up @@ -1523,6 +1530,19 @@ func (db *DB) CRC64(ctx context.Context) (uint64, Pos, error) {
return h.Sum64(), pos, nil
}

// BeginSnapshot acquires the checkpoint mutex, blocking until it becomes
// available. While the mutex is held, checkpoint cannot obtain it via
// TryLock and returns without performing any work.
//
// Every call must be paired with a subsequent call to EndSnapshot once the
// snapshot has completed; otherwise checkpoints remain disabled.
func (db *DB) BeginSnapshot() {
	db.chkMu.Lock()
}

// EndSnapshot unlocks the checkpoint mutex taken by BeginSnapshot, allowing
// checkpoints to proceed again.
//
// It must only be called after a matching BeginSnapshot; unlocking a mutex
// that is not locked is a run-time error.
func (db *DB) EndSnapshot() {
	db.chkMu.Unlock()
}

// DefaultRestoreParallelism is the default level of parallelism used when
// downloading WAL files.
//
// NOTE(review): presumably this applies to restores that do not configure an
// explicit parallelism value — confirm against the restore options.
const DefaultRestoreParallelism = 8

Expand Down
4 changes: 4 additions & 0 deletions replica.go
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,10 @@ func (r *Replica) Snapshot(ctx context.Context) (info SnapshotInfo, err error) {
r.muf.Lock()
defer r.muf.Unlock()

// Prevent checkpoints during snapshot.
r.db.BeginSnapshot()
defer r.db.EndSnapshot()

// Issue a passive checkpoint to flush any pages to disk before snapshotting.
if _, err := r.db.db.ExecContext(ctx, `PRAGMA wal_checkpoint(PASSIVE);`); err != nil {
return info, fmt.Errorf("pre-snapshot checkpoint: %w", err)
Expand Down

0 comments on commit 18ef271

Please sign in to comment.