Skip to content

Commit

Permalink
fix: data lost when BecomeLeader failed (#621)
Browse files Browse the repository at this point in the history
### Motivation

The status already became `LEADER` if `addFollower` failed. that will
make public server accept requests.


```
{"level":"debug","time":"2025-02-25T03:15:00.886431474Z","component":"db","key":"2f52b44b5408436894ff635087ff0f37-000000000000000013
97-00000000000000002794-00000000000000004191","namespace":"default","shard":11,"version":{"version_id":2932110,"created_timestamp":1
740453300879,"modified_timestamp":1740453300879},"time":"2025-02-25T03:15:00.886435947Z","message":"Applied put operation"}
{"level":"debug","time":"2025-02-25T03:15:00.886448265Z","component":"db","key":"__oxia/commit-offset","namespace":"default","shard"
:11,"version":{"version_id":-1,"modifications_count":2577974,"created_timestamp":1740396891493,"modified_timestamp":1740453300879},"
time":"2025-02-25T03:15:00.886453932Z","message":"Applied put operation"}
{"level":"debug","time":"2025-02-25T03:15:00.886460251Z","component":"db","key":"__oxia/last-version-id","namespace":"default","shar
d":11,"version":{"version_id":-1,"modifications_count":2577974,"created_timestamp":1740396891493,"modified_timestamp":1740453300879}
,"time":"2025-02-25T03:15:00.886464265Z","message":"Applied put operation"}
{"level":"debug","time":"2025-02-25T03:15:00.97985226Z","req":{"shard":11,"puts":[{"key":"2f52b44b5408436894ff635087ff0f37","value":
"AAAAAAAAKTg=","partition_key":"2f52b44b5408436894ff635087ff0f37","sequence_key_delta":[1,2,3]}]},"time":"2025-02-25T03:15:00.979863
138Z","message":"Got request in stream"}
{"level":"debug","time":"2025-02-25T03:15:00.9798711Z","component":"leader-controller","namespace":"default","req":{"shard":11,"puts
":[{"key":"2f52b44b5408436894ff635087ff0f37","value":"AAAAAAAAKTg=","partition_key":"2f52b44b5408436894ff635087ff0f37","sequence_key
_delta":[1,2,3]}]},"shard":11,"term":344,"time":"2025-02-25T03:15:00.979878047Z","message":"Append operation"}
{"level":"debug","time":"2025-02-25T03:15:00.98082737Z","component":"follower-cursor","follower":"data-lost-oxia-0.data-lost-oxia-he
adless:6649","namespace":"default","offset":2585821,"shard":11,"term":344,"time":"2025-02-25T03:15:00.980834674Z","message":"Sending
 entries to follower"}
{"level":"debug","time":"2025-02-25T03:15:06.3798797Z","component":"public-rpc-server","namespace":"default","peer":"10.60.20.44:598
72","shard":11,"time":"2025-02-25T03:15:06.37988677Z","message":"Write Stream request"}
{"level":"debug","time":"2025-02-25T03:15:06.379890628Z","component":"shards-director","shard":11,"time":"2025-02-25T03:15:06.379892
379Z","message":"This node is not hosting shard"}
{"level":"info","time":"2025-02-25T03:15:49.803805685Z","component":"internal-rpc-server","peer":"10.60.31.210:38304","req":{"namesp
ace":"default","shard":11,"term":345,"options":{}},"time":"2025-02-25T03:15:49.803814676Z","message":"Received NewTerm request"}
{"level":"debug","time":"2025-02-25T03:15:49.80382062Z","component":"shards-director","shard":11,"time":"2025-02-25T03:15:49.8038236
71Z","message":"This node is not hosting shard"}
{"level":"debug","time":"2025-02-25T03:15:49.803828329Z","component":"internal-rpc-server","error":{"error":"rpc error: code = Code(
107) desc = node is not follower for shard 11","kind":"*status.Error","stack":null},"peer":"10.60.31.210:38304","req":{"namespace":"
default","shard":11,"term":345,"options":{}},"time":"2025-02-25T03:15:49.803840725Z","message":"Node is not follower, getting leader
"}
{"level":"info","time":"2025-02-25T03:15:49.812867614Z","component":"pebble","shard":11,"time":"2025-02-25T03:15:49.812873089Z","mes
sage":"[JOB 1] WAL file /data/db/default/shard-11/001221.log with log number 001221 stopped reading at offset: 0; replayed 0 keys in
 0 batches"}
{"level":"info","time":"2025-02-25T03:15:49.825914038Z","component":"leader-controller","namespace":"default","shard":11,"term":344,
"time":"2025-02-25T03:15:49.825919153Z","message":"Created leader controller"}
{"level":"debug","time":"2025-02-25T03:15:49.82678302Z","component":"db","key":"__oxia/term","namespace":"default","shard":11,"versi
on":{"version_id":-1,"modifications_count":319,"created_timestamp":1740396117249,"modified_timestamp":1740453349826},"time":"2025-02
-25T03:15:49.826790546Z","message":"Applied put operation"}
{"level":"debug","time":"2025-02-25T03:15:49.826799889Z","component":"db","key":"__oxia/term-options","namespace":"default","shard":
11,"version":{"version_id":-1,"modifications_count":319,"created_timestamp":1740396117249,"modified_timestamp":1740453349826},"time"
:"2025-02-25T03:15:49.826805124Z","message":"Applied put operation"}
{"level":"info","time":"2025-02-25T03:15:49.835051901Z","component":"leader-controller","last-entry":{"term":344,"offset":2585821},"
namespace":"default","shard":11,"term":345,"time":"2025-02-25T03:15:49.835067743Z","message":"Leader successfully initialized in new
 term"}
{"level":"info","time":"2025-02-25T03:15:49.836224479Z","component":"internal-rpc-server","leaderTerm":345,"peer":"10.60.31.210:3830
4","req":{"namespace":"default","shard":11,"term":345,"options":{}},"response":{"head_entry_id":{"term":344,"offset":2585821}},"time
":"2025-02-25T03:15:49.836238087Z","message":"New term processing completed"}
{"level":"info","time":"2025-02-25T03:15:49.837994448Z","component":"internal-rpc-server","peer":"10.60.31.210:38304","request":{"na
mespace":"default","shard":11,"term":345,"replication_factor":3,"follower_maps":{"data-lost-oxia-0.data-lost-oxia-headless:6649":{"t
erm":344,"offset":2585821},"data-lost-oxia-2.data-lost-oxia-headless:6649":{"term":156,"offset":2564735}}},"time":"2025-02-25T03:15:
49.838007987Z","message":"Received BecomeLeader request"}
{"level":"debug","time":"2025-02-25T03:15:49.838202183Z","component":"leader-controller","follower":"data-lost-oxia-0.data-lost-oxia
-headless:6649","follower-head-entry":{"term":344,"offset":2585821},"leader-head-entry":{"term":344,"offset":2585821},"namespace":"d
efault","shard":11,"term":345,"time":"2025-02-25T03:15:49.838214088Z","message":"Needs truncation?"}
{"level":"info","time":"2025-02-25T03:15:49.838267883Z","component":"leader-controller","follower":"data-lost-oxia-0.data-lost-oxia-
headless:6649","follower-head-entry":{"term":344,"offset":2585821},"head-offset":2585821,"leader-election-head-entry":{"term":344,"o
ffset":2585821},"namespace":"default","shard":11,"term":345,"time":"2025-02-25T03:15:49.838276908Z","message":"Added follower"}
{"level":"debug","time":"2025-02-25T03:15:49.838284161Z","component":"leader-controller","follower":"data-lost-oxia-2.data-lost-oxia
-headless:6649","follower-head-entry":{"term":156,"offset":2564735},"leader-head-entry":{"term":344,"offset":2585821},"namespace":"d
efault","shard":11,"term":345,"time":"2025-02-25T03:15:49.838290241Z","message":"Needs truncation?"}
{"level":"info","time":"2025-02-25T03:15:49.838404107Z","ack-offset":2585821,"component":"follower-cursor","follower":"data-lost-oxi
a-0.data-lost-oxia-headless:6649","namespace":"default","shard":11,"term":345,"time":"2025-02-25T03:15:49.838416072Z","message":"Suc
cessfully attached cursor follower"}
{"level":"debug","time":"2025-02-25T03:15:49.879930854Z","component":"public-rpc-server","namespace":"default","peer":"10.60.20.44:5
9872","shard":11,"time":"2025-02-25T03:15:49.879936692Z","message":"Write Stream request"}
{"level":"debug","time":"2025-02-25T03:15:49.879959526Z","req":{"shard":11,"puts":[{"key":"2f52b44b5408436894ff635087ff0f37","value"
:"AAAAAAAAKyE=","partition_key":"2f52b44b5408436894ff635087ff0f37","sequence_key_delta":[1,2,3]}]},"time":"2025-02-25T03:15:49.87997
3537Z","message":"Got request in stream"}
{"level":"error","time":"2025-02-25T03:15:49.903652509Z","component":"leader-controller","error":{"error":"rpc error: code = Unknown
 desc = failed to truncate wal. truncate-offset: 2564844 - wal-last-offset: 2564735: oxia: offset out of bounds","kind":"*status.Err
or","stack":null},"follower":"data-lost-oxia-2.data-lost-oxia-headless:6649","follower-head-entry":null,"namespace":"default","shard
":11,"term":345,"time":"2025-02-25T03:15:49.90367001Z","message":"Failed to truncate follower"}
{"level":"warn","time":"2025-02-25T03:15:49.903678997Z","component":"internal-rpc-server","error":{"error":"rpc error: code = Unknow
n desc = failed to truncate wal. truncate-offset: 2564844 - wal-last-offset: 2564735: oxia: offset out of bounds","kind":"*status.Er
ror","stack":null},"peer":"10.60.31.210:38304","request":{"namespace":"default","shard":11,"term":345,"replication_factor":3,"follow
er_maps":{"data-lost-oxia-0.data-lost-oxia-headless:6649":{"term":344,"offset":2585821},"data-lost-oxia-2.data-lost-oxia-headless:66
49":{"term":156,"offset":2564735}}},"time":"2025-02-25T03:15:49.903691104Z","message":"BecomeLeader failed"}
{"level":"debug","time":"2025-02-25T03:15:49.903714208Z","component":"leader-controller","namespace":"default","req":{"shard":11,"pu
ts":[{"key":"2f52b44b5408436894ff635087ff0f37","value":"AAAAAAAAKyE=","partition_key":"2f52b44b5408436894ff635087ff0f37","sequence_k
ey_delta":[1,2,3]}]},"shard":11,"term":345,"time":"2025-02-25T03:15:49.903721952Z","message":"Append operation"}
{"level":"debug","time":"2025-02-25T03:15:49.904634716Z","component":"follower-cursor","follower":"data-lost-oxia-0.data-lost-oxia-h
eadless:6649","namespace":"default","offset":2585822,"shard":11,"term":345,"time":"2025-02-25T03:15:49.904641202Z","message":"Sendin
g entries to follower"}
{"level":"debug","time":"2025-02-25T03:15:49.906468805Z","component":"follower-cursor","follower":"data-lost-oxia-0.data-lost-oxia-h
eadless:6649","namespace":"default","offset":2585822,"shard":11,"term":345,"time":"2025-02-25T03:15:49.906484415Z","message":"Receiv
ed ack"}
{"level":"debug","time":"2025-02-25T03:15:49.906818814Z","component":"db","key":"2f52b44b5408436894ff635087ff0f37-000000000000000013
05-00000000000000002610-00000000000000003915","namespace":"default","shard":11,"version":{"version_id":2932018,"created_timestamp":1
740453349903,"modified_timestamp":1740453349903},"time":"2025-02-25T03:15:49.906830244Z","message":"Applied put operation"}
{"level":"debug","time":"2025-02-25T03:15:49.906881056Z","component":"db","key":"__oxia/commit-offset","namespace":"default","shard"
:11,"version":{"version_id":-1,"modifications_count":2577882,"created_timestamp":1740396891493,"modified_timestamp":1740453349903},"
time":"2025-02-25T03:15:49.90688618Z","message":"Applied put operation"}
{"level":"debug","time":"2025-02-25T03:15:49.906894687Z","component":"db","key":"__oxia/last-version-id","namespace":"default","shar
d":11,"version":{"version_id":-1,"modifications_count":2577882,"created_timestamp":1740396891493,"modified_timestamp":1740453349903}
,"time":"2025-02-25T03:15:49.906899077Z","message":"Applied put operation"}
{"level":"debug","time":"2025-02-25T03:15:49.979839865Z","req":{"shard":11,"puts":[{"key":"2f52b44b5408436894ff635087ff0f37","value"
:"AAAAAAAAKyI=","partition_key":"2f52b44b5408436894ff635087ff0f37","sequence_key_delta":[1,2,3]}]},"time":"2025-02-25T03:15:49.97984
5378Z","message":"Got request in stream"}

```

### Modification

- Change status to `ServingStatus_LEADER` when all the steps done.
  • Loading branch information
mattisonchao authored Feb 26, 2025
1 parent f015623 commit fad311f
Showing 1 changed file with 2 additions and 1 deletion.
3 changes: 2 additions & 1 deletion server/leader_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,6 @@ func (lc *leaderController) BecomeLeader(ctx context.Context, req *proto.BecomeL
return nil, common.ErrorInvalidTerm
}

lc.status = proto.ServingStatus_LEADER
lc.replicationFactor = req.GetReplicationFactor()
lc.followers = make(map[string]FollowerCursor)

Expand Down Expand Up @@ -362,6 +361,8 @@ func (lc *leaderController) BecomeLeader(ctx context.Context, req *proto.BecomeL
slog.Int64("term", lc.term),
slog.Int64("head-offset", lc.leaderElectionHeadEntryId.Offset),
)

lc.status = proto.ServingStatus_LEADER
return &proto.BecomeLeaderResponse{}, nil
}

Expand Down

0 comments on commit fad311f

Please sign in to comment.