-
Notifications
You must be signed in to change notification settings - Fork 4.1k
storage: RHS HardState clobbering during splits #16749
Description
observed this curious behavior recently where we seem to clobber on-disk HardState; naively this should not happen. See #7600 for a strikingly similar occurrence.
The following diff adding some extra instrumentation, coupled with this test demonstrates the clobbering behavior.
They are both available on this branch, TestMinimal is just a simplified version of the existing TestStoreRangeSplitRaceUninitializedRHS test but with only one split operation for simplified, albeit hacky, logging.
commit fe0203a96d35d4ffd8c3ae6ba55da8b98f8b1651
Author: irfan sharif <irfanmahmoudsharif@gmail.com>
Date: Tue Jun 27 00:38:53 2017 -0400
storage: debug hardstate clobbering
diff --git a/pkg/storage/replica.go b/pkg/storage/replica.go
index f8a6dd9b0..e04324999 100644
--- a/pkg/storage/replica.go
+++ b/pkg/storage/replica.go
@@ -3112,6 +3112,9 @@ func (r *Replica) handleRaftReadyRaftMuLocked(
if err := batch.Commit(syncRaftLog.Get() && rd.MustSync); err != nil {
return stats, err
}
+ if !raft.IsEmptyHardState(rd.HardState) {
+ log.Infof(ctx, "set hs: %+v", rd.HardState)
+ }
elapsed := timeutil.Since(start)
r.store.metrics.RaftLogCommitLatency.RecordValue(elapsed.Nanoseconds())
@@ -4195,10 +4198,27 @@ func (r *Replica) applyRaftCommand(
writer.Close()
start := timeutil.Now()
+
+ oldHS, err := loadHardState(ctx, r.store.Engine(), roachpb.RangeID(2))
+ if err != nil {
+ panic(err)
+ }
if err := batch.Commit(false); err != nil {
return enginepb.MVCCStats{}, roachpb.NewError(NewReplicaCorruptionError(
errors.Wrap(err, "could not commit batch")))
}
+
+ // This is the hard state received through raft.
+ newHS, err := loadHardState(ctx, r.store.Engine(), roachpb.RangeID(2))
+ if err != nil {
+ panic(err)
+ }
+
+ if newHS.Term < oldHS.Term || (newHS.Term == oldHS.Term && newHS.Commit < oldHS.Commit) {
+ log.Warningf(ctx, "previously: %+v", oldHS)
+ log.Warningf(ctx, "over-written with: %+v", newHS)
+ log.Warningf(ctx, "clobbered hard state: %s", pretty.Diff(newHS, oldHS))
+ }
elapsed := timeutil.Since(start)
r.store.metrics.RaftCommandCommitLatency.RecordValue(elapsed.Nanoseconds())
return rResult.Delta, nil
diff --git a/pkg/storage/replica_command.go b/pkg/storage/replica_command.go
index 9b173619a..ef33f4be2 100644
--- a/pkg/storage/replica_command.go
+++ b/pkg/storage/replica_command.go
@@ -3033,6 +3033,9 @@ func splitTrigger(
if err != nil {
return enginepb.MVCCStats{}, EvalResult{}, errors.Wrap(err, "unable to load hard state")
}
+
+ log.Warningf(ctx, "current hard state, deliver through raft: %+v", oldHS)
+
// Initialize the right-hand lease to be the same as the left-hand lease.
// Various pieces of code rely on a replica's lease never being unitialized,
// but it's more than that - it ensures that we properly initialize the
Observing the logs for make test PKG=./pkg/storage TESTFLAGS="-v" TESTS=TestMinimal:
I170627 16:43:16.470806 228 storage/replica.go:3116 [s1,r2/1:{-}] set hs: {Term:9 Vote:2 Commit:0 XXX_unrecognized:[]}
I170627 16:43:16.470983 228 storage/replica.go:3116 [s1,r2/1:{-}] set hs: {Term:10 Vote:2 Commit:0 XXX_unrecognized:[]}
W170627 16:43:16.471264 218 storage/replica_command.go:3037 [s1,r1/1:/M{in-ax}] current hard state, deliver through raft: {Term:10 Vote:2 Commit:0 XXX_unrecognized:[]}
I170627 16:43:16.471365 228 storage/replica.go:3116 [s1,r2/1:{-}] set hs: {Term:11 Vote:2 Commit:0 XXX_unrecognized:[]}
I170627 16:43:16.471677 228 storage/replica.go:3116 [s1,r2/1:{-}] set hs: {Term:12 Vote:2 Commit:0 XXX_unrecognized:[]}
I170627 16:43:16.472065 228 storage/replica.go:3116 [s1,r2/1:{-}] set hs: {Term:13 Vote:2 Commit:0 XXX_unrecognized:[]}
W170627 16:43:16.472636 74 storage/replica.go:4218 [s1,r1/1:/M{in-ax}] previously: {Term:13 Vote:2 Commit:0 XXX_unrecognized:[]}
W170627 16:43:16.472677 74 storage/replica.go:4219 [s1,r1/1:/M{in-ax}] over-written with: {Term:10 Vote:2 Commit:10 XXX_unrecognized:[]}
W170627 16:43:16.472716 74 storage/replica.go:4220 [s1,r1/1:/M{in-ax}] clobbered hard state: [Term: 10 != 13 Commit: 10 != 0]
I170627 16:43:16.473938 228 storage/replica.go:3116 [s1,r2/1:{a-/Max}] set hs: {Term:14 Vote:0 Commit:10 XXX_unrecognized:[]}
I170627 16:43:16.474799 155 storage/replica.go:3116 [s2,r2/2:{a-/Max}] set hs: {Term:14 Vote:0 Commit:10 XXX_unrecognized:[]}
Important bit: clobbered hard state: [Term: 10 != 13 Commit: 10 != 0]
Also interesting to note is that following this we have set hs: {Term:14 Vote:0 Commit:10 XXX_unrecognized:[]}, not seeing the effect of the HardState reset.