-
Notifications
You must be signed in to change notification settings - Fork 4.1k
storage: unexpected Raft re-proposals during split transaction #10160
Description
@bdarnell this simple test (just add it to the pkg/sql directory) exhibits behavior I don't understand. I have a triplicated cluster and create a table. I then wait for the table to be split along the expected boundary. Most times I run it, it spends 3-5s waiting for Raft reproposals. I've done a fair bit of digging, and what happens is very consistent: the lost Raft batch includes just the start of the txn, which adjusts the LHS RangeDescriptor. After a few hours tracking it this far, I felt it'd be more reasonable to turn this over to the expert.
Sometimes it takes about 10s to run (seems to be a race related to not adding to the split queue), and other times it takes 25s to run (not sure about that case as it's rare and I lost the logs).
package sql_test
import (
"context"
"testing"
"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/pkg/errors"
)
// TestSlowSplit starts a three-node test cluster, creates a table, and then
// polls until the range containing the table's first row begins exactly at the
// table's key prefix, i.e. until the table has been split into its own range.
// It exists to reproduce slow splits; the only assertion is that the split
// eventually happens within the SucceedsSoon retry window.
func TestSlowSplit(t *testing.T) {
	defer leaktest.AfterTest(t)()

	ctx := context.TODO()

	// Key prefix of the first user-created table; the split is expected to
	// land on this boundary.
	tableStartKey := keys.MakeTablePrefix(51 /* initial table ID */)

	testClusterArgs := base.TestClusterArgs{
		ReplicationMode: base.ReplicationAuto,
	}
	tc := testcluster.StartTestCluster(t, 3, testClusterArgs)
	defer tc.Stopper().Stop()

	// Nothing below is meaningful unless replication completed, so abort
	// immediately instead of merely recording the failure and carrying on.
	if err := tc.WaitForFullReplication(); err != nil {
		t.Fatal(err)
	}

	sqlDB := sqlutils.MakeSQLRunner(t, tc.Conns[0])
	// The returned results are not needed.
	// NOTE(review): presumably the runner fails the test itself on SQL
	// errors — confirm against sqlutils.
	_ = sqlDB.Exec(`CREATE DATABASE test`)
	_ = sqlDB.Exec(`CREATE TABLE test.t (k SERIAL PRIMARY KEY, v INT)`)
	log.Infof(ctx, "created table")

	// Wait for new table to split.
	util.SucceedsSoon(t, func() error {
		desc, err := tc.LookupRange(keys.MakeRowSentinelKey(tableStartKey))
		if err != nil {
			// A failed lookup is treated as fatal rather than retried.
			t.Fatal(err)
		}
		if !desc.StartKey.Equal(tableStartKey) {
			// Not split yet: return an error so the check is retried.
			log.Infof(ctx, "waiting on split results")
			return errors.Errorf("expected range start key %s; got %s", tableStartKey, desc.StartKey)
		}
		return nil
	})
}