Skip to content

Commit fc2d180

Browse files
committed
kvserver,clusterversion: version-gate lease upgrade logic
Informs cockroachdb#88301. EnableLeaseUpgrade version gates a change in the lease transfer protocol whereby we only ever transfer expiration-based leases (and have recipients later upgrade them to the more efficient epoch based ones). This was done to limit the effects of ill-advised lease transfers since the incoming leaseholder would need to recognize itself as such within a few seconds. This needs version gating so that in mixed-version clusters, as part of lease transfers, we don't start sending out expiration based leases to nodes that (i) don't expect them for certain keyspans, and (ii) don't know to upgrade them to efficient epoch-based ones. While here, we also introduce a (hidden, default=true) cluster setting kv.transfer_expiration_leases_first.enabled to feature gate this protocol change.
1 parent 6450a9d commit fc2d180

6 files changed

Lines changed: 102 additions & 5 deletions

File tree

docs/generated/settings/settings-for-tenants.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -295,4 +295,4 @@ trace.jaeger.agent string the address of a Jaeger agent to receive traces using
295295
trace.opentelemetry.collector string address of an OpenTelemetry trace collector to receive traces using the otel gRPC protocol, as <host>:<port>. If no port is specified, 4317 will be used.
296296
trace.span_registry.enabled boolean true if set, ongoing traces can be seen at https://<ui>/#/debug/tracez
297297
trace.zipkin.collector string the address of a Zipkin instance to receive traces, as <host>:<port>. If no port is specified, 9411 will be used.
298-
version version 1000022.1-70 set the active cluster version in the format '<major>.<minor>'
298+
version version 1000022.1-72 set the active cluster version in the format '<major>.<minor>'

docs/generated/settings/settings.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,6 @@
229229
<tr><td><code>trace.opentelemetry.collector</code></td><td>string</td><td><code></code></td><td>address of an OpenTelemetry trace collector to receive traces using the otel gRPC protocol, as <host>:<port>. If no port is specified, 4317 will be used.</td></tr>
230230
<tr><td><code>trace.span_registry.enabled</code></td><td>boolean</td><td><code>true</code></td><td>if set, ongoing traces can be seen at https://<ui>/#/debug/tracez</td></tr>
231231
<tr><td><code>trace.zipkin.collector</code></td><td>string</td><td><code></code></td><td>the address of a Zipkin instance to receive traces, as <host>:<port>. If no port is specified, 9411 will be used.</td></tr>
232-
<tr><td><code>version</code></td><td>version</td><td><code>1000022.1-70</code></td><td>set the active cluster version in the format '<major>.<minor>'</td></tr>
232+
<tr><td><code>version</code></td><td>version</td><td><code>1000022.1-72</code></td><td>set the active cluster version in the format '<major>.<minor>'</td></tr>
233233
</tbody>
234234
</table>

pkg/clusterversion/cockroach_versions.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,16 @@ const (
288288
// version is enabled, the receiver will look at the priority of snapshots
289289
// using the fields added in 22.2.
290290
PrioritizeSnapshots
291+
// EnableLeaseUpgrade version gates a change in the lease transfer protocol
292+
// whereby we only ever transfer expiration-based leases (and have
293+
// recipients later upgrade them to the more efficient epoch based ones).
294+
// This was done to limit the effects of ill-advised lease transfers since
295+
// the incoming leaseholder would need to recognize itself as such within a
296+
// few seconds. This needs version gating so that in mixed-version clusters,
297+
// as part of lease transfers, we don't start sending out expiration based
298+
// leases to nodes that (i) don't expect them for certain keyspans, and (ii)
299+
// don't know to upgrade them to efficient epoch-based ones.
300+
EnableLeaseUpgrade
291301

292302
// *************************************************
293303
// Step (1): Add new versions here.
@@ -467,6 +477,10 @@ var rawVersionsSingleton = keyedVersions{
467477
Key: PrioritizeSnapshots,
468478
Version: roachpb.Version{Major: 22, Minor: 1, Internal: 70},
469479
},
480+
{
481+
Key: EnableLeaseUpgrade,
482+
Version: roachpb.Version{Major: 22, Minor: 1, Internal: 72},
483+
},
470484

471485
// *************************************************
472486
// Step (2): Add new versions here.

pkg/kv/kvserver/client_lease_test.go

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"time"
2323

2424
"github.com/cockroachdb/cockroach/pkg/base"
25+
"github.com/cockroachdb/cockroach/pkg/clusterversion"
2526
"github.com/cockroachdb/cockroach/pkg/config/zonepb"
2627
"github.com/cockroachdb/cockroach/pkg/gossip"
2728
"github.com/cockroachdb/cockroach/pkg/keys"
@@ -1364,7 +1365,6 @@ func TestLeaseTransfersUseExpirationLeasesAndBumpToEpochBasedOnes(t *testing.T)
13641365
if !li.Current().OwnedBy(n2.GetFirstStoreID()) {
13651366
return errors.New("lease still owned by n1")
13661367
}
1367-
require.Equal(t, roachpb.LeaseExpiration, li.Current().Type())
13681368
return nil
13691369
})
13701370

@@ -1376,3 +1376,74 @@ func TestLeaseTransfersUseExpirationLeasesAndBumpToEpochBasedOnes(t *testing.T)
13761376
defer mu.Unlock()
13771377
require.Equal(t, roachpb.LeaseExpiration, mu.lease.Type())
13781378
}
1379+
1380+
// TestLeaseUpgradeVersionGate tests the version gating for the lease-upgrade
1381+
// process.
1382+
//
1383+
// TODO(irfansharif): Delete this in 23.1 (or whenever we get rid of the
1384+
// clusterversion.EnableLeaseUpgrade).
1385+
func TestLeaseUpgradeVersionGate(t *testing.T) {
1386+
defer leaktest.AfterTest(t)()
1387+
defer log.Scope(t).Close(t)
1388+
1389+
ctx := context.Background()
1390+
st := cluster.MakeTestingClusterSettingsWithVersions(
1391+
clusterversion.TestingBinaryVersion,
1392+
clusterversion.ByKey(clusterversion.EnableLeaseUpgrade-1),
1393+
false, /* initializeVersion */
1394+
)
1395+
tci := serverutils.StartNewTestCluster(t, 2, base.TestClusterArgs{
1396+
ReplicationMode: base.ReplicationManual,
1397+
ServerArgs: base.TestServerArgs{
1398+
Settings: st,
1399+
Knobs: base.TestingKnobs{
1400+
Server: &server.TestingKnobs{
1401+
DisableAutomaticVersionUpgrade: make(chan struct{}),
1402+
BinaryVersionOverride: clusterversion.ByKey(clusterversion.EnableLeaseUpgrade - 1),
1403+
},
1404+
},
1405+
},
1406+
})
1407+
tc := tci.(*testcluster.TestCluster)
1408+
defer tc.Stopper().Stop(ctx)
1409+
1410+
scratchKey := tc.ScratchRange(t)
1411+
n1, n1Target := tc.Server(0), tc.Target(0)
1412+
n2, n2Target := tc.Server(1), tc.Target(1)
1413+
1414+
// Add a replica; we're going to move the lease to and from it below.
1415+
desc := tc.AddVotersOrFatal(t, scratchKey, n2Target)
1416+
1417+
// Transfer the lease from n1 to n2. It should be transferred as an
1418+
// epoch-based one since we've not upgraded past
1419+
// clusterversion.EnableLeaseUpgrade yet.
1420+
tc.TransferRangeLeaseOrFatal(t, desc, n2Target)
1421+
testutils.SucceedsSoon(t, func() error {
1422+
li, _, err := tc.FindRangeLeaseEx(ctx, desc, nil)
1423+
require.NoError(t, err)
1424+
if !li.Current().OwnedBy(n2.GetFirstStoreID()) {
1425+
return errors.New("lease still owned by n1")
1426+
}
1427+
require.Equal(t, roachpb.LeaseEpoch, li.Current().Type())
1428+
return nil
1429+
})
1430+
1431+
// Enable the version gate.
1432+
_, err := tc.Conns[0].ExecContext(ctx, `SET CLUSTER SETTING version = $1`,
1433+
clusterversion.ByKey(clusterversion.EnableLeaseUpgrade).String())
1434+
require.NoError(t, err)
1435+
1436+
// Transfer the lease back from n2 to n1. It should be transferred as an
1437+
// expiration-based lease that's later upgraded to an epoch based one now
1438+
// that we're past the version gate.
1439+
tc.TransferRangeLeaseOrFatal(t, desc, n1Target)
1440+
testutils.SucceedsSoon(t, func() error {
1441+
li, _, err := tc.FindRangeLeaseEx(ctx, desc, nil)
1442+
require.NoError(t, err)
1443+
if !li.Current().OwnedBy(n1.GetFirstStoreID()) {
1444+
return errors.New("lease still owned by n2")
1445+
}
1446+
return nil
1447+
})
1448+
tc.WaitForLeaseUpgrade(ctx, t, desc)
1449+
}

pkg/kv/kvserver/replica_range_lease.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,14 @@ import (
4848
"time"
4949

5050
"github.com/cockroachdb/cockroach/pkg/base"
51+
"github.com/cockroachdb/cockroach/pkg/clusterversion"
5152
"github.com/cockroachdb/cockroach/pkg/keys"
5253
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/constraint"
5354
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
5455
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness"
5556
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/raftutil"
5657
"github.com/cockroachdb/cockroach/pkg/roachpb"
58+
"github.com/cockroachdb/cockroach/pkg/settings"
5759
"github.com/cockroachdb/cockroach/pkg/util/hlc"
5860
"github.com/cockroachdb/cockroach/pkg/util/log"
5961
"github.com/cockroachdb/cockroach/pkg/util/retry"
@@ -65,6 +67,13 @@ import (
6567
"github.com/cockroachdb/redact"
6668
)
6769

70+
var transferExpirationLeasesFirstEnabled = settings.RegisterBoolSetting(
71+
settings.SystemOnly,
72+
"kv.transfer_expiration_leases_first.enabled",
73+
"controls whether we transfer expiration-based leases that are later upgraded to epoch-based ones",
74+
true,
75+
)
76+
6877
var leaseStatusLogLimiter = func() *log.EveryN {
6978
e := log.Every(15 * time.Second)
7079
e.ShouldLog() // waste the first shot
@@ -246,7 +255,10 @@ func (p *pendingLeaseRequest) InitOrJoinRequest(
246255
ProposedTS: &status.Now,
247256
}
248257

249-
if p.repl.requiresExpiringLeaseRLocked() || transfer {
258+
if p.repl.requiresExpiringLeaseRLocked() ||
259+
(transfer &&
260+
transferExpirationLeasesFirstEnabled.Get(&p.repl.store.ClusterSettings().SV) &&
261+
p.repl.store.ClusterSettings().Version.IsActive(ctx, clusterversion.EnableLeaseUpgrade)) {
250262
// In addition to ranges that unconditionally require expiration-based
251263
// leases (node liveness and earlier), we also use them during lease
252264
// transfers for all other ranges. After acquiring these expiration

pkg/sql/logictest/logictestbase/logictestbase.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -463,7 +463,7 @@ var LogicTestConfigs = []TestClusterConfig{
463463
NumNodes: 1,
464464
OverrideDistSQLMode: "off",
465465
BootstrapVersion: clusterversion.ByKey(clusterversion.V22_1),
466-
BinaryVersion: clusterversion.ByKey(clusterversion.PrioritizeSnapshots), //TODO: switch to 22.2.
466+
BinaryVersion: clusterversion.ByKey(clusterversion.EnableLeaseUpgrade), // TODO(dt): switch to 22.2.
467467
DisableUpgrade: true,
468468
DeclarativeCorpusCollection: true,
469469
},

0 commit comments

Comments
 (0)