
kvserver: e2e flow control for raft messages #79755

@tbg

Description

Is your feature request related to a problem? Please describe.

The flow control for raft messages is currently overly simplistic. A high-level overview of the current state follows.

Sending

Raft messages originate in a raft.Ready:

if hasReady = raftGroup.HasReady(); hasReady {
	rd = raftGroup.Ready()
}

and are passed to the RaftTransport queues here:

if r.maybeCoalesceHeartbeat(ctx, msg, toReplica, fromReplica, false, nil) {
	return
}
req := newRaftMessageRequest()
*req = kvserverpb.RaftMessageRequest{
	RangeID:       r.RangeID,
	ToReplica:     toReplica,
	FromReplica:   fromReplica,
	Message:       msg,
	RangeStartKey: startKey, // usually nil
}
if !r.sendRaftMessageRequest(ctx, req) {
	if err := r.withRaftGroup(true, func(raftGroup *raft.RawNode) (bool, error) {
		r.mu.droppedMessages++
		raftGroup.ReportUnreachable(msg.To)
		return true, nil
	}); err != nil && !errors.Is(err, errRemoved) {
		log.Fatalf(ctx, "%v", err)
	}
}

There is a single queue for messages to each destination store (irrespective of rangeID), which is essentially a 10k-buffered channel:

ch := make(chan *kvserverpb.RaftMessageRequest, raftSendBufferSize)
value, ok = queuesMap.LoadOrStore(int64(nodeID), unsafe.Pointer(&ch))

Batches of messages are then pulled from this queue and put on the wire:

case req := <-ch:
	budget := targetRaftOutgoingBatchSize.Get(&t.st.SV) - int64(req.Size())
	batch.Requests = append(batch.Requests, *req)
	releaseRaftMessageRequest(req)
	// Pull off as many queued requests as possible, within reason.
	for budget > 0 {
		select {
		case req = <-ch:
			budget -= int64(req.Size())
			batch.Requests = append(batch.Requests, *req)
			releaseRaftMessageRequest(req)
		default:
			budget = -1
		}
	}
	err := stream.Send(batch)
	if err != nil {
		return err
	}

Receiving

Messages arrive here:

for i := range batch.Requests {
	req := &batch.Requests[i]
	atomic.AddInt64(&stats.serverRecv, 1)
	if pErr := t.handleRaftRequest(ctx, req, stream); pErr != nil {
		atomic.AddInt64(&stats.serverSent, 1)
		if err := stream.Send(newRaftMessageResponse(req, pErr)); err != nil {
			return err
		}
	}
}

and are put on the raft receive queue:

s.metrics.RaftRcvdMessages[req.Message.Type].Inc(1)
value, ok := s.replicaQueues.Load(int64(req.RangeID))
if !ok {
	value, _ = s.replicaQueues.LoadOrStore(int64(req.RangeID), unsafe.Pointer(&raftRequestQueue{}))
}
q := (*raftRequestQueue)(value)
q.Lock()
defer q.Unlock()
if len(q.infos) >= replicaRequestQueueSize {
	// TODO(peter): Return an error indicating the request was dropped. Note
	// that dropping the request is safe. Raft will retry.
	s.metrics.RaftRcvdMsgDropped.Inc(1)
	return false
}
q.infos = append(q.infos, raftRequestInfo{
	req:        req,
	respStream: respStream,
})

The next available raft scheduler goroutine will pick up the nonempty queue in

func (s *Store) processRequestQueue(ctx context.Context, rangeID roachpb.RangeID) bool {
	value, ok := s.replicaQueues.Load(int64(rangeID))
	if !ok {
		return false
	}
	q := (*raftRequestQueue)(value)
	infos, ok := q.drain()
	if !ok {
		return false
	}
	defer q.recycle(infos)
	var hadError bool
	for i := range infos {
		info := &infos[i]
		if pErr := s.withReplicaForRequest(
			ctx, info.req, func(_ context.Context, r *Replica) *roachpb.Error {
				return s.processRaftRequestWithReplica(r.raftCtx, r, info.req)
			},

and hands it to the raft group (which will not perform work yet, but instead stages everything for the next Ready):

if err := r.stepRaftGroup(req); err != nil {
return roachpb.NewError(err)
}

Finally, a scheduler goroutine will actually process the replica for raft ready handling:

func (s *Store) processReady(rangeID roachpb.RangeID) {
	r, ok := s.mu.replicasByRangeID.Load(rangeID)
	if !ok {
		return
	}
	ctx := r.raftCtx
	start := timeutil.Now()
	stats, expl, err := r.handleRaftReady(ctx, noSnap)
	maybeFatalOnRaftReadyErr(ctx, expl, err)

which is where the actual I/O happens and the staged in-memory state is flushed to disk.

Describe the solution you'd like

Introduce a model that more deliberately handles the case in which a follower receives more load than it can handle: control memory usage while prioritizing heartbeats (and, more generally, allowing QoS), without introducing excessive performance cliffs. Integrate with admission control.

See also #79215.

Jira issue: CRDB-15932

Epic CRDB-15069

Labels: A-admission-control, C-enhancement