perf(blockstore): Add LRU caches to blockstore operations used in consensus (backport cometbft#3003) (cometbft#3083)

mergify[bot] · ValarDragon · melekes · ValarDragon · commit 0c10bd5a6d0e · 2024-05-25T11:48:37.000+09:00
Closes cometbft#2844 We are seeing that the blockstore loading operations get used in hot loops within gossip routines, and queryMaj23 routines. This PR reduces that overhead using an LRU cache. The LRU cache does have a mutex on every get, but the time with the LRU cache is 9x faster than without (before even adding in DB overheads), due to the proto unmarshalling saved. We could imagine a setup where we avoided a lock there entirely. I don't think this is worth right now, since the new code is 9x faster, and these mostly appear in catchup code which should not be highly contended for across peers at the same time. With the new benchmark I added: OLD: ``` BenchmarkRepeatedLoadSeenCommit-12 24447 54691 ns/op 46495 B/op 319 allocs/op ``` NEW: ``` BenchmarkRepeatedLoadSeenCommit-12 224131 6401 ns/op 8320 B/op 2 allocs/op ``` It turns out these gossip routines don't need mutative copies, so we could optimize out the large allocation in the future if we want. 1 hour cpu profile that shows this appearing in prod: ![image](https://github.com/cometbft/cometbft/assets/6440154/5a7e0f02-8385-4c01-aa6a-dba2a2bf376d) The state machine execution time here for context is 92 seconds. So this is adding up in system load (and GC! The GC load is 52GB, the entire trace is 200GB, with other parts being optimized down from recent PRs) --- - [ ] Tests written/updated - [ ] Changelog entry added in `.changelog` (we use [unclog](https://github.com/informalsystems/unclog) to manage our changelog) - [ ] Updated relevant documentation (`docs/` or `spec/`) and code comments - [ ] Title follows the [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/) spec <hr>This is an automatic backport of pull request cometbft#3003 done by [Mergify](https://mergify.com). --------- Co-authored-by: Dev Ojha <ValarDragon@users.noreply.github.com> Co-authored-by: Anton Kaliaev <anton.kalyaev@gmail.com>
diff --git a/.changelog/unreleased/improvements/3003-use-lru-caches-in-blockstore.md b/.changelog/unreleased/improvements/3003-use-lru-caches-in-blockstore.md
@@ -0,0 +1,2 @@
+- [`blockstore`] Use LRU caches in blockstore, significiantly improving consensus gossip routine performance
+  ([\#3003](https://github.com/cometbft/cometbft/issues/3003)
diff --git a/.golangci.yml b/.golangci.yml
@@ -64,6 +64,7 @@ linters-settings:
           - github.com/google
           - github.com/gorilla/websocket
           - github.com/informalsystems/tm-load-test/pkg/loadtest
+          - github.com/hashicorp/golang-lru/v2
           - github.com/lib/pq
           - github.com/libp2p/go-buffer-pool
           - github.com/Masterminds/semver/v3
diff --git a/go.mod b/go.mod
@@ -52,6 +52,7 @@ require (
 	github.com/cosmos/gogoproto v1.4.2
 	github.com/go-git/go-git/v5 v5.11.0
 	github.com/golang/protobuf v1.5.3
+	github.com/hashicorp/golang-lru/v2 v2.0.7
 	github.com/oasisprotocol/curve25519-voi v0.0.0-20220708102147-0a8a51822cae
 	github.com/vektra/mockery/v2 v2.14.0
 	golang.org/x/sync v0.3.0
diff --git a/go.sum b/go.sum
@@ -494,6 +494,8 @@ github.com/hashicorp/go-version v1.6.0 h1:feTTfFNnjP967rlCxM/I9g701jU+RN74YKx2mO
 github.com/hashicorp/go-version v1.6.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA=
 github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
 github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
+github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
+github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
 github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=
 github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
 github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM=
diff --git a/store/bench_test.go b/store/bench_test.go
@@ -0,0 +1,37 @@
+package store
+
+import (
+	"bytes"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	"github.com/cometbft/cometbft/internal/test"
+	"github.com/cometbft/cometbft/libs/log"
+	"github.com/cometbft/cometbft/types"
+	cmttime "github.com/cometbft/cometbft/types/time"
+)
+
+// TestLoadBlockExtendedCommit tests loading the extended commit for a previously
+// saved block. The load method should return nil when only a commit was saved and
+// return the extended commit otherwise.
+func BenchmarkRepeatedLoadSeenCommitSameBlock(b *testing.B) {
+	state, bs, cleanup := makeStateAndBlockStore(log.NewTMLogger(new(bytes.Buffer)))
+	defer cleanup()
+	h := bs.Height() + 1
+	block := state.MakeBlock(h, test.MakeNTxs(h, 10), new(types.Commit), nil, state.Validators.GetProposer().Address)
+	seenCommit := makeTestCommit(block.Header.Height, cmttime.Now())
+	ps, err := block.MakePartSet(types.BlockPartSizeBytes)
+	require.NoError(b, err)
+	bs.SaveBlock(block, ps, seenCommit)
+
+	// sanity check
+	res := bs.LoadSeenCommit(block.Height)
+	require.Equal(b, seenCommit, res)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		res := bs.LoadSeenCommit(block.Height)
+		require.NotNil(b, res)
+	}
+}
diff --git a/store/store.go b/store/store.go
@@ -4,6 +4,8 @@ import (
 	"fmt"
 	"strconv"
 
+	lru "github.com/hashicorp/golang-lru/v2"
+
 	dbm "github.com/cometbft/cometbft-db"
 	"github.com/cosmos/gogoproto/proto"
 
@@ -41,17 +43,35 @@ type BlockStore struct {
 	mtx    cmtsync.RWMutex
 	base   int64
 	height int64
+
+	seenCommitCache  *lru.Cache[int64, *types.Commit]
+	blockCommitCache *lru.Cache[int64, *types.Commit]
 }
 
 // NewBlockStore returns a new BlockStore with the given DB,
 // initialized to the last height that was committed to the DB.
 func NewBlockStore(db dbm.DB) *BlockStore {
 	bs := LoadBlockStoreState(db)
-	return &BlockStore{
+	bStore := &BlockStore{
 		base:   bs.Base,
 		height: bs.Height,
 		db:     db,
 	}
+	bStore.addCaches()
+	return bStore
+}
+
+func (bs *BlockStore) addCaches() {
+	var err error
+	// err can only occur if the argument is non-positive, so is impossible in context.
+	bs.blockCommitCache, err = lru.New[int64, *types.Commit](100)
+	if err != nil {
+		panic(err)
+	}
+	bs.seenCommitCache, err = lru.New[int64, *types.Commit](100)
+	if err != nil {
+		panic(err)
+	}
 }
 
 func (bs *BlockStore) IsEmpty() bool {
@@ -224,6 +244,10 @@ func (bs *BlockStore) LoadBlockMetaByHash(hash []byte) *types.BlockMeta {
 // and it comes from the block.LastCommit for `height+1`.
 // If no commit is found for the given height, it returns nil.
 func (bs *BlockStore) LoadBlockCommit(height int64) *types.Commit {
+	comm, ok := bs.blockCommitCache.Get(height)
+	if ok {
+		return comm.Clone()
+	}
 	pbc := new(cmtproto.Commit)
 	bz, err := bs.db.Get(calcBlockCommitKey(height))
 	if err != nil {
@@ -240,13 +264,18 @@ func (bs *BlockStore) LoadBlockCommit(height int64) *types.Commit {
 	if err != nil {
 		panic(fmt.Sprintf("Error reading block commit: %v", err))
 	}
-	return commit
+	bs.blockCommitCache.Add(height, commit)
+	return commit.Clone()
 }
 
 // LoadSeenCommit returns the locally seen Commit for the given height.
 // This is useful when we've seen a commit, but there has not yet been
 // a new block at `height + 1` that includes this commit in its block.LastCommit.
 func (bs *BlockStore) LoadSeenCommit(height int64) *types.Commit {
+	comm, ok := bs.seenCommitCache.Get(height)
+	if ok {
+		return comm.Clone()
+	}
 	pbc := new(cmtproto.Commit)
 	bz, err := bs.db.Get(calcSeenCommitKey(height))
 	if err != nil {
@@ -264,7 +293,8 @@ func (bs *BlockStore) LoadSeenCommit(height int64) *types.Commit {
 	if err != nil {
 		panic(fmt.Errorf("error from proto commit: %w", err))
 	}
-	return commit
+	bs.seenCommitCache.Add(height, commit)
+	return commit.Clone()
 }
 
 // PruneBlocks removes block up to (but not including) a height. It returns number of blocks pruned.
diff --git a/types/block.go b/types/block.go
@@ -801,6 +801,15 @@ func (commit *Commit) GetVote(valIdx int32) *Vote {
 	}
 }
 
+// Clone creates a deep copy of this commit.
+func (commit *Commit) Clone() *Commit {
+	sigs := make([]CommitSig, len(commit.Signatures))
+	copy(sigs, commit.Signatures)
+	commCopy := *commit
+	commCopy.Signatures = sigs
+	return &commCopy
+}
+
 // VoteSignBytes returns the bytes of the Vote corresponding to valIdx for
 // signing.
 //

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	+- [`blockstore`] Use LRU caches in blockstore, significiantly improving consensus gossip routine performance
	`2`	`+ ([\#3003](https://github.com/cometbft/cometbft/issues/3003)`