Skip to content

Commit 91af030

Browse files
author
j82w
committed
sql: limit statistics discard log message
Problem: The discard log message occurs for every transaction end after the limit is hit. This causes the log to be flooded with discard messages. This is not useful for users and can cause issues with telemetry pipelines. Solution: The discard message will only be logged once per minute. The log rate is controlled by a cluster setting. This allows the message to be set to a very large message if this expected behavior for a cluster. Refactored: The SQLStats creates and hold the reference to the counts. Then each container which is per an app name is passed the counts by reference. It's not obvious that the counts are shared between the containers. The code was refactored to make a single object to hold the counts and pass all the related content together. This makes the code easier to read and expand in the future if other values need to be added. Fixes: #110454 Release note (sql change): The discard log message is now limited to once per minute by default. The message was also changed to have both the number of transactions and the number of statements that were discarded.
1 parent 0b2ced7 commit 91af030

8 files changed

Lines changed: 285 additions & 111 deletions

File tree

pkg/sql/sqlstats/persistedsqlstats/flush_test.go

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
gosql "database/sql"
1616
"fmt"
1717
"math"
18+
"regexp"
1819
"strconv"
1920
"testing"
2021
"time"
@@ -251,6 +252,68 @@ func TestSQLStatsInitialDelay(t *testing.T) {
251252
"expected latest nextFlushAt to be %s, but found %s", maxNextRunAt, initialNextFlushAt)
252253
}
253254

255+
func TestSQLStatsLogDiscardMessage(t *testing.T) {
256+
defer leaktest.AfterTest(t)()
257+
defer log.Scope(t).Close(t)
258+
259+
ctx := context.Background()
260+
fakeTime := stubTime{
261+
aggInterval: time.Hour,
262+
}
263+
fakeTime.setTime(timeutil.Now())
264+
265+
var params base.TestServerArgs
266+
params.Knobs.SQLStatsKnobs = &sqlstats.TestingKnobs{
267+
StubTimeNow: fakeTime.Now,
268+
}
269+
srv, conn, _ := serverutils.StartServer(t, params)
270+
defer srv.Stopper().Stop(ctx)
271+
272+
sqlConn := sqlutils.MakeSQLRunner(conn)
273+
274+
sqlConn.Exec(t, "SET CLUSTER SETTING sql.stats.flush.minimum_interval = '10m'")
275+
sqlConn.Exec(t, fmt.Sprintf("SET CLUSTER SETTING sql.metrics.max_mem_stmt_fingerprints=%d", 8))
276+
277+
for i := 0; i < 20; i++ {
278+
appName := fmt.Sprintf("logDiscardTestApp%d", i)
279+
sqlConn.Exec(t, "SET application_name = $1", appName)
280+
sqlConn.Exec(t, "SELECT 1")
281+
}
282+
283+
log.FlushFiles()
284+
285+
entries, err := log.FetchEntriesFromFiles(
286+
0,
287+
math.MaxInt64,
288+
10000,
289+
regexp.MustCompile(`statistics discarded due to memory limit. transaction discard count:`),
290+
log.WithFlattenedSensitiveData,
291+
)
292+
require.NoError(t, err)
293+
require.Equal(t, 1, len(entries), "there should only be 1 log for the initial execution because the test should take less than 1 minute to execute the 20 commands. cnt: %v", entries)
294+
295+
// lower the time frame to verify log still occurs after the initial one
296+
sqlConn.Exec(t, "SET CLUSTER SETTING sql.metrics.discarded_stats_log.interval='0.00001ms'")
297+
298+
for i := 0; i < 20; i++ {
299+
appName := fmt.Sprintf("logDiscardTestApp2%d", i)
300+
sqlConn.Exec(t, "SET application_name = $1", appName)
301+
sqlConn.Exec(t, "SELECT 1")
302+
}
303+
304+
log.FlushFiles()
305+
306+
entries, err = log.FetchEntriesFromFiles(
307+
0,
308+
math.MaxInt64,
309+
10000,
310+
regexp.MustCompile(`statistics discarded due to memory limit. transaction discard count:`),
311+
log.WithFlattenedSensitiveData,
312+
)
313+
require.NoError(t, err)
314+
require.GreaterOrEqual(t, len(entries), 1, "there should only be 1 log for the initial execution because the test should take less than 1 minute to execute the 20 commands. cnt: %v", entries)
315+
}
316+
254317
func TestSQLStatsMinimumFlushInterval(t *testing.T) {
255318
defer leaktest.AfterTest(t)()
256319
defer log.Scope(t).Close(t)

pkg/sql/sqlstats/sslocal/sql_stats.go

Lines changed: 13 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ package sslocal
1313
import (
1414
"context"
1515
"math"
16-
"sync/atomic"
1716
"time"
1817

1918
"github.com/cockroachdb/cockroach/pkg/settings"
@@ -31,14 +30,6 @@ import (
3130
type SQLStats struct {
3231
st *cluster.Settings
3332

34-
// uniqueStmtFingerprintLimit is the limit on number of unique statement
35-
// fingerprints we can store in memory.
36-
uniqueStmtFingerprintLimit *settings.IntSetting
37-
38-
// uniqueTxnFingerprintLimit is the limit on number of unique transaction
39-
// fingerprints we can store in memory.
40-
uniqueTxnFingerprintLimit *settings.IntSetting
41-
4233
mu struct {
4334
syncutil.Mutex
4435

@@ -50,15 +41,8 @@ type SQLStats struct {
5041
apps map[string]*ssmemstorage.Container
5142
}
5243

53-
atomic struct {
54-
// uniqueStmtFingerprintCount is the number of unique statement fingerprints
55-
// we are storing in memory.
56-
uniqueStmtFingerprintCount int64
57-
58-
// uniqueTxnFingerprintCount is the number of unique transaction fingerprints
59-
// we are storing in memory.
60-
uniqueTxnFingerprintCount int64
61-
}
44+
// Server level counter
45+
atomic *ssmemstorage.SQLStatsAtomicCounters
6246

6347
// flushTarget is a Sink that, when the SQLStats resets at the end of its
6448
// reset interval, the SQLStats will dump all of the stats into if it is not
@@ -93,14 +77,16 @@ func newSQLStats(
9377
st,
9478
)
9579
s := &SQLStats{
96-
st: st,
97-
uniqueStmtFingerprintLimit: uniqueStmtFingerprintLimit,
98-
uniqueTxnFingerprintLimit: uniqueTxnFingerprintLimit,
99-
flushTarget: flushTarget,
100-
knobs: knobs,
101-
insights: insightsWriter,
102-
latencyInformation: latencyInformation,
80+
st: st,
81+
flushTarget: flushTarget,
82+
knobs: knobs,
83+
insights: insightsWriter,
84+
latencyInformation: latencyInformation,
10385
}
86+
s.atomic = ssmemstorage.NewSQLStatsAtomicCounters(
87+
st,
88+
uniqueStmtFingerprintLimit,
89+
uniqueTxnFingerprintLimit)
10490
s.mu.apps = make(map[string]*ssmemstorage.Container)
10591
s.mu.mon = monitor
10692
s.mu.mon.StartNoReserved(context.Background(), parentMon)
@@ -110,7 +96,7 @@ func newSQLStats(
11096
// GetTotalFingerprintCount returns total number of unique statement and
11197
// transaction fingerprints stored in the current SQLStats.
11298
func (s *SQLStats) GetTotalFingerprintCount() int64 {
113-
return atomic.LoadInt64(&s.atomic.uniqueStmtFingerprintCount) + atomic.LoadInt64(&s.atomic.uniqueTxnFingerprintCount)
99+
return s.atomic.GetTotalFingerprintCount()
114100
}
115101

116102
// GetTotalFingerprintBytes returns the total amount of bytes currently
@@ -130,10 +116,7 @@ func (s *SQLStats) getStatsForApplication(appName string) *ssmemstorage.Containe
130116
}
131117
a := ssmemstorage.New(
132118
s.st,
133-
s.uniqueStmtFingerprintLimit,
134-
s.uniqueTxnFingerprintLimit,
135-
&s.atomic.uniqueStmtFingerprintCount,
136-
&s.atomic.uniqueTxnFingerprintCount,
119+
s.atomic,
137120
s.mu.mon,
138121
appName,
139122
s.knobs,

pkg/sql/sqlstats/sslocal/sslocal_provider.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -107,10 +107,7 @@ func (s *SQLStats) GetApplicationStats(appName string, internal bool) sqlstats.A
107107
}
108108
a := ssmemstorage.New(
109109
s.st,
110-
s.uniqueStmtFingerprintLimit,
111-
s.uniqueTxnFingerprintLimit,
112-
&s.atomic.uniqueStmtFingerprintCount,
113-
&s.atomic.uniqueTxnFingerprintCount,
110+
s.atomic,
114111
s.mu.mon,
115112
appName,
116113
s.knobs,

pkg/sql/sqlstats/sslocal/sslocal_stats_collector.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ import (
1717
"github.com/cockroachdb/cockroach/pkg/sql/appstatspb"
1818
"github.com/cockroachdb/cockroach/pkg/sql/sessionphase"
1919
"github.com/cockroachdb/cockroach/pkg/sql/sqlstats"
20-
"github.com/cockroachdb/cockroach/pkg/util/log"
2120
)
2221

2322
// StatsCollector is used to collect statement and transaction statistics
@@ -106,8 +105,9 @@ func (s *StatsCollector) EndTransaction(
106105
s.ApplicationStats,
107106
)
108107

108+
// Avoid taking locks if no stats are discarded.
109109
if discardedStats > 0 {
110-
log.Warningf(ctx, "%d statement statistics discarded due to memory limit", discardedStats)
110+
s.flushTarget.MaybeLogDiscardMessage(ctx)
111111
}
112112

113113
s.ApplicationStats.Free(ctx)

pkg/sql/sqlstats/ssmemstorage/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")
33
go_library(
44
name = "ssmemstorage",
55
srcs = [
6+
"ss_mem_counter.go",
67
"ss_mem_iterator.go",
78
"ss_mem_storage.go",
89
"ss_mem_writer.go",
Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
// Copyright 2023 The Cockroach Authors.
2+
//
3+
// Use of this software is governed by the Business Source License
4+
// included in the file licenses/BSL.txt.
5+
//
6+
// As of the Change Date specified in that file, in accordance with
7+
// the Business Source License, use of this software will be governed
8+
// by the Apache License, Version 2.0, included in the file
9+
// licenses/APL.txt.
10+
11+
// Package sqlstats is a subsystem that is responsible for tracking the
12+
// statistics of statements and transactions.
13+
14+
package ssmemstorage
15+
16+
import (
17+
"context"
18+
"sync/atomic"
19+
"time"
20+
21+
"github.com/cockroachdb/cockroach/pkg/settings"
22+
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
23+
"github.com/cockroachdb/cockroach/pkg/util/log"
24+
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
25+
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
26+
)
27+
28+
type SQLStatsAtomicCounters struct {
29+
st *cluster.Settings
30+
31+
// uniqueStmtFingerprintLimit is the limit on number of unique statement
32+
// fingerprints we can store in memory.
33+
UniqueStmtFingerprintLimit *settings.IntSetting
34+
35+
// uniqueTxnFingerprintLimit is the limit on number of unique transaction
36+
// fingerprints we can store in memory.
37+
UniqueTxnFingerprintLimit *settings.IntSetting
38+
39+
// uniqueStmtFingerprintCount is the number of unique statement fingerprints
40+
// we are storing in memory.
41+
uniqueStmtFingerprintCount int64
42+
43+
// uniqueTxnFingerprintCount is the number of unique transaction fingerprints
44+
// we are storing in memory.
45+
uniqueTxnFingerprintCount int64
46+
47+
// discardUniqueStmtFingerprintCount is the number of unique statement
48+
// fingerprints that are discard because of memory limitations.
49+
discardUniqueStmtFingerprintCount int64
50+
51+
// discardUniqueTxnFingerprintCount is the number of unique transaction
52+
// fingerprints that are discard because of memory limitations.
53+
discardUniqueTxnFingerprintCount int64
54+
55+
mu struct {
56+
syncutil.Mutex
57+
58+
// lastDiscardLogMessageSent is the last time a log message was sent for
59+
// statistics being discarded because of memory pressure.
60+
lastDiscardLogMessageSent time.Time
61+
}
62+
}
63+
64+
// DiscardedStatsLogInterval specifies the interval between log emissions for discarded
65+
// statement and transaction statistics due to reaching the SQL statistics memory limit.
66+
var DiscardedStatsLogInterval = settings.RegisterDurationSetting(
67+
settings.TenantWritable,
68+
"sql.metrics.discarded_stats_log.interval",
69+
"interval between log emissions for discarded statistics due to SQL statistics memory limit",
70+
1*time.Minute,
71+
settings.NonNegativeDuration,
72+
settings.WithVisibility(settings.Reserved))
73+
74+
func NewSQLStatsAtomicCounters(
75+
st *cluster.Settings,
76+
uniqueStmtFingerprintLimit *settings.IntSetting,
77+
uniqueTxnFingerprintLimit *settings.IntSetting,
78+
) *SQLStatsAtomicCounters {
79+
return &SQLStatsAtomicCounters{
80+
st: st,
81+
UniqueStmtFingerprintLimit: uniqueStmtFingerprintLimit,
82+
UniqueTxnFingerprintLimit: uniqueTxnFingerprintLimit,
83+
}
84+
}
85+
86+
// maybeLogDiscardMessage logs a warning if statement or transaction
87+
// fingerprints were discarded because of memory limits and enough time passed
88+
// since the last time the warning was logged. This is necessary to avoid
89+
// flooding the log with warnings once the limit is hit.
90+
func (s *SQLStatsAtomicCounters) maybeLogDiscardMessage(ctx context.Context) {
91+
discardSmtCnt := atomic.LoadInt64(&s.discardUniqueStmtFingerprintCount)
92+
discardTxnCnt := atomic.LoadInt64(&s.discardUniqueTxnFingerprintCount)
93+
if discardSmtCnt == 0 && discardTxnCnt == 0 {
94+
return
95+
}
96+
97+
// Get the config values before the lock to reduce time in the lock.
98+
discardLogInterval := DiscardedStatsLogInterval.Get(&s.st.SV)
99+
stmtLimit := s.UniqueStmtFingerprintLimit.Get(&s.st.SV)
100+
txnLimit := s.UniqueTxnFingerprintLimit.Get(&s.st.SV)
101+
s.mu.Lock()
102+
defer s.mu.Unlock()
103+
timeNow := timeutil.Now()
104+
105+
// Not enough time has passed since the last log message was sent.
106+
if timeNow.Sub(s.mu.lastDiscardLogMessageSent) < discardLogInterval {
107+
return
108+
}
109+
110+
// The discard counts might be slightly off because it's possible that the
111+
// count changed after the initial load and before the log message is sent.
112+
// The count being slightly off won't impact users looking at the message. It
113+
// also avoids holding a lock on the counts which would block requests until
114+
// the log is sent.
115+
log.Warningf(ctx, "statistics discarded due to memory limit. transaction discard count: %d with limit: %d, statement discard count: %d with limit: %d, logged at interval: %s, last logged: %s",
116+
discardTxnCnt, stmtLimit, discardSmtCnt, txnLimit, discardLogInterval, s.mu.lastDiscardLogMessageSent)
117+
s.mu.lastDiscardLogMessageSent = timeNow
118+
119+
// Reset the discard count back to 0 since the value was logged
120+
atomic.StoreInt64(&s.discardUniqueStmtFingerprintCount, int64(0))
121+
atomic.StoreInt64(&s.discardUniqueTxnFingerprintCount, int64(0))
122+
}
123+
124+
// tryAddStmtFingerprint attempts to add 1 to the server level count for
125+
// statement level fingerprints and returns false if it is being throttled.
126+
func (s *SQLStatsAtomicCounters) tryAddStmtFingerprint() (ok bool) {
127+
limit := s.UniqueStmtFingerprintLimit.Get(&s.st.SV)
128+
129+
// We check if we have reached the limit of unique fingerprints we can
130+
// store.
131+
incrementedFingerprintCount :=
132+
atomic.AddInt64(&s.uniqueStmtFingerprintCount, int64(1) /* delts */)
133+
134+
// Abort if we have exceeded limit of unique statement fingerprints.
135+
if incrementedFingerprintCount < limit {
136+
return true
137+
}
138+
139+
atomic.AddInt64(&s.discardUniqueStmtFingerprintCount, int64(1))
140+
atomic.AddInt64(&s.uniqueStmtFingerprintCount, -int64(1) /* delts */)
141+
return false
142+
}
143+
144+
// tryAddTxnFingerprint attempts to add 1 to the server level count for
145+
// transaction level fingerprints and returns false if it is being throttled.
146+
func (s *SQLStatsAtomicCounters) tryAddTxnFingerprint() (ok bool) {
147+
limit := s.UniqueTxnFingerprintLimit.Get(&s.st.SV)
148+
149+
// We check if we have reached the limit of unique fingerprints we can
150+
// store.
151+
incrementedFingerprintCount :=
152+
atomic.AddInt64(&s.uniqueTxnFingerprintCount, int64(1) /* delts */)
153+
154+
if incrementedFingerprintCount < limit {
155+
return true
156+
}
157+
158+
atomic.AddInt64(&s.discardUniqueTxnFingerprintCount, int64(1))
159+
atomic.AddInt64(&s.uniqueTxnFingerprintCount, -int64(1) /* delts */)
160+
return false
161+
}
162+
163+
// freeByCnt decrements the statement and transaction count by the value
164+
// passed in. This is used in scenarios where an entire container which is
165+
// per an app name is being cleaned up.
166+
func (s *SQLStatsAtomicCounters) freeByCnt(
167+
uniqueStmtFingerprintCount, uniqueTxnFingerprintCount int64,
168+
) {
169+
atomic.AddInt64(&s.uniqueStmtFingerprintCount, uniqueStmtFingerprintCount)
170+
atomic.AddInt64(&s.uniqueTxnFingerprintCount, uniqueTxnFingerprintCount)
171+
}
172+
173+
// GetTotalFingerprintCount returns total number of unique statement and
174+
// transaction fingerprints stored in the current SQLStats.
175+
func (s *SQLStatsAtomicCounters) GetTotalFingerprintCount() int64 {
176+
return atomic.LoadInt64(&s.uniqueStmtFingerprintCount) + atomic.LoadInt64(&s.uniqueTxnFingerprintCount)
177+
}

0 commit comments

Comments
 (0)