Skip to content

Commit 7c36a9d

Browse files
craig[bot], Sajjad Rizvi, dt, erikgrinaker
committed
68995: sql: add columns in jobs virtual table for overview in DBConsole r=ajwerner a=sajjadrizvi This commit adds new columns in `crdb_internal.jobs` table, which show the current exponential-backoff state of a job and its execution history. Release justification: This commit adds low-risk updates to new functionality. Jobs subsystem now supports job retries with exponential-backoff. We want to give users more insights about the backoff state of jobs and jobs' lifecycles through additional columns in `crdb_internal.jobs` table. Release note (general change): The functionality to retry failed jobs with exponential-backoff was recently introduced in the system. This commit adds new columns in `crdb_internal.jobs` table, which show the current backoff-state of a job and its execution log. The execution log consists of a sequence of job start and end events and any associated errors that were encountered during each execution of the job. Now users can query internal jobs table to get more insights about jobs through the following columns: (a) `last_run` shows the last execution time of a job, (b) `next_run` shows the next execution time of a job based on exponential-backoff delay, (c) `num_runs` shows the number of times the job has been executed, and (d) `execution_log` provides a set of events that are generated when a job starts and ends its execution. Relates to #68179 69044: storageccl: remove non-ReturnSST ExportRequest r=dt a=dt Release justification: bug fix in new functionality. Release note: none. 69239: roachtest: move roachtest stress CI job instructions to README r=tbg,stevendanna a=erikgrinaker Release justification: non-production code changes Release note: None 69285: roachtest: increase consistency check timeout, and ignore errors r=tbg a=erikgrinaker This bumps the consistency check timeout to 5 minutes. There are indications that a recent libpq upgrade unmasked previously ignored context cancellation errors, caused by the timeout here being too low. 
It also ignores errors during the consistency check, since it is best-effort anyway. Resolves #68883. Release justification: non-production code changes Release note: None Co-authored-by: Sajjad Rizvi <sajjad@cockroachlabs.com> Co-authored-by: David Taylor <tinystatemachine@gmail.com> Co-authored-by: Erik Grinaker <grinaker@cockroachlabs.com>
5 parents ec0fa4e + 3ccdba8 + 67ad54f + a452dbc + 6b89733 commit 7c36a9d

20 files changed

Lines changed: 1110 additions & 532 deletions

File tree

pkg/ccl/backupccl/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ go_library(
8585
"//pkg/sql/roleoption",
8686
"//pkg/sql/rowenc",
8787
"//pkg/sql/rowexec",
88+
"//pkg/sql/sem/builtins",
8889
"//pkg/sql/sem/tree",
8990
"//pkg/sql/sessiondata",
9091
"//pkg/sql/sqlerrors",

pkg/ccl/backupccl/backup_processor.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
3030
"github.com/cockroachdb/cockroach/pkg/sql/rowenc"
3131
"github.com/cockroachdb/cockroach/pkg/sql/rowexec"
32+
"github.com/cockroachdb/cockroach/pkg/sql/sem/builtins"
3233
"github.com/cockroachdb/cockroach/pkg/sql/types"
3334
"github.com/cockroachdb/cockroach/pkg/storage"
3435
"github.com/cockroachdb/cockroach/pkg/util/contextutil"
@@ -631,7 +632,7 @@ func (s *sstSink) flushFile(ctx context.Context) error {
631632
}
632633

633634
func (s *sstSink) open(ctx context.Context) error {
634-
s.outName = storageccl.GenerateUniqueSSTName(s.conf.id)
635+
s.outName = generateUniqueSSTName(s.conf.id)
635636
if s.ctx == nil {
636637
s.ctx, s.cancel = context.WithCancel(ctx)
637638
}
@@ -740,6 +741,12 @@ func (s *sstSink) write(ctx context.Context, resp returnedSST) error {
740741
return nil
741742
}
742743

744+
func generateUniqueSSTName(nodeID base.SQLInstanceID) string {
745+
// The data/ prefix, including a /, is intended to group SSTs in most of the
746+
// common file/bucket browse UIs.
747+
return fmt.Sprintf("data/%d.sst", builtins.GenerateUniqueInt(nodeID))
748+
}
749+
743750
func init() {
744751
rowexec.NewBackupDataProcessor = newBackupDataProcessor
745752
}

pkg/ccl/storageccl/BUILD.bazel

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,9 @@ go_library(
2323
"//pkg/roachpb:with-mocks",
2424
"//pkg/settings",
2525
"//pkg/settings/cluster",
26-
"//pkg/sql/sem/builtins",
2726
"//pkg/storage",
2827
"//pkg/util/hlc",
2928
"//pkg/util/humanizeutil",
30-
"//pkg/util/log",
3129
"//pkg/util/retry",
3230
"//pkg/util/tracing",
3331
"@com_github_cockroachdb_errors//:errors",

pkg/ccl/storageccl/export.go

Lines changed: 14 additions & 122 deletions
Original file line numberDiff line numberDiff line change
@@ -9,23 +9,17 @@
99
package storageccl
1010

1111
import (
12-
"bytes"
1312
"context"
1413
"fmt"
1514

16-
"github.com/cockroachdb/cockroach/pkg/base"
17-
"github.com/cockroachdb/cockroach/pkg/cloud"
1815
"github.com/cockroachdb/cockroach/pkg/keys"
1916
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval"
2017
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval/result"
2118
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
2219
"github.com/cockroachdb/cockroach/pkg/roachpb"
2320
"github.com/cockroachdb/cockroach/pkg/settings"
24-
"github.com/cockroachdb/cockroach/pkg/sql/sem/builtins"
2521
"github.com/cockroachdb/cockroach/pkg/storage"
2622
"github.com/cockroachdb/cockroach/pkg/util/hlc"
27-
"github.com/cockroachdb/cockroach/pkg/util/log"
28-
"github.com/cockroachdb/cockroach/pkg/util/retry"
2923
"github.com/cockroachdb/cockroach/pkg/util/tracing"
3024
"github.com/cockroachdb/errors"
3125
"github.com/gogo/protobuf/types"
@@ -63,8 +57,6 @@ var ExportRequestMaxAllowedFileSizeOverage = settings.RegisterByteSizeSetting(
6357
64<<20, /* 64 MiB */
6458
).WithPublic()
6559

66-
const maxUploadRetries = 5
67-
6860
func init() {
6961
batcheval.RegisterReadOnlyCommand(roachpb.Export, declareKeysExport, evalExport)
7062
}
@@ -101,6 +93,14 @@ func evalExport(
10193
}
10294
evalExportSpan.RecordStructured(&evalExportTrace)
10395

96+
if !args.ReturnSST {
97+
return result.Result{}, errors.New("ReturnSST is required")
98+
}
99+
100+
if args.Encryption != nil {
101+
return result.Result{}, errors.New("returned SSTs cannot be encrypted")
102+
}
103+
104104
// For MVCC_All backups with no start time, they'll only be capturing the
105105
// *revisions* since the gc threshold, so noting that in the reply allows the
106106
// BACKUP to correctly note the supported time bounds for RESTORE AS OF SYSTEM
@@ -109,47 +109,6 @@ func evalExport(
109109
reply.StartTime = cArgs.EvalCtx.GetGCThreshold()
110110
}
111111

112-
makeExternalStorage := !args.ReturnSST || args.Storage != roachpb.ExternalStorage{} ||
113-
(args.StorageByLocalityKV != nil && len(args.StorageByLocalityKV) > 0)
114-
if makeExternalStorage || log.V(1) {
115-
log.Infof(ctx, "export [%s,%s)", args.Key, args.EndKey)
116-
} else {
117-
// Requests that don't write to export storage are expected to be small.
118-
log.Eventf(ctx, "export [%s,%s)", args.Key, args.EndKey)
119-
}
120-
121-
if makeExternalStorage {
122-
if _, ok := roachpb.TenantFromContext(ctx); ok {
123-
if args.Storage.Provider == roachpb.ExternalStorageProvider_userfile {
124-
return result.Result{}, errors.Errorf("requests to userfile on behalf of tenants must be made by the tenant's SQL process")
125-
}
126-
}
127-
}
128-
129-
// To get the store to export to, first try to match the locality of this node
130-
// to the locality KVs in args.StorageByLocalityKV (used for partitioned
131-
// backups). If that map isn't set or there's no match, fall back to
132-
// args.Storage.
133-
var localityKV string
134-
var exportStore cloud.ExternalStorage
135-
if makeExternalStorage {
136-
var storeConf roachpb.ExternalStorage
137-
var err error
138-
foundStoreByLocality := false
139-
if args.StorageByLocalityKV != nil && len(args.StorageByLocalityKV) > 0 {
140-
locality := cArgs.EvalCtx.GetNodeLocality()
141-
localityKV, storeConf, foundStoreByLocality = getMatchingStore(&locality, args.StorageByLocalityKV)
142-
}
143-
if !foundStoreByLocality {
144-
storeConf = args.Storage
145-
}
146-
exportStore, err = cArgs.EvalCtx.GetExternalStorage(ctx, storeConf)
147-
if err != nil {
148-
return result.Result{}, err
149-
}
150-
defer exportStore.Close()
151-
}
152-
153112
var exportAllRevisions bool
154113
switch args.MVCCFilter {
155114
case roachpb.MVCCFilter_Latest:
@@ -181,11 +140,9 @@ func evalExport(
181140
// Only use resume timestamp if splitting mid key is enabled.
182141
resumeKeyTS := hlc.Timestamp{}
183142
if args.SplitMidKey {
184-
if !args.ReturnSST {
185-
return result.Result{}, errors.New("SplitMidKey could only be used with ReturnSST option")
186-
}
187143
resumeKeyTS = args.ResumeKeyTS
188144
}
145+
189146
var curSizeOfExportedSSTs int64
190147
for start := args.Key; start != nil; {
191148
destFile := &storage.MemFile{}
@@ -214,57 +171,16 @@ func evalExport(
214171
span.EndKey = args.EndKey
215172
}
216173
exported := roachpb.ExportResponse_File{
217-
Span: span,
218-
EndKeyTS: resumeTS,
219-
Exported: summary,
220-
LocalityKV: localityKV,
221-
}
222-
223-
returnSST := args.ReturnSST
224-
if args.ReturnSstBelowSize > 0 && len(data) < int(args.ReturnSstBelowSize) {
225-
returnSST = true
226-
}
227-
228-
if returnSST {
229-
exported.SST = data
230-
} else {
231-
if args.Encryption != nil {
232-
data, err = EncryptFile(data, args.Encryption.Key)
233-
if err != nil {
234-
return result.Result{}, err
235-
}
236-
}
237-
238-
exported.Path = GenerateUniqueSSTName(base.SQLInstanceID(cArgs.EvalCtx.NodeID()))
239-
var attemptNum int
240-
if err := retry.WithMaxAttempts(ctx, base.DefaultRetryOptions(), maxUploadRetries, func() error {
241-
attemptNum++
242-
retryTracingEvent := roachpb.RetryTracingEvent{
243-
Operation: fmt.Sprintf("%s.ExportRequest.WriteFile", exportStore.Conf().Provider.String()),
244-
AttemptNumber: int32(attemptNum),
245-
}
246-
// We blindly retry any error here because we expect the caller to have
247-
// verified the target is writable before sending ExportRequests for it.
248-
if err := cloud.WriteFile(ctx, exportStore, exported.Path, bytes.NewReader(data)); err != nil {
249-
log.VEventf(ctx, 1, "failed to put file: %+v", err)
250-
retryTracingEvent.RetryError = fmt.Sprintf("failed to put file: %s", tracing.RedactAndTruncateError(err))
251-
evalExportSpan.RecordStructured(&retryTracingEvent)
252-
return err
253-
}
254-
evalExportSpan.RecordStructured(&retryTracingEvent)
255-
return nil
256-
}); err != nil {
257-
return result.Result{}, err
258-
}
174+
Span: span,
175+
EndKeyTS: resumeTS,
176+
Exported: summary,
177+
SST: data,
259178
}
260179
reply.Files = append(reply.Files, exported)
261180
start = resume
262181
resumeKeyTS = resumeTS
263182

264-
// If we are not returning the SSTs to the processor, there is no need to
265-
// paginate the ExportRequest since the reply size will not grow large
266-
// enough to cause an OOM.
267-
if args.ReturnSST && h.TargetBytes > 0 {
183+
if h.TargetBytes > 0 {
268184
curSizeOfExportedSSTs += summary.DataSize
269185
// There could be a situation where the size of exported SSTs is larger
270186
// than the TargetBytes. In such a scenario, we want to report back
@@ -307,27 +223,3 @@ func evalExport(
307223

308224
return result.Result{}, nil
309225
}
310-
311-
func getMatchingStore(
312-
locality *roachpb.Locality, storageByLocalityKV map[string]*roachpb.ExternalStorage,
313-
) (string, roachpb.ExternalStorage, bool) {
314-
kvs := locality.Tiers
315-
// When matching, more specific KVs in the node locality take precedence
316-
// over less specific ones.
317-
for i := len(kvs) - 1; i >= 0; i-- {
318-
if store, ok := storageByLocalityKV[kvs[i].String()]; ok {
319-
return kvs[i].String(), *store, true
320-
}
321-
}
322-
return "", roachpb.ExternalStorage{}, false
323-
}
324-
325-
// GenerateUniqueSSTName generates a name for a backup SST that will not collide
326-
// with another name generated by this node or another node.
327-
func GenerateUniqueSSTName(nodeID base.SQLInstanceID) string {
328-
// The data/ prefix, including a /, is intended to group SSTs in most of the
329-
// common file/bucket browse UIs.
330-
// TODO(dt): don't reach out into a SQL builtin here; this code lives in KV.
331-
// Create a unique int differently.
332-
return fmt.Sprintf("data/%d.sst", builtins.GenerateUniqueInt(nodeID))
333-
}

pkg/cmd/roachtest/README.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,10 +185,22 @@ The HTTP endpoint (by default `:8080`) is useful to find the "run" numbers for
185185
failing tests (to find the artifacts) and to get a general overview of the
186186
progress of the invocation.
187187

188+
### Stressing a roachtest
189+
188190
A solid foundation for building the binaries and stressing a roachtest is
189191
provided via the [roachstress.sh] script, which can either be used outright or
190192
saved and adjusted. The script can be invoked without parameters from a clean
191193
checkout of the cockroach repository at the revision to be tested. It will
192194
prompt for user input on which test to stress.
193195

194196
[roachstress.sh]: https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/roachstress.sh
197+
198+
Another option is to start a [`Cockroach_Nightlies_RoachtestStress`](https://teamcity.cockroachdb.com/buildConfiguration/Cockroach_Nightlies_RoachtestStress)
199+
CI job, which allows running a bunch of tests without having to keep your
200+
laptop online. The CI job is run as follows:
201+
202+
1. Go to https://teamcity.cockroachdb.com/buildConfiguration/Cockroach_Nightlies_RoachtestStress
203+
2. Click the ellipsis (...) next to the Run button and fill in:
204+
* Changes → Build branch: `<branch>`
205+
* Parameters → `env.TESTS`: `^<test>$`
206+
* Parameters → `env.COUNT`: `<runs>`

pkg/cmd/roachtest/cluster.go

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1266,7 +1266,7 @@ func (c *clusterImpl) CheckReplicaDivergenceOnDB(
12661266
//
12671267
// We've seen the consistency checks hang indefinitely in some cases.
12681268
rows, err := db.QueryContext(ctx, `
1269-
SET statement_timeout = '3m';
1269+
SET statement_timeout = '5m';
12701270
SELECT t.range_id, t.start_key_pretty, t.status, t.detail
12711271
FROM
12721272
crdb_internal.check_consistency(true, '', '') as t
@@ -1278,20 +1278,22 @@ WHERE t.status NOT IN ('RANGE_CONSISTENT', 'RANGE_INDETERMINATE')`)
12781278
l.Printf("consistency check failed with %v; ignoring", err)
12791279
return nil
12801280
}
1281+
defer rows.Close()
12811282
var finalErr error
12821283
for rows.Next() {
12831284
var rangeID int32
12841285
var prettyKey, status, detail string
12851286
if scanErr := rows.Scan(&rangeID, &prettyKey, &status, &detail); scanErr != nil {
1286-
return scanErr
1287+
l.Printf("consistency check failed with %v; ignoring", scanErr)
1288+
return nil
12871289
}
12881290
finalErr = errors.CombineErrors(finalErr,
12891291
errors.Newf("r%d (%s) is inconsistent: %s %s\n", rangeID, prettyKey, status, detail))
12901292
}
12911293
if err := rows.Err(); err != nil {
1292-
finalErr = errors.CombineErrors(finalErr, err)
1294+
l.Printf("consistency check failed with %v; ignoring", err)
1295+
return nil
12931296
}
1294-
12951297
return finalErr
12961298
}
12971299

@@ -1330,7 +1332,7 @@ func (c *clusterImpl) FailOnReplicaDivergence(ctx context.Context, t test.Test)
13301332
defer db.Close()
13311333

13321334
if err := contextutil.RunWithTimeout(
1333-
ctx, "consistency check", time.Minute,
1335+
ctx, "consistency check", 5*time.Minute,
13341336
func(ctx context.Context) error {
13351337
return c.CheckReplicaDivergenceOnDB(ctx, t.L(), db)
13361338
},

pkg/cmd/roachtest/test_runner.go

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -946,18 +946,8 @@ func (r *testRunner) maybePostGithubIssue(
946946
ReproductionCommand: func(renderer *issues.Renderer) {
947947
issues.ReproductionAsLink(
948948
"roachtest README",
949-
"https://github.com/cockroachdb/cockroach/tree/master/pkg/cmd/roachtest",
949+
"https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/README.md",
950950
)(renderer)
951-
issues.ReproductionAsLink(
952-
"CI job to stress roachtests",
953-
"https://teamcity.cockroachdb.com/buildConfiguration/Cockroach_Nightlies_RoachtestStress",
954-
)(renderer)
955-
renderer.P(func() {
956-
renderer.Escaped("For the CI stress job, click the ellipsis (...) next to the Run button and fill in:\n")
957-
renderer.Escaped(fmt.Sprintf("* Changes / Build branch: %s\n", branch))
958-
renderer.Escaped(fmt.Sprintf("* Parameters / `env.TESTS`: `^%s$`\n", t.Name()))
959-
renderer.Escaped("* Parameters / `env.COUNT`: <number of runs>\n")
960-
})
961951
},
962952
}
963953
if err := issues.Post(

0 commit comments

Comments
 (0)