Skip to content

Commit ae0ad5c

Browse files
author
Miral Gadani
committed
roachtest: This change introduces a transparent to caller, default retry when
encountering an error of 255 (SSH). The retry could be exposed to callers fairly simply, but this PR does not introduce that. Today there appears the concept of a "roachprod" and a "command error", the former of which denotes an issue in roachprod handling code, the latter representing an error executing a command over SSH. This PR preserves this behaviour and introduces an updated function signature from `f(i int) ([]byte, error) to f(i int) (*RunResultDetails, error). RunResultDetails has been expanded to capture information about the execution of a command, such as stderr/our, exit code, error, attempt number. Any roachprod error will result in an error being returned, and set in RunResultDetails.Err. Any command error would be only set in RunResultDetails.Err with the returned error being nil. This allows callers to differentiate between a roachprod and a command error, which existing code depends on. Retry handling code can use a function's returned RunResultDetails to determine whether a retry should take place. The default retry handling occurs on `RunResultDetails.RemoteExitStatus == 255` Release note: None Epic: CRDB-21386
1 parent 15369db commit ae0ad5c

19 files changed

Lines changed: 571 additions & 455 deletions

pkg/cmd/roachtest/tests/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,7 @@ go_library(
189189
"//pkg/roachpb",
190190
"//pkg/roachprod",
191191
"//pkg/roachprod/config",
192+
"//pkg/roachprod/errors",
192193
"//pkg/roachprod/install",
193194
"//pkg/roachprod/logger",
194195
"//pkg/roachprod/prometheus",

pkg/cmd/roachtest/tests/activerecord.go

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
2222
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
2323
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
24+
rperrors "github.com/cockroachdb/cockroach/pkg/roachprod/errors"
2425
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
2526
"github.com/cockroachdb/errors"
2627
)
@@ -181,16 +182,10 @@ func registerActiveRecord(r registry.Registry) {
181182
`sudo RUBYOPT="-W0" TESTOPTS="-v" bundle exec rake test`,
182183
)
183184

184-
// Expected to fail but we should still scan the error to check if
185-
// there's an SSH/roachprod error.
186-
if err != nil {
187-
// install.NonZeroExitCode includes unrelated to SSH errors ("255")
188-
// or roachprod errors, so we call t.Fatal if the error is not an
189-
// install.NonZeroExitCode error
190-
commandError := (*install.NonZeroExitCode)(nil)
191-
if !errors.As(err, &commandError) {
192-
t.Fatal(err)
193-
}
185+
// Fatal for a roachprod or SSH error. A roachprod error is when result.Err==nil.
186+
// Proceed for any other (command) errors
187+
if err != nil && (result.Err == nil || errors.Is(err, rperrors.ErrSSH255)) {
188+
t.Fatal(err)
194189
}
195190

196191
// Result error contains stdout, stderr, and any error content returned by exec package.

pkg/cmd/roachtest/tests/django.go

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
2121
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
2222
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
23+
rperrors "github.com/cockroachdb/cockroach/pkg/roachprod/errors"
2324
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
2425
"github.com/cockroachdb/errors"
2526
)
@@ -186,16 +187,10 @@ func registerDjango(r registry.Registry) {
186187
t.Status("Running django test app ", testName)
187188
result, err := c.RunWithDetailsSingleNode(ctx, t.L(), node, fmt.Sprintf(djangoRunTestCmd, testName))
188189

189-
// Expected to fail but we should still scan the error to check if
190-
// there's an SSH/roachprod error.
191-
if err != nil {
192-
// install.NonZeroExitCode includes unrelated to SSH errors ("255")
193-
// or roachprod errors, so we call t.Fatal if the error is not an
194-
// install.NonZeroExitCode error
195-
commandError := (*install.NonZeroExitCode)(nil)
196-
if !errors.As(err, &commandError) {
197-
t.Fatal(err)
198-
}
190+
// Fatal for a roachprod or SSH error. A roachprod error is when result.Err==nil.
191+
// Proceed for any other (command) errors
192+
if err != nil && (result.Err == nil || errors.Is(err, rperrors.ErrSSH255)) {
193+
t.Fatal(err)
199194
}
200195

201196
rawResults := []byte(result.Stdout + result.Stderr)

pkg/cmd/roachtest/tests/gopg.go

Lines changed: 9 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
2424
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
2525
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
26+
rperrors "github.com/cockroachdb/cockroach/pkg/roachprod/errors"
2627
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
2728
"github.com/cockroachdb/errors"
2829
)
@@ -115,16 +116,10 @@ func registerGopg(r registry.Registry) {
115116
destPath, removeColorCodes, resultsFilePath),
116117
)
117118

118-
// Expected to fail but we should still scan the error to check if
119-
// there's an SSH/roachprod error.
120-
if err != nil {
121-
// install.NonZeroExitCode includes unrelated to SSH errors ("255")
122-
// or roachprod errors, so we call t.Fatal if the error is not an
123-
// install.NonZeroExitCode error
124-
commandError := (*install.NonZeroExitCode)(nil)
125-
if !errors.As(err, &commandError) {
126-
t.Fatal(err)
127-
}
119+
// Fatal for a roachprod or SSH error. A roachprod error is when result.Err==nil.
120+
// Proceed for any other (command) errors
121+
if err != nil && (result.Err == nil || errors.Is(err, rperrors.ErrSSH255)) {
122+
t.Fatal(err)
128123
}
129124

130125
rawResults := []byte(result.Stdout + result.Stderr)
@@ -152,16 +147,10 @@ func registerGopg(r registry.Registry) {
152147
destPath, goPath, resultsFilePath, goPath),
153148
)
154149

155-
// Expected to fail but we should still scan the error to check if
156-
// there's an SSH/roachprod error.
157-
if err != nil {
158-
// install.NonZeroExitCode includes unrelated to SSH errors ("255")
159-
// or roachprod errors, so we call t.Fatal if the error is not an
160-
// install.NonZeroExitCode error
161-
commandError := (*install.NonZeroExitCode)(nil)
162-
if !errors.As(err, &commandError) {
163-
t.Fatal(err)
164-
}
150+
// Fatal for a roachprod or SSH error. A roachprod error is when result.Err==nil.
151+
// Proceed for any other (command) errors
152+
if err != nil && (result.Err == nil || errors.Is(err, rperrors.ErrSSH255)) {
153+
t.Fatal(err)
165154
}
166155

167156
xmlResults := []byte(result.Stdout + result.Stderr)

pkg/cmd/roachtest/tests/jepsen.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ func initJepsen(ctx context.Context, t test.Test, c cluster.Cluster, j jepsenCon
165165
ctx, t.L(), controller, "sh", "-c",
166166
`"sudo DEBIAN_FRONTEND=noninteractive apt-get -qqy install openjdk-8-jre openjdk-8-jre-headless libjna-java gnuplot > /dev/null 2>&1"`,
167167
); err != nil {
168-
if result.RemoteExitStatus == "100" {
168+
if result.RemoteExitStatus == 100 {
169169
t.Skip("apt-get failure (#31944)", result.Stdout+result.Stderr)
170170
}
171171
t.Fatal(err)

pkg/cmd/roachtest/tests/mixed_version_decl_schemachange_compat.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,14 +84,14 @@ func validateCorpusFile(
8484
// Detect validation failures in standard output first, and dump those out.
8585
failureRegex := regexp.MustCompile(`failed to validate.*`)
8686
if matches := failureRegex.FindAllString(details.Stdout, -1); len(matches) > 0 {
87-
t.Fatalf("Validation of corpus has failed (exit status %s): \n%s",
87+
t.Fatalf("Validation of corpus has failed (exit status %d): \n%s",
8888
details.RemoteExitStatus,
8989
strings.Join(matches, "\n"))
9090
}
9191

9292
// If no error is logged dump out both stdout and std error.
93-
if details.RemoteExitStatus != "0" {
94-
t.Fatalf("Validation command failed with exist status %s, output:\n %s\n%s\n",
93+
if details.RemoteExitStatus != 0 {
94+
t.Fatalf("Validation command failed with exit status %d, output:\n %s\n%s\n",
9595
details.RemoteExitStatus,
9696
details.Stdout,
9797
details.Stderr,

pkg/cmd/roachtest/tests/nodejs_postgres.go

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
2020
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
2121
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
22+
rperrors "github.com/cockroachdb/cockroach/pkg/roachprod/errors"
2223
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
2324
"github.com/cockroachdb/errors"
2425
"github.com/stretchr/testify/require"
@@ -141,16 +142,10 @@ PGSSLCERT=$HOME/certs/client.%s.crt PGSSLKEY=$HOME/certs/client.%s.key PGSSLROOT
141142
),
142143
)
143144

144-
// Expected to fail but we should still scan the error to check if
145-
// there's an SSH/roachprod error.
146-
if err != nil {
147-
// install.NonZeroExitCode includes unrelated to SSH errors ("255")
148-
// or roachprod errors, so we call t.Fatal if the error is not an
149-
// install.NonZeroExitCode error
150-
commandError := (*install.NonZeroExitCode)(nil)
151-
if !errors.As(err, &commandError) {
152-
t.Fatal(err)
153-
}
145+
// Fatal for a roachprod or SSH error. A roachprod error is when result.Err==nil.
146+
// Proceed for any other (command) errors
147+
if err != nil && (result.Err == nil || errors.Is(err, rperrors.ErrSSH255)) {
148+
t.Fatal(err)
154149
}
155150

156151
rawResultsStr := result.Stdout + result.Stderr

pkg/cmd/roachtest/tests/pgx.go

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
2020
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
2121
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
22+
rperrors "github.com/cockroachdb/cockroach/pkg/roachprod/errors"
2223
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
2324
"github.com/cockroachdb/errors"
2425
)
@@ -121,16 +122,10 @@ func registerPgx(r registry.Registry) {
121122
"`go env GOPATH`/bin/go-junit-report",
122123
)
123124

124-
// Expected to fail but we should still scan the error to check if
125-
// there's an SSH/roachprod error.
126-
if err != nil {
127-
// install.NonZeroExitCode includes unrelated to SSH errors ("255")
128-
// or roachprod errors, so we call t.Fatal if the error is not an
129-
// install.NonZeroExitCode error
130-
commandError := (*install.NonZeroExitCode)(nil)
131-
if !errors.As(err, &commandError) {
132-
t.Fatal(err)
133-
}
125+
// Fatal for a roachprod or SSH error. A roachprod error is when result.Err==nil.
126+
// Proceed for any other (command) errors
127+
if err != nil && (result.Err == nil || errors.Is(err, rperrors.ErrSSH255)) {
128+
t.Fatal(err)
134129
}
135130

136131
// Result error contains stdout, stderr, and any error content returned by exec package.

pkg/cmd/roachtest/tests/psycopg.go

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
2020
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
2121
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
22+
rperrors "github.com/cockroachdb/cockroach/pkg/roachprod/errors"
2223
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
2324
"github.com/cockroachdb/errors"
2425
)
@@ -124,16 +125,10 @@ func registerPsycopg(r registry.Registry) {
124125
make check PYTHON_VERSION=3`,
125126
)
126127

127-
// Expected to fail but we should still scan the error to check if
128-
// there's an SSH/roachprod error.
129-
if err != nil {
130-
// install.NonZeroExitCode includes unrelated to SSH errors ("255")
131-
// or roachprod errors, so we call t.Fatal if the error is not an
132-
// install.NonZeroExitCode error
133-
commandError := (*install.NonZeroExitCode)(nil)
134-
if !errors.As(err, &commandError) {
135-
t.Fatal(err)
136-
}
128+
// Fatal for a roachprod or SSH error. A roachprod error is when result.Err==nil.
129+
// Proceed for any other (command) errors
130+
if err != nil && (result.Err == nil || errors.Is(err, rperrors.ErrSSH255)) {
131+
t.Fatal(err)
137132
}
138133

139134
// Result error contains stdout, stderr, and any error content returned by exec package.

pkg/cmd/roachtest/tests/ruby_pg.go

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
2323
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
2424
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
25+
rperrors "github.com/cockroachdb/cockroach/pkg/roachprod/errors"
2526
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
2627
"github.com/cockroachdb/errors"
2728
"github.com/stretchr/testify/require"
@@ -152,16 +153,10 @@ func registerRubyPG(r registry.Registry) {
152153
`cd /mnt/data1/ruby-pg/ && bundle exec rake compile test`,
153154
)
154155

155-
// Expected to fail but we should still scan the error to check if
156-
// there's an SSH/roachprod error.
157-
if err != nil {
158-
// install.NonZeroExitCode includes unrelated to SSH errors ("255")
159-
// or roachprod errors, so we call t.Fatal if the error is not an
160-
// install.NonZeroExitCode error
161-
commandError := (*install.NonZeroExitCode)(nil)
162-
if !errors.As(err, &commandError) {
163-
t.Fatal(err)
164-
}
156+
// Fatal for a roachprod or SSH error. A roachprod error is when result.Err==nil.
157+
// Proceed for any other (command) errors
158+
if err != nil && (result.Err == nil || errors.Is(err, rperrors.ErrSSH255)) {
159+
t.Fatal(err)
165160
}
166161

167162
rawResults := []byte(result.Stdout + result.Stderr)

0 commit comments

Comments
 (0)