Skip to content

Commit 1e07b3a

Browse files
authored
Make failed-join logging deterministic (#143808)
No need for nanosecond precision here, and this is the only source of nondeterminism in the log output of `CoordinatorTests`. With this commit we move to retrieving the current time from `ThreadPool`, making the test outputs completely deterministic.
1 parent fa5a460 commit 1e07b3a

2 files changed

Lines changed: 40 additions & 15 deletions

File tree

server/src/main/java/org/elasticsearch/cluster/coordination/JoinHelper.java

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -207,18 +207,19 @@ static class FailedJoinAttempt {
207207
private final DiscoveryNode destination;
208208
private final JoinRequest joinRequest;
209209
private final ElasticsearchException exception;
210-
private final long timestamp;
210+
private final long attemptTimeMillis;
211211

212212
/**
213-
* @param destination the master node targeted by the join request.
214-
* @param joinRequest the join request that was sent to the perceived master node.
215-
* @param exception the error response received in reply to the join request attempt.
213+
* @param destination the master node targeted by the join request.
214+
* @param joinRequest the join request that was sent to the perceived master node.
215+
* @param exception the error response received in reply to the join request attempt.
216+
* @param attemptTimeMillis the (relative milliseconds) time at which the failed attempt occurred.
216217
*/
217-
FailedJoinAttempt(DiscoveryNode destination, JoinRequest joinRequest, ElasticsearchException exception) {
218+
FailedJoinAttempt(DiscoveryNode destination, JoinRequest joinRequest, ElasticsearchException exception, long attemptTimeMillis) {
218219
this.destination = destination;
219220
this.joinRequest = joinRequest;
220221
this.exception = exception;
221-
this.timestamp = System.nanoTime();
222+
this.attemptTimeMillis = attemptTimeMillis;
222223
}
223224

224225
/**
@@ -243,15 +244,15 @@ static Level getLogLevel(ElasticsearchException e) {
243244
return Level.INFO;
244245
}
245246

246-
void logWarnWithTimestamp() {
247+
void logWarnWithTimestamp(LongSupplier relativeTimeMillisSupplier) {
247248
logger.warn(
248249
() -> format(
249250
"last failed join attempt was %s ago, failed to join %s with %s",
250-
// 'timestamp' is when this error exception was received by the local node. If the time that has passed since the error
251-
// was originally received is quite large, it could indicate that this is a stale error exception from some prior
252-
// out-of-order request response (where a later sent request but earlier received response was successful); or
251+
// 'attemptTimeMillis' is when this error exception was received by the local node. If the time that has passed since
252+
// the error was originally received is quite large, it could indicate that this is a stale error exception from some
253+
// prior out-of-order request response (where a later sent request but earlier received response was successful); or
253254
// alternatively an old error could indicate that this node did not retry the join request for a very long time.
254-
TimeValue.timeValueMillis(TimeValue.nsecToMSec(System.nanoTime() - timestamp)),
255+
TimeValue.timeValueMillis(relativeTimeMillisSupplier.getAsLong() - attemptTimeMillis),
255256
destination,
256257
joinRequest
257258
),
@@ -266,7 +267,7 @@ void logWarnWithTimestamp() {
266267
void logLastFailedJoinAttempt() {
267268
FailedJoinAttempt attempt = lastFailedJoinAttempt.get();
268269
if (attempt != null) {
269-
attempt.logWarnWithTimestamp();
270+
attempt.logWarnWithTimestamp(transportService.getThreadPool().relativeTimeInMillisSupplier());
270271
lastFailedJoinAttempt.compareAndSet(attempt, null);
271272
}
272273
}
@@ -299,7 +300,12 @@ public void sendJoinRequest(DiscoveryNode destination, long term, Optional<Join>
299300
pendingJoinInfo.message = PENDING_JOIN_FAILED;
300301
pendingOutgoingJoins.remove(dedupKey);
301302
if (e instanceof ElasticsearchException elasticsearchException) {
302-
final var attempt = new FailedJoinAttempt(destination, joinRequest, elasticsearchException);
303+
final var attempt = new FailedJoinAttempt(
304+
destination,
305+
joinRequest,
306+
elasticsearchException,
307+
transportService.getThreadPool().relativeTimeInMillis()
308+
);
303309
attempt.logNow();
304310
lastFailedJoinAttempt.set(attempt);
305311
assert elasticsearchException instanceof CircuitBreakingException : e; // others shouldn't happen, handle them anyway
@@ -374,7 +380,12 @@ public void onFailure(Exception e) {
374380
private void cleanUpOnFailure(TransportException exp) {
375381
pendingJoinInfo.message = PENDING_JOIN_FAILED;
376382
pendingOutgoingJoins.remove(dedupKey);
377-
final var attempt = new FailedJoinAttempt(destination, joinRequest, exp);
383+
final var attempt = new FailedJoinAttempt(
384+
destination,
385+
joinRequest,
386+
exp,
387+
transportService.getThreadPool().relativeTimeInMillis()
388+
);
378389
attempt.logNow();
379390
lastFailedJoinAttempt.set(attempt);
380391
unregisterAndReleaseConnection(destination, connectionReference);
@@ -390,7 +401,8 @@ public void onFailure(Exception e) {
390401
final var attempt = new FailedJoinAttempt(
391402
destination,
392403
joinRequest,
393-
new ConnectTransportException(destination, "failed to acquire connection", e)
404+
new ConnectTransportException(destination, "failed to acquire connection", e),
405+
transportService.getThreadPool().relativeTimeInMillis()
394406
);
395407
attempt.logNow();
396408
lastFailedJoinAttempt.set(attempt);

server/src/test/java/org/elasticsearch/cluster/coordination/JoinHelperTests.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,19 @@ public void testJoinDeduplication() {
177177
if (mightSucceed) {
178178
// successful requests hold the connections open until the cluster state is applied
179179
joinHelper.onClusterStateApplied();
180+
} else {
181+
final var elapsedTime = randomBoolean() ? TimeValue.ZERO : TimeValue.timeValueMillis(between(0, 100000));
182+
deterministicTaskQueue.runTasksUpToTimeInOrder(deterministicTaskQueue.getCurrentTimeMillis() + elapsedTime.millis());
183+
MockLog.assertThatLogger(
184+
joinHelper::logLastFailedJoinAttempt,
185+
JoinHelper.class,
186+
new MockLog.SeenEventExpectation(
187+
"failed to join message",
188+
JoinHelper.class.getCanonicalName(),
189+
Level.WARN,
190+
"last failed join attempt was " + elapsedTime + " ago, failed to join *"
191+
)
192+
);
180193
}
181194
assertFalse(transportService.nodeConnected(node1));
182195
assertFalse(transportService.nodeConnected(node2));

0 commit comments

Comments
 (0)