Skip to content

Commit cbaf3c5

Browse files
Fix flock cluster config may cause failure to restart after kill -9 (redis#7674)
After fork, the child process (redis-aof-rewrite) will inherit the fd opened by the parent process (redis). When redis is killed by kill -9, it does not exit gracefully (prepareForShutdown() is never called), so the redis-aof-rewrite child may still be alive; the fd (and its flock) is then still held by that child, and a restarted redis will fail to acquire the lock, i.e. fail to start. This issue was causing failures in the cluster tests in GitHub Actions. Co-authored-by: Oran Agra <oran@redislabs.com>
1 parent 34c3be3 commit cbaf3c5

4 files changed

Lines changed: 31 additions & 7 deletions

File tree

src/cluster.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,15 @@ int clusterLockConfig(char *filename) {
418418
return C_ERR;
419419
}
420420
/* Lock acquired: leak the 'fd' by not closing it, so that we'll retain the
421-
* lock to the file as long as the process exists. */
421+
* lock to the file as long as the process exists.
422+
*
423+
* After fork, the child process will get the fd opened by the parent process,
424+
* we need save `fd` to `cluster_config_file_lock_fd`, so that in redisFork(),
425+
* it will be closed in the child process.
426+
* If it is not closed, when the main process is killed -9, but the child process
427+
* (redis-aof-rewrite) is still alive, the fd(lock) will still be held by the
428+
* child process, and the main process will fail to get lock, means fail to start. */
429+
server.cluster_config_file_lock_fd = fd;
422430
#endif /* __sun */
423431

424432
return C_OK;
@@ -468,6 +476,7 @@ void clusterInit(void) {
468476

469477
/* Lock the cluster config file to make sure every node uses
470478
* its own nodes.conf. */
479+
server.cluster_config_file_lock_fd = -1;
471480
if (clusterLockConfig(server.cluster_configfile) == C_ERR)
472481
exit(1);
473482

src/server.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4917,14 +4917,24 @@ void setupChildSignalHandlers(void) {
49174917
return;
49184918
}
49194919

4920+
/* After fork, the child process will inherit the resources
4921+
* of the parent process, e.g. fd(socket or flock) etc.
4922+
* should close the resources not used by the child process, so that if the
4923+
* parent restarts it can bind/lock despite the child possibly still running. */
4924+
void closeClildUnusedResourceAfterFork() {
4925+
closeListeningSockets(0);
4926+
if (server.cluster_enabled && server.cluster_config_file_lock_fd != -1)
4927+
close(server.cluster_config_file_lock_fd); /* don't care if this fails */
4928+
}
4929+
49204930
int redisFork() {
49214931
int childpid;
49224932
long long start = ustime();
49234933
if ((childpid = fork()) == 0) {
49244934
/* Child */
49254935
setOOMScoreAdj(CONFIG_OOM_BGCHILD);
4926-
closeListeningSockets(0);
49274936
setupChildSignalHandlers();
4937+
closeClildUnusedResourceAfterFork();
49284938
} else {
49294939
/* Parent */
49304940
server.stat_fork_time = ustime()-start;

src/server.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1419,6 +1419,7 @@ struct redisServer {
14191419
REDISMODULE_CLUSTER_FLAG_*. */
14201420
int cluster_allow_reads_when_down; /* Are reads allowed when the cluster
14211421
is down? */
1422+
int cluster_config_file_lock_fd; /* cluster config fd, will be flock */
14221423
/* Scripting */
14231424
lua_State *lua; /* The Lua interpreter. We use just one for all clients */
14241425
client *lua_client; /* The "fake client" to query Redis from Lua */

tests/instances.tcl

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,9 @@ proc spawn_instance {type base_port count {conf {}}} {
116116

117117
# Check availability finally
118118
if {[server_is_up 127.0.0.1 $port 100] == 0} {
119-
abort_sentinel_test "Problems starting $type #$j: ping timeout"
119+
set logfile [file join $dirname log.txt]
120+
puts [exec tail $logfile]
121+
abort_sentinel_test "Problems starting $type #$j: ping timeout, maybe server start failed, check $logfile"
120122
}
121123

122124
# Push the instance into the right list
@@ -493,12 +495,12 @@ proc kill_instance {type id} {
493495

494496
# Wait for the port it was using to be available again, so that's not
495497
# an issue to start a new server ASAP with the same port.
496-
set retry 10
498+
set retry 100
497499
while {[incr retry -1]} {
498-
set port_is_free [catch {set s [socket 127.0.01 $port]}]
500+
set port_is_free [catch {set s [socket 127.0.0.1 $port]}]
499501
if {$port_is_free} break
500502
catch {close $s}
501-
after 1000
503+
after 100
502504
}
503505
if {$retry == 0} {
504506
error "Port $port does not return available after killing instance."
@@ -525,7 +527,9 @@ proc restart_instance {type id} {
525527

526528
# Check that the instance is running
527529
if {[server_is_up 127.0.0.1 $port 100] == 0} {
528-
abort_sentinel_test "Problems starting $type #$id: ping timeout"
530+
set logfile [file join $dirname log.txt]
531+
puts [exec tail $logfile]
532+
abort_sentinel_test "Problems starting $type #$id: ping timeout, maybe server start failed, check $logfile"
529533
}
530534

531535
# Connect with it with a fresh link

0 commit comments

Comments
 (0)