Skip to content

Commit bdcf80e

Browse files
committed
networking.c/writeToClient: handle WSAEWOULDBLOCK
- fixed writeToClient() after failed tests
- adjusted tests to match antirez/redis@4.0.2
1 parent 155111f commit bdcf80e

30 files changed

Lines changed: 767 additions & 128 deletions

src/networking.c

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -939,6 +939,14 @@ int writeToClient(int fd, client *c, int handler_installed) {
939939
server.el, c, c->buf, NULL);
940940
if (result == SOCKET_ERROR && errno != WSA_IO_PENDING) {
941941
nwritten = -1;
942+
943+
//[tporadowski/#11] we might be bursting data too fast, so turn it into another try that will put back the client
944+
// in the sending queue
945+
if (errno == WSAEWOULDBLOCK) {
946+
serverLog(LL_DEBUG, "writeToClient: will try again (EAGAIN) due to WSAEWOULDBLOCK");
947+
errno = EAGAIN;
948+
}
949+
942950
break;
943951
}
944952
#else
@@ -1002,7 +1010,7 @@ int writeToClient(int fd, client *c, int handler_installed) {
10021010
if (listLength(c->reply) == 0)
10031011
serverAssert(c->reply_bytes == 0);
10041012
}
1005-
}
1013+
}
10061014
/* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT
10071015
* bytes, in a single threaded server it's a good idea to serve
10081016
* other clients as well, even if a very large request comes from
@@ -1014,7 +1022,7 @@ int writeToClient(int fd, client *c, int handler_installed) {
10141022
if (totwritten > NET_MAX_WRITES_PER_EVENT &&
10151023
(server.maxmemory == 0 ||
10161024
zmalloc_used_memory() < server.maxmemory)) break;
1017-
}
1025+
}
10181026
server.stat_net_output_bytes += totwritten;
10191027
if (nwritten == -1) {
10201028
if (errno == EAGAIN) {
@@ -1070,7 +1078,7 @@ int writeToClient(int fd, client *c, int handler_installed) {
10701078
}
10711079
#endif
10721080
return C_OK;
1073-
}
1081+
}
10741082

10751083
/* Write event handler. Just send data to the client. */
10761084
void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {

tests/assets/default.conf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Redis configuration for testing.
22

3+
#always-show-logo yes
34
notify-keyspace-events KEA
45
daemonize no
56
pidfile /var/run/redis.pid

tests/cluster/tests/04-resharding.tcl

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ array set content {}
5959
set tribpid {}
6060

6161
test "Cluster consistency during live resharding" {
62+
set ele 0
6263
for {set j 0} {$j < $numops} {incr j} {
6364
# Trigger the resharding once we execute half the ops.
6465
if {$tribpid ne {} &&
@@ -87,7 +88,7 @@ test "Cluster consistency during live resharding" {
8788
# Write random data to random list.
8889
set listid [randomInt $numkeys]
8990
set key "key:$listid"
90-
set ele [randomValue]
91+
incr ele
9192
# We write both with Lua scripts and with plain commands.
9293
# This way we are able to stress Lua -> Redis command invocation
9394
# as well, that has tests to prevent Lua to write into wrong
@@ -116,7 +117,9 @@ test "Cluster consistency during live resharding" {
116117
test "Verify $numkeys keys for consistency with logical content" {
117118
# Check that the Redis Cluster content matches our logical content.
118119
foreach {key value} [array get content] {
119-
assert {[$cluster lrange $key 0 -1] eq $value}
120+
if {[$cluster lrange $key 0 -1] ne $value} {
121+
fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]"
122+
}
120123
}
121124
}
122125

@@ -134,7 +137,9 @@ test "Cluster should eventually be up again" {
134137
test "Verify $numkeys keys after the crash & restart" {
135138
# Check that the Redis Cluster content matches our logical content.
136139
foreach {key value} [array get content] {
137-
assert {[$cluster lrange $key 0 -1] eq $value}
140+
if {[$cluster lrange $key 0 -1] ne $value} {
141+
fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]"
142+
}
138143
}
139144
}
140145

tests/integration/aof.tcl

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,10 +69,9 @@ tags {"aof"} {
6969
assert_equal 1 [is_alive $srv]
7070
}
7171

72-
test "Truncated AOF loaded: we expect foo to be equal to 6 now" {
73-
assert_equal 1 [is_alive $srv]
72+
set client [redis [dict get $srv host] [dict get $srv port]]
7473

75-
set client [redis [dict get $srv host] [dict get $srv port]]
74+
test "Truncated AOF loaded: we expect foo to be equal to 6 now" {
7675
assert {[$client get foo] eq "6"}
7776
}
7877
}

tests/integration/psync2-reg.tcl

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# Issue 3899 regression test.
2+
# We create a chain of three instances: master -> slave -> slave2
3+
# and continuously break the link while traffic is generated by
4+
# redis-benchmark. At the end we check that the data is the same
5+
# everywhere.
6+
7+
start_server {tags {"psync2"}} {
8+
start_server {} {
9+
start_server {} {
10+
# Config
11+
set debug_msg 0 ; # Enable additional debug messages
12+
13+
set no_exit 0 ; # Do not exit at end of the test
14+
15+
set duration 20 ; # Total test seconds
16+
17+
for {set j 0} {$j < 3} {incr j} {
18+
set R($j) [srv [expr 0-$j] client]
19+
set R_host($j) [srv [expr 0-$j] host]
20+
set R_port($j) [srv [expr 0-$j] port]
21+
if {$debug_msg} {puts "Log file: [srv [expr 0-$j] stdout]"}
22+
}
23+
24+
# Setup the replication and backlog parameters
25+
test "PSYNC2 #3899 regression: setup" {
26+
$R(1) slaveof $R_host(0) $R_port(0)
27+
$R(2) slaveof $R_host(0) $R_port(0)
28+
$R(0) set foo bar
29+
wait_for_condition 50 1000 {
30+
[$R(1) dbsize] == 1 && [$R(2) dbsize] == 1
31+
} else {
32+
fail "Slaves not replicating from master"
33+
}
34+
$R(0) config set repl-backlog-size 10mb
35+
$R(1) config set repl-backlog-size 10mb
36+
}
37+
38+
set cycle_start_time [clock milliseconds]
39+
set bench_pid [exec src/redis-benchmark -p $R_port(0) -n 10000000 -r 1000 incr __rand_int__ > /dev/null &]
40+
while 1 {
41+
set elapsed [expr {[clock milliseconds]-$cycle_start_time}]
42+
if {$elapsed > $duration*1000} break
43+
if {rand() < .05} {
44+
test "PSYNC2 #3899 regression: kill first slave" {
45+
$R(1) client kill type master
46+
}
47+
}
48+
if {rand() < .05} {
49+
test "PSYNC2 #3899 regression: kill chained slave" {
50+
$R(2) client kill type master
51+
}
52+
}
53+
after 100
54+
}
55+
exec kill -9 $bench_pid
56+
57+
if {$debug_msg} {
58+
for {set j 0} {$j < 100} {incr j} {
59+
if {
60+
[$R(0) debug digest] == [$R(1) debug digest] &&
61+
[$R(1) debug digest] == [$R(2) debug digest]
62+
} break
63+
puts [$R(0) debug digest]
64+
puts [$R(1) debug digest]
65+
puts [$R(2) debug digest]
66+
after 1000
67+
}
68+
}
69+
70+
test "PSYNC2 #3899 regression: verify consistency" {
71+
wait_for_condition 50 1000 {
72+
([$R(0) debug digest] eq [$R(1) debug digest]) &&
73+
([$R(1) debug digest] eq [$R(2) debug digest])
74+
} else {
75+
fail "The three instances have different data sets"
76+
}
77+
}
78+
}}}

tests/integration/psync2.tcl

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
start_server {tags {"psync2"}} {
2+
start_server {} {
3+
start_server {} {
4+
start_server {} {
5+
start_server {} {
6+
set master_id 0 ; # Current master
7+
set start_time [clock seconds] ; # Test start time
8+
set counter_value 0 ; # Current value of the Redis counter "x"
9+
10+
# Config
11+
set debug_msg 0 ; # Enable additional debug messages
12+
13+
set no_exit 0; ; # Do not exit at end of the test
14+
15+
set duration 20 ; # Total test seconds
16+
17+
set genload 1 ; # Load master with writes at every cycle
18+
19+
set genload_time 5000 ; # Writes duration time in ms
20+
21+
set disconnect 1 ; # Break replication link between random
22+
# master and slave instances while the
23+
# master is loaded with writes.
24+
25+
set disconnect_period 1000 ; # Disconnect repl link every N ms.
26+
27+
for {set j 0} {$j < 5} {incr j} {
28+
set R($j) [srv [expr 0-$j] client]
29+
set R_host($j) [srv [expr 0-$j] host]
30+
set R_port($j) [srv [expr 0-$j] port]
31+
if {$debug_msg} {puts "Log file: [srv [expr 0-$j] stdout]"}
32+
}
33+
34+
set cycle 1
35+
while {([clock seconds]-$start_time) < $duration} {
36+
test "PSYNC2: --- CYCLE $cycle ---" {
37+
incr cycle
38+
}
39+
40+
# Create a random replication layout.
41+
# Start with switching master (this simulates a failover).
42+
43+
# 1) Select the new master.
44+
set master_id [randomInt 5]
45+
set used [list $master_id]
46+
test "PSYNC2: \[NEW LAYOUT\] Set #$master_id as master" {
47+
$R($master_id) slaveof no one
48+
if {$counter_value == 0} {
49+
$R($master_id) set x $counter_value
50+
}
51+
}
52+
53+
# 2) Attach all the slaves to a random instance
54+
while {[llength $used] != 5} {
55+
while 1 {
56+
set slave_id [randomInt 5]
57+
if {[lsearch -exact $used $slave_id] == -1} break
58+
}
59+
set rand [randomInt [llength $used]]
60+
set mid [lindex $used $rand]
61+
set master_host $R_host($mid)
62+
set master_port $R_port($mid)
63+
64+
test "PSYNC2: Set #$slave_id to replicate from #$mid" {
65+
$R($slave_id) slaveof $master_host $master_port
66+
}
67+
lappend used $slave_id
68+
}
69+
70+
# 3) Increment the counter and wait for all the instances
71+
# to converge.
72+
test "PSYNC2: cluster is consistent after failover" {
73+
$R($master_id) incr x; incr counter_value
74+
for {set j 0} {$j < 5} {incr j} {
75+
wait_for_condition 50 1000 {
76+
[$R($j) get x] == $counter_value
77+
} else {
78+
fail "Instance #$j x variable is inconsistent"
79+
}
80+
}
81+
}
82+
83+
# 4) Generate load while breaking the connection of random
84+
# slave-master pairs.
85+
test "PSYNC2: generate load while killing replication links" {
86+
set t [clock milliseconds]
87+
set next_break [expr {$t+$disconnect_period}]
88+
while {[clock milliseconds]-$t < $genload_time} {
89+
if {$genload} {
90+
$R($master_id) incr x; incr counter_value
91+
}
92+
if {[clock milliseconds] == $next_break} {
93+
set next_break \
94+
[expr {[clock milliseconds]+$disconnect_period}]
95+
set slave_id [randomInt 5]
96+
if {$disconnect} {
97+
$R($slave_id) client kill type master
98+
if {$debug_msg} {
99+
puts "+++ Breaking link for slave #$slave_id"
100+
}
101+
}
102+
}
103+
}
104+
}
105+
106+
# 5) Increment the counter and wait for all the instances
107+
set x [$R($master_id) get x]
108+
test "PSYNC2: cluster is consistent after load (x = $x)" {
109+
for {set j 0} {$j < 5} {incr j} {
110+
wait_for_condition 50 1000 {
111+
[$R($j) get x] == $counter_value
112+
} else {
113+
fail "Instance #$j x variable is inconsistent"
114+
}
115+
}
116+
}
117+
118+
# Put down the old master so that it cannot generate more
119+
# replication stream, this way in the next master switch, the time at
120+
# which we move slaves away is not important, each will have full
121+
# history (otherwise PINGs will make certain slaves have more history),
122+
# and sometimes a full resync will be needed.
123+
$R($master_id) slaveof 127.0.0.1 0 ;# We use port zero to make it fail.
124+
125+
if {$debug_msg} {
126+
for {set j 0} {$j < 5} {incr j} {
127+
puts "$j: sync_full: [status $R($j) sync_full]"
128+
puts "$j: id1 : [status $R($j) master_replid]:[status $R($j) master_repl_offset]"
129+
puts "$j: id2 : [status $R($j) master_replid2]:[status $R($j) second_repl_offset]"
130+
puts "$j: backlog : firstbyte=[status $R($j) repl_backlog_first_byte_offset] len=[status $R($j) repl_backlog_histlen]"
131+
puts "---"
132+
}
133+
}
134+
135+
test "PSYNC2: total sum of full synchronizations is exactly 4" {
136+
set sum 0
137+
for {set j 0} {$j < 5} {incr j} {
138+
incr sum [status $R($j) sync_full]
139+
}
140+
assert {$sum == 4}
141+
}
142+
}
143+
144+
test "PSYNC2: Bring the master back again for next test" {
145+
$R($master_id) slaveof no one
146+
set master_host $R_host($master_id)
147+
set master_port $R_port($master_id)
148+
for {set j 0} {$j < 5} {incr j} {
149+
if {$j == $master_id} continue
150+
$R($j) slaveof $master_host $master_port
151+
}
152+
153+
# Wait for slaves to sync
154+
wait_for_condition 50 1000 {
155+
[status $R($master_id) connected_slaves] == 4
156+
} else {
157+
fail "Slave not reconnecting"
158+
}
159+
}
160+
161+
test "PSYNC2: Partial resync after restart using RDB aux fields" {
162+
# Pick a random slave
163+
set slave_id [expr {($master_id+1)%5}]
164+
set sync_count [status $R($master_id) sync_full]
165+
catch {
166+
$R($slave_id) config rewrite
167+
$R($slave_id) debug restart
168+
}
169+
wait_for_condition 50 1000 {
170+
[status $R($master_id) connected_slaves] == 4
171+
} else {
172+
fail "Slave not reconnecting"
173+
}
174+
set new_sync_count [status $R($master_id) sync_full]
175+
assert {$sync_count == $new_sync_count}
176+
}
177+
178+
if {$no_exit} {
179+
while 1 { puts -nonewline .; flush stdout; after 1000}
180+
}
181+
182+
}}}}}

tests/integration/rdb.tcl

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@ proc start_server_and_kill_it {overrides code} {
5050
kill_server $srv
5151
}
5252

53-
if { $::tcl_platform(platform) != "windows" } {
5453
# Make the RDB file unreadable
5554
file attributes [file join $server_path dump.rdb] -permissions 0222
5655

@@ -67,7 +66,7 @@ if {!$isroot} {
6766
test {Server should not start if RDB file can't be open} {
6867
wait_for_condition 50 100 {
6968
[string match {*Fatal error loading*} \
70-
[exec tail -n1 < [dict get $srv stdout]]]
69+
[exec tail -1 < [dict get $srv stdout]]]
7170
} else {
7271
fail "Server started even if RDB was unreadable!"
7372
}
@@ -90,11 +89,10 @@ close $fd
9089
start_server_and_kill_it [list "dir" $server_path] {
9190
test {Server should not start if RDB is corrupted} {
9291
wait_for_condition 50 100 {
93-
[string match {*RDB checksum*} \
94-
[exec tail -n10 < [dict get $srv stdout]]]
92+
[string match {*CRC error*} \
93+
[exec tail -10 < [dict get $srv stdout]]]
9594
} else {
9695
fail "Server started even if RDB was corrupted!"
9796
}
9897
}
9998
}
100-
}

0 commit comments

Comments (0)