-
Notifications
You must be signed in to change notification settings - Fork 2.1k
ethos: race condition seems to cause packet loss #17254
Description
Description
I did not finish my final analysis yet, but there seems to be a race condition in the RIOT-side of ethos, that is the cause of packet loss over the wire: I was able to pinpoint loss in ethos to this if not being taken
Lines 94 to 98 in 68e1d29
| if (dev->accept_new) { | |
| if (tsrb_add_one(&dev->inbuf, c) == 0) { | |
| dev->framesize++; | |
| } | |
| } |
not being called, since dev->accept_new, when barraging the ethos line (with ethos_stdio).
Steps to reproduce the issue
I used the the following patch to analyze the loss behavior:
Patch
diff --git a/dist/tools/ethos/ethos.c b/dist/tools/ethos/ethos.c
index fc4f8c94ce..ba82ee65b9 100644
--- a/dist/tools/ethos/ethos.c
+++ b/dist/tools/ethos/ethos.c
@@ -245,6 +245,8 @@ static int _serial_handle_byte(serial_t *serial, char c)
return 0;
}
+static uint16_t _last_seq_sent;
+
static void _write_escaped(int fd, char* buf, ssize_t n)
{
/*
@@ -260,11 +262,36 @@ static void _write_escaped(int fd, char* buf, ssize_t n)
* call write() on the buffer instead of one char at a time */
uint8_t out[SERIAL_BUFFER_SIZE];
size_t buffered = 0;
+ char *orig = buf;
+ int icmpv6 = 0, echo = 0;
+ unsigned char seq_upper = 0;
while(n--) {
unsigned char c = *buf++;
+
+ if (icmpv6 && echo && (buf - orig) == 61) {
+ seq_upper = c;
+ }
+ else if (icmpv6 && echo && (buf - orig) == 62) {
+ uint16_t seq = seq_upper << 8;
+
+ seq |= c;
+ if (_last_seq_sent != (seq - 1)) {
+ fprintf(stderr, "%u sent out of order\n", seq);
+ }
+ _last_seq_sent = seq;
+ }
+ else if (icmpv6 && (buf - orig) == 55 && c == 128) {
+ echo = 1;
+ }
+ else if ((buf - orig) == 21 && c == 58) {
+ icmpv6 = 1;
+ }
if (c == LINE_FRAME_DELIMITER || c == LINE_ESC_CHAR) {
out[buffered++] = LINE_ESC_CHAR;
+ if (icmpv6 && echo) {
+ fprintf(stderr, "Escaped %u (offset %lu) in %u\n", c, buf - orig, _last_seq_sent);
+ }
c ^= 0x20;
if (buffered >= SERIAL_BUFFER_SIZE) {
checked_write(fd, out, buffered);
@@ -497,6 +524,8 @@ int _open_connection(char *name, char* option)
}
}
+static uint16_t _last_seq_recv;
+
int main(int argc, char *argv[])
{
char inbuf[MTU];
@@ -595,6 +624,15 @@ int main(int argc, char *argv[])
fprintf(stderr, "error reading from tap device. res=%zi\n", res);
continue;
}
+ if (res >= 62 && (((uint8_t *)inbuf)[20] == 58) && (((uint8_t *)inbuf)[54] == 128)) {
+ uint16_t seq = (((uint8_t *)inbuf)[60]) << 8;
+
+ seq |= (((uint8_t *)inbuf)[61]);
+ if (_last_seq_recv != (seq - 1)) {
+ fprintf(stderr, "%u received out of order\n", seq);
+ }
+ _last_seq_recv = seq;
+ }
char delim = LINE_FRAME_DELIMITER;
checked_write(serial_fd, &delim, 1);
_write_escaped(serial_fd, inbuf, res);
diff --git a/drivers/ethos/ethos.c b/drivers/ethos/ethos.c
index 8fd83b8692..d08b5421eb 100644
--- a/drivers/ethos/ethos.c
+++ b/drivers/ethos/ethos.c
@@ -52,6 +52,7 @@ static const netdev_driver_t netdev_driver_ethos;
static const uint8_t _esc_esc[] = {ETHOS_ESC_CHAR, (ETHOS_ESC_CHAR ^ 0x20)};
static const uint8_t _esc_delim[] = {ETHOS_ESC_CHAR, (ETHOS_FRAME_DELIMITER ^ 0x20)};
+static uint16_t _req_recv = 0;
void ethos_setup(ethos_t *dev, const ethos_params_t *params, uint8_t idx,
void *inbuf, size_t inbuf_size)
@@ -85,6 +86,8 @@ static void _reset_state(ethos_t *dev)
dev->accept_new = true;
}
+/* static uint16_t _last_seq_recv_serial; */
+
static void _handle_char(ethos_t *dev, char c)
{
switch (dev->frametype) {
@@ -95,9 +98,12 @@ static void _handle_char(ethos_t *dev, char c)
if (tsrb_add_one(&dev->inbuf, c) == 0) {
dev->framesize++;
}
+ else {
+ puts("!t");
+ }
}
else {
- //puts("lost frame");
+ puts("!a");
dev->inbuf.reads = 0;
dev->inbuf.writes = 0;
_reset_state(dev);
@@ -299,10 +305,14 @@ static void _get_mac_addr(netdev_t *encdev, uint8_t* buf)
memcpy(buf, dev->mac_addr, 6);
}
+static uint16_t _last_seq_recv_netdev;
+
static int _recv(netdev_t *netdev, void *buf, size_t len, void* info)
{
(void) info;
ethos_t * dev = (ethos_t *) netdev;
+ uint8_t *buf8 = buf;
+
if (buf) {
if (len < dev->last_framesize) {
@@ -317,7 +327,16 @@ static int _recv(netdev_t *netdev, void *buf, size_t len, void* info)
dev->last_framesize = 0;
return -1;
}
+ if ((len >= 62) && (buf8[20] == 58) && buf8[54] == 128) {
+ uint16_t seq = buf8[60] << 8;
+ seq |= buf8[61];
+ if (_last_seq_recv_netdev != (seq - 1)) {
+ printf("n;%u\n", seq);
+ }
+ _last_seq_recv_netdev = seq;
+ _req_recv++;
+ }
dev->last_framesize = 0;
return (int)len;
}
@@ -333,6 +352,15 @@ static int _recv(netdev_t *netdev, void *buf, size_t len, void* info)
}
}
+void print_req_recv(void)
+{
+ printf("req_recv: %d\n", _req_recv);
+}
+
+void print_resp_sent(void)
+{
+}
+
static int _get(netdev_t *dev, netopt_t opt, void *value, size_t max_len)
{
int res = 0;
diff --git a/examples/gnrc_border_router/main.c b/examples/gnrc_border_router/main.c
index 654b411a50..5bde49954d 100644
--- a/examples/gnrc_border_router/main.c
+++ b/examples/gnrc_border_router/main.c
@@ -26,6 +26,23 @@
#define MAIN_QUEUE_SIZE (8)
static msg_t _main_msg_queue[MAIN_QUEUE_SIZE];
+extern void print_req_recv(void);
+extern void print_resp_sent(void);
+
+static int _print_req_recv(int argc, char **argv)
+{
+ (void)argc;
+ (void)argv;
+ print_req_recv();
+ print_resp_sent();
+ return 0;
+}
+
+static shell_command_t cmd[] = {
+ { "req_recv", "", _print_req_recv },
+ { NULL, NULL, NULL },
+};
+
int main(void)
{
/* we need a message queue for the thread running the shell in order to
@@ -36,7 +53,7 @@ int main(void)
/* start shell */
puts("All up, running the shell now");
char line_buf[SHELL_DEFAULT_BUFSIZE];
- shell_run(NULL, line_buf, SHELL_DEFAULT_BUFSIZE);
+ shell_run(cmd, line_buf, SHELL_DEFAULT_BUFSIZE);
/* should be never reached */
return 0;and flashed the examples/gnrc_border_router example in its default config to an iotlab-m3.
The patch introduces several diagnostic tools:
- The output
Escaped X (offset Y) in Zcan be ignored, it was just used to assure the issue does not stem from escaping. !ais printed every time in_handle_charwhen theif (dev->accept_new)branch is not taken,n;Zis printed when in the netdevrecvmethod the echo request sequence numberZ-1was skipped- A shell command
req_recvis introduced to theexamples/gnrc_border_routerexample that prints the number of ICMPv6 echo requests seen by the netdevrecvmethod.
I used ifconfig to determine the link-local address of the ethos interface on the iotlab-m3 and pinged it with an interval of 10ms.
ping -i 0.01 fe80::a804:28ff:fefa:f3f8%tap0Expected results
The number of packets transmitted by ping should equal the number returned by the patched-in req_recv command, the live diagnostic output added by the patch should not appear (except for the Escaped X (offset Y) in Z output).
Actual results
The number of packets transmitted by ping is larger than the number returned by the patched-in req_recv command, the live diagnostic output shows
!a
n;Z
where Z is an echo request sequence number that saw no predecessor, is shown for every echo request that is missing in the number returned by the patched-in req_recv command, indicating, that the echo request was lost, due to branch above not taken (!a may be shown multiple times before n;Z due to other frames lost).
Versions
Analysis was done on 770fe2f, but I guess this issue exists since the inception of ethos.