Skip to content

ethos: race condition seems to cause packet loss #17254

@miri64

Description

@miri64

Description

I did not finish my final analysis yet, but there seems to be a race condition in the RIOT-side of ethos, that is the cause of packet loss over the wire: I was able to pinpoint loss in ethos to this if not being taken

if (dev->accept_new) {
if (tsrb_add_one(&dev->inbuf, c) == 0) {
dev->framesize++;
}
}

not being called, since dev->accept_new, when barraging the ethos line (with ethos_stdio).

Steps to reproduce the issue

I used the the following patch to analyze the loss behavior:

Patch
diff --git a/dist/tools/ethos/ethos.c b/dist/tools/ethos/ethos.c
index fc4f8c94ce..ba82ee65b9 100644
--- a/dist/tools/ethos/ethos.c
+++ b/dist/tools/ethos/ethos.c
@@ -245,6 +245,8 @@ static int _serial_handle_byte(serial_t *serial, char c)
     return 0;
 }
 
+static uint16_t _last_seq_sent;
+
 static void _write_escaped(int fd, char* buf, ssize_t n)
 {
     /*
@@ -260,11 +262,36 @@ static void _write_escaped(int fd, char* buf, ssize_t n)
      * call write() on the buffer instead of one char at a time */
     uint8_t out[SERIAL_BUFFER_SIZE];
     size_t buffered = 0;
+    char *orig = buf;
+    int icmpv6 = 0, echo = 0;
+    unsigned char seq_upper = 0;
 
     while(n--) {
         unsigned char c = *buf++;
+
+        if (icmpv6 && echo && (buf - orig) == 61) {
+            seq_upper = c;
+        }
+        else if (icmpv6 && echo && (buf - orig) == 62) {
+            uint16_t seq = seq_upper << 8;
+
+            seq |= c;
+            if (_last_seq_sent != (seq - 1)) {
+                fprintf(stderr, "%u sent out of order\n", seq);
+            }
+            _last_seq_sent = seq;
+        }
+        else if (icmpv6 && (buf - orig) == 55 && c == 128) {
+            echo = 1;
+        }
+        else if ((buf - orig) == 21 && c == 58) {
+            icmpv6 = 1;
+        }
         if (c == LINE_FRAME_DELIMITER || c == LINE_ESC_CHAR) {
             out[buffered++] = LINE_ESC_CHAR;
+            if (icmpv6 && echo) {
+                fprintf(stderr, "Escaped %u (offset %lu) in %u\n", c, buf - orig, _last_seq_sent);
+            }
             c ^= 0x20;
             if (buffered >= SERIAL_BUFFER_SIZE) {
                 checked_write(fd, out, buffered);
@@ -497,6 +524,8 @@ int _open_connection(char *name, char* option)
     }
 }
 
+static uint16_t _last_seq_recv;
+
 int main(int argc, char *argv[])
 {
     char inbuf[MTU];
@@ -595,6 +624,15 @@ int main(int argc, char *argv[])
                 fprintf(stderr, "error reading from tap device. res=%zi\n", res);
                 continue;
             }
+            if (res >= 62 && (((uint8_t *)inbuf)[20] == 58) && (((uint8_t *)inbuf)[54] == 128)) {
+                uint16_t seq = (((uint8_t *)inbuf)[60]) << 8;
+
+                seq |= (((uint8_t *)inbuf)[61]);
+                if (_last_seq_recv != (seq - 1)) {
+                    fprintf(stderr, "%u received out of order\n", seq);
+                }
+                _last_seq_recv = seq;
+            }
             char delim = LINE_FRAME_DELIMITER;
             checked_write(serial_fd, &delim, 1);
             _write_escaped(serial_fd, inbuf, res);
diff --git a/drivers/ethos/ethos.c b/drivers/ethos/ethos.c
index 8fd83b8692..d08b5421eb 100644
--- a/drivers/ethos/ethos.c
+++ b/drivers/ethos/ethos.c
@@ -52,6 +52,7 @@ static const netdev_driver_t netdev_driver_ethos;
 
 static const uint8_t _esc_esc[] = {ETHOS_ESC_CHAR, (ETHOS_ESC_CHAR ^ 0x20)};
 static const uint8_t _esc_delim[] = {ETHOS_ESC_CHAR, (ETHOS_FRAME_DELIMITER ^ 0x20)};
+static uint16_t _req_recv = 0;
 
 void ethos_setup(ethos_t *dev, const ethos_params_t *params, uint8_t idx,
                  void *inbuf, size_t inbuf_size)
@@ -85,6 +86,8 @@ static void _reset_state(ethos_t *dev)
     dev->accept_new = true;
 }
 
+/* static uint16_t _last_seq_recv_serial; */
+
 static void _handle_char(ethos_t *dev, char c)
 {
     switch (dev->frametype) {
@@ -95,9 +98,12 @@ static void _handle_char(ethos_t *dev, char c)
                 if (tsrb_add_one(&dev->inbuf, c) == 0) {
                     dev->framesize++;
                 }
+                else {
+                    puts("!t");
+                }
             }
             else {
-                //puts("lost frame");
+                puts("!a");
                 dev->inbuf.reads = 0;
                 dev->inbuf.writes = 0;
                 _reset_state(dev);
@@ -299,10 +305,14 @@ static void _get_mac_addr(netdev_t *encdev, uint8_t* buf)
     memcpy(buf, dev->mac_addr, 6);
 }
 
+static uint16_t _last_seq_recv_netdev;
+
 static int _recv(netdev_t *netdev, void *buf, size_t len, void* info)
 {
     (void) info;
     ethos_t * dev = (ethos_t *) netdev;
+    uint8_t *buf8 = buf;
+
 
     if (buf) {
         if (len < dev->last_framesize) {
@@ -317,7 +327,16 @@ static int _recv(netdev_t *netdev, void *buf, size_t len, void* info)
             dev->last_framesize = 0;
             return -1;
         }
+        if ((len >= 62) && (buf8[20] == 58) && buf8[54] == 128) {
+            uint16_t seq = buf8[60] << 8;
 
+            seq |= buf8[61];
+            if (_last_seq_recv_netdev != (seq - 1)) {
+                printf("n;%u\n", seq);
+            }
+            _last_seq_recv_netdev = seq;
+            _req_recv++;
+        }
         dev->last_framesize = 0;
         return (int)len;
     }
@@ -333,6 +352,15 @@ static int _recv(netdev_t *netdev, void *buf, size_t len, void* info)
     }
 }
 
+void print_req_recv(void)
+{
+    printf("req_recv: %d\n", _req_recv);
+}
+
+void print_resp_sent(void)
+{
+}
+
 static int _get(netdev_t *dev, netopt_t opt, void *value, size_t max_len)
 {
     int res = 0;
diff --git a/examples/gnrc_border_router/main.c b/examples/gnrc_border_router/main.c
index 654b411a50..5bde49954d 100644
--- a/examples/gnrc_border_router/main.c
+++ b/examples/gnrc_border_router/main.c
@@ -26,6 +26,23 @@
 #define MAIN_QUEUE_SIZE     (8)
 static msg_t _main_msg_queue[MAIN_QUEUE_SIZE];
 
+extern void print_req_recv(void);
+extern void print_resp_sent(void);
+
+static int _print_req_recv(int argc, char **argv)
+{
+    (void)argc;
+    (void)argv;
+    print_req_recv();
+    print_resp_sent();
+    return 0;
+}
+
+static shell_command_t cmd[] = {
+    { "req_recv", "", _print_req_recv },
+    { NULL, NULL, NULL },
+};
+
 int main(void)
 {
     /* we need a message queue for the thread running the shell in order to
@@ -36,7 +53,7 @@ int main(void)
     /* start shell */
     puts("All up, running the shell now");
     char line_buf[SHELL_DEFAULT_BUFSIZE];
-    shell_run(NULL, line_buf, SHELL_DEFAULT_BUFSIZE);
+    shell_run(cmd, line_buf, SHELL_DEFAULT_BUFSIZE);
 
     /* should be never reached */
     return 0;

and flashed the examples/gnrc_border_router example in its default config to an iotlab-m3.

The patch introduces several diagnostic tools:

  • The output Escaped X (offset Y) in Z can be ignored, it was just used to assure the issue does not stem from escaping.
  • !a is printed every time in _handle_char when the if (dev->accept_new) branch is not taken,
  • n;Z is printed when in the netdev recv method the echo request sequence number Z-1 was skipped
  • A shell command req_recv is introduced to the examples/gnrc_border_router example that prints the number of ICMPv6 echo requests seen by the netdev recv method.

I used ifconfig to determine the link-local address of the ethos interface on the iotlab-m3 and pinged it with an interval of 10ms.

ping -i 0.01 fe80::a804:28ff:fefa:f3f8%tap0

Expected results

The number of packets transmitted by ping should equal the number returned by the patched-in req_recv command, the live diagnostic output added by the patch should not appear (except for the Escaped X (offset Y) in Z output).

Actual results

The number of packets transmitted by ping is larger than the number returned by the patched-in req_recv command, the live diagnostic output shows

!a
n;Z

where Z is an echo request sequence number that saw no predecessor, is shown for every echo request that is missing in the number returned by the patched-in req_recv command, indicating, that the echo request was lost, due to branch above not taken (!a may be shown multiple times before n;Z due to other frames lost).

Versions

Analysis was done on 770fe2f, but I guess this issue exists since the inception of ethos.

Metadata

Metadata

Labels

Area: driversArea: Device driversArea: networkArea: NetworkingType: bugThe issue reports a bug / The PR fixes a bug (including spelling errors)

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions