Skip to content

Tarantool is extremely easy to crash by IPROTO misusage #10155

@drewdzzz

Description

@drewdzzz

By mistake, I sent IPROTO_OK after Tarantool had sent me the snapshot in response to an IPROTO_FETCH_SNAPSHOT request (Tarantool doesn't expect a reply), and then I sent IPROTO_SUBSCRIBE. This sequence of requests crashed Tarantool. So I wrote a simple test exercising misuses of the anonymous replication protocol. This protocol can be used not only by Tarantool itself but also by users manually (CDC, for example), so such misuse is possible in the real world. The four cases uncovered five different assertion failures and three different crashes.

Test itself

Should be run in parallel (for example, ./test-run.sh $(yes iproto_crash | head -n 100))

local server = require('luatest.server')
local t = require('luatest')
local msgpack = require('msgpack')
local uri = require('uri')
local lsocket = require('socket')
 
local type = box.iproto.type
local key = box.iproto.key
 
local timeout = 60
 
-- Open a raw TCP connection to `server` and consume the IPROTO greeting,
-- leaving the socket positioned at the start of the packet stream.
local function socket_connect(server)
    local parsed = uri.parse(server.net_box_uri)
    local sock = lsocket.tcp_connect(parsed.host, parsed.service)
    t.assert_not_equals(sock, nil)
    -- The greeting has a fixed size; discard it before speaking IPROTO.
    sock:read(box.iproto.GREETING_SIZE, timeout)
    return sock
end
 
-- Encode one IPROTO packet from `header`/`body` and push it down socket `s`.
-- Returns whatever s:write() returns.
local function socket_write(s, header, body)
    local packet = box.iproto.encode_packet(header, body)
    return s:write(packet)
end
 
-- Read exactly one IPROTO packet from socket `s` and return its decoded
-- header and body tables.
local function socket_read(s)
    -- A fixed 5-byte MsgPack uint prefix carries the payload length.
    local len_prefix = s:read(5, timeout)
    t.assert_equals(#len_prefix, 5)
    local payload_len = msgpack.decode(len_prefix)
    local payload = s:read(payload_len, timeout)
    t.assert_equals(#payload, payload_len)
    return box.iproto.decode_packet(len_prefix .. payload)
end
 
-- Drop the test group's current connection and establish a fresh one.
local function socket_restart(g)
    local old_sock = g.s
    old_sock:close()
    g.s = socket_connect(g.server)
end
 
-- Wrap `map` in a MsgPack object that is guaranteed to encode as a
-- MsgPack map even when the table is empty.
local function encode_map(map)
    local tagged = setmetatable(map, {__serialize = 'map'})
    return msgpack.object(tagged)
end
 
-- Tag `map` so Tarantool serializers encode it as a map, never as an
-- array (matters for empty tables). Returns the same table.
local function setmap(map)
    local mt = {__serialize = 'map'}
    return setmetatable(map, mt)
end
 
-- Send an IPROTO_FETCH_SNAPSHOT request over socket `s`.
-- Returns the result of s:write().
local function write_fetch_snapshot(s)
    local header = {
        -- Use the file-local `type` alias (see the top of the file) for
        -- consistency with the rest of this test.
        [key.REQUEST_TYPE] = type.FETCH_SNAPSHOT,
        [key.SYNC] = 1,
    }
    -- FETCH_SNAPSHOT carries no options; send an empty MsgPack map.
    local body = setmap({})
    return socket_write(s, header, body)
end
 
-- Send an IPROTO_SUBSCRIBE request over socket `s`.
-- `uuid`/`replicaset_uuid` are string UUIDs; `is_anon` (optional boolean)
-- requests anonymous replication when true.
-- Returns the result of s:write().
local function write_subscribe(s, uuid, replicaset_uuid, is_anon)
    local header = {
        -- Use the file-local `type` alias (see the top of the file) for
        -- consistency with the rest of this test.
        [key.REQUEST_TYPE] = type.SUBSCRIBE,
        [key.SYNC] = 1,
    }
    local body = {
        [key.REPLICASET_UUID] = replicaset_uuid,
        [key.INSTANCE_UUID] = uuid,
        -- An empty vclock: subscribe from the very beginning.
        [key.VCLOCK] = encode_map({}),
        [key.REPLICA_ANON] = is_anon,
    }
    return socket_write(s, header, body)
end
 
-- Send an IPROTO_OK packet over socket `s` with the given `body`
-- (defaults to an empty table). Returns the result of s:write().
local function write_ok(s, body)
    local header = {
        -- Use the file-local `type` alias (see the top of the file) for
        -- consistency with the rest of this test.
        [key.REQUEST_TYPE] = type.OK,
        [key.SYNC] = 1,
    }
    if body == nil then
        -- NOTE(review): a plain empty table may serialize as a MsgPack
        -- array, while sibling helpers use setmap({}); confirm whether an
        -- empty map is required here before changing.
        body = {}
    end
    return socket_write(s, header, body)
end
 
-- Drain the replication data stream on g.s: skip INSERT and RAFT_PROMOTE
-- rows and return the decoded header/body of the first packet of any
-- other type (e.g. the terminating OK).
local function parse_data_stream(g)
    local header, body
    repeat
        header, body = socket_read(g.s)
        local rt = header[key.REQUEST_TYPE]
    until rt ~= type.INSERT and rt ~= type.RAFT_PROMOTE
    return header, body
end
 
local g = t.group()

-- Each case gets a fresh server with a populated 'test' space and a raw
-- TCP connection stored in cg.s.
g.before_each(function(cg)
    cg.server = server:new()
    cg.server:start()
    cg.server:exec(function()
        box.schema.space.create('test')
        box.space.test:create_index('pk')
        for i = 1, 100 do
            box.space.test:replace{i}
        end
    end)
    cg.s = socket_connect(cg.server)
end)

g.after_each(function(cg)
    cg.s:close()
    cg.server:stop()
end)
 
-- The case sends a single stray IPROTO_OK before subscribing anonymously.
g.test_iproto_crash_on_subscribe = function(g)
    local instance_uuid = require('uuid').str()
    local rs_uuid = g.server:eval('return box.info.replicaset.uuid')

    -- Fire the OK the server does not expect; deliberately leave any
    -- reply unread.
    write_ok(g.s)

    write_subscribe(g.s, instance_uuid, rs_uuid, true)
    socket_read(g.s)
end
 
-- The case spams dummy IPROTO_OK packets before subscribing anonymously.
g.test_iproto_crash_on_subscribe_spam_ok = function(g)
    local instance_uuid = require('uuid').str()
    local rs_uuid = g.server:eval('return box.info.replicaset.uuid')

    -- Flood the server with OKs it does not expect; never read replies.
    for _ = 1, 100 do
        write_ok(g.s)
    end

    write_subscribe(g.s, instance_uuid, rs_uuid, true)
    socket_read(g.s)
end
 
-- The case simulates a situation where the user of anonymous replication
-- simply sent IPROTO_OK back after FETCH_SNAPSHOT by mistake (did not know
-- that Tarantool doesn't expect a reply on FETCH_SNAPSHOT)
g.test_iproto_crash_fetch_snapshot_subscribe = function(g)
    local uuid = require('uuid').str()
    local replicaset_uuid = g.server:eval('return box.info.replicaset.uuid')

    write_fetch_snapshot(g.s)
    -- Read OK response
    socket_read(g.s)
    -- Read all data sent in response
    local h = parse_data_stream(g)
    -- Index the decoded header by the IPROTO key constant, as everywhere
    -- else in this file; `h.REQUEST_TYPE` is always nil, so the original
    -- assertion compared nil against type.OK.
    t.assert_equals(h[key.REQUEST_TYPE], type.OK)

    write_ok(g.s)

    write_subscribe(g.s, uuid, replicaset_uuid, true)
    socket_read(g.s)
end
 
-- The same as above, but additionally forgot to pass is_anon option to
-- subscribe
g.test_iproto_crash_fetch_snapshot_subscribe_non_anon = function(g)
    local uuid = require('uuid').str()
    local replicaset_uuid = g.server:eval('return box.info.replicaset.uuid')

    write_fetch_snapshot(g.s)
    -- Read OK response
    socket_read(g.s)
    -- Read all data sent in response
    local h = parse_data_stream(g)
    -- Index the decoded header by the IPROTO key constant, as everywhere
    -- else in this file; `h.REQUEST_TYPE` is always nil, so the original
    -- assertion compared nil against type.OK.
    t.assert_equals(h[key.REQUEST_TYPE], type.OK)

    write_ok(g.s)

    -- is_anon intentionally omitted: non-anonymous subscribe.
    write_subscribe(g.s, uuid, replicaset_uuid)
    socket_read(g.s)
end
Backtrace 1
[013] #   #1  0x102550324 in crash_collect+200
[013] #   #2  0x1025501e0 in crash_signal_cb+76
[013] #   #3  0x18ca13584 in _sigtramp+56
[013] #   #4  0x102575b74 in iostream_writev+160
[013] #   #5  0x102575050 in coio_flush+64
[013] #   #6  0x102574d14 in coio_writev_timeout+360
[013] #   #7  0x102252008 in coio_writev(iostream*, iovec*, int, unsigned long)+56
[013] #   #8  0x102251f5c in coio_write_xrow+188
[013] #   #9  0x1023eab6c in relay_send(relay*, xrow_header*)+292
[013] #   #10 0x1023ec834 in relay_send_tx(relay*)+384
[013] #   #11 0x1023e8314 in relay_process_row(xstream*, xrow_header*)+788
[013] #   #12 0x1023d714c in xstream_write+40
[013] #   #13 0x1023d5bdc in recover_xlog(recovery*, xstream*, vclock const*)+812
[013] #   #14 0x1023d5300 in recover_remaining_wals(recovery*, xstream*, vclock const*, bool)+568
[013] #   #15 0x1023eb630 in relay_process_wal_event(wal_watcher*, unsigned int)+92
[013] #   #16 0x1023fec5c in wal_watcher_notify_perform+68
[013] #   #17 0x102568808 in cmsg_deliver+52
[013] #   #18 0x102569700 in cbus_process+192
[013] #   #19 0x102568f90 in cbus_pair+272
[013] #   #20 0x1023febc4 in wal_set_watcher+308
[013] #   #21 0x1023e91a4 in relay_subscribe_f(char*)+708
Backtrace 2
[013] #   #1  0x100bc8324 in crash_collect+200
[013] #   #2  0x100bc81e0 in crash_signal_cb+76
[013] #   #3  0x18ca13584 in _sigtramp+56
[013] #   #4  0x100bedb74 in iostream_writev+160
[013] #   #5  0x100bed050 in coio_flush+64
[013] #   #6  0x100becd14 in coio_writev_timeout+360
[013] #   #7  0x1008ca008 in coio_writev(iostream*, iovec*, int, unsigned long)+56
[013] #   #8  0x1008c9f5c in coio_write_xrow+188
[013] #   #9  0x100a2299c in box_process_subscribe(iostream*, xrow_header const*)+4276
[013] #   #10 0x1008be688 in tx_process_replication(cmsg*)+684
[013] #   #11 0x100be0808 in cmsg_deliver+52
[013] #   #12 0x100be2b60 in fiber_pool_f+832
[013] #   #13 0x1008b2590 in fiber_cxx_invoke(int (*)(char*), char*)+32
[013] #   #14 0x100bd3e70 in fiber_loop+268
[013] #   #15 0x100fd90b0 in coro_startup+8
Backtrace 3
[2024-06-21 14:28:30.394620] #   #1  0x100b14324 in crash_collect+200
[2024-06-21 14:28:30.394838] #   #2  0x100b141e0 in crash_signal_cb+76
[2024-06-21 14:28:30.395050] #   #3  0x18ca13584 in _sigtramp+56
[2024-06-21 14:28:30.395266] #   #4  0x100b381dc in iostream_read+160
[2024-06-21 14:28:30.395494] #   #5  0x100b37e88 in coio_read_ahead_timeout+364
[2024-06-21 14:28:30.395712] #   #6  0x100b38564 in coio_readn_ahead_timeout+56
[2024-06-21 14:28:30.395928] #   #7  0x100815da4 in coio_breadn_timeout(iostream*, ibuf*, unsigned long, double)+156
[2024-06-21 14:28:30.396120] #   #8  0x1008159b0 in coio_read_xrow_timeout_xc+104
[2024-06-21 14:28:30.396332] #   #9  0x1009ac6d4 in relay_reader_f(char*)+624
[2024-06-21 14:28:30.396542] #   #10 0x1007fe590 in fiber_cxx_invoke(int (*)(char*), char*)+32
[2024-06-21 14:28:30.398392] #   #11 0x100b1fe70 in fiber_loop+268
[2024-06-21 14:28:30.399069] #   #12 0x100f250b0 in coro_startup+8
A bunch of failed assertions

Reproduced on Linux x86-64:

./third_party/libev/ev_epoll.c:134: epoll_modify: Assertion `("libev: I/O watcher with invalid fd found in epoll_ctl", errno != EBADF && errno != ELOOP && errno != EINVAL)' failed.

Reproduced on MacOS 14 (Apple M2 Pro):

Assertion failed: ((io)->owner == NULL), function iostream_read, file iostream.h, line 198
Assertion failed: ((io)->owner == cord()), function iostream_read, file iostream.h, line 200
Assertion failed: ((io)->owner == NULL), function iostream_writev, file iostream.h, line 226.

Reproduced on both platforms:

Assertion failed: (! ev_is_active(&con->input)), function net_end_subscribe, file iproto.cc, line 3279.

Tarantool version: Tarantool 3.1.0-entrypoint-362-gd2240bf7c (current master)

Metadata

Metadata

Assignees

Labels

2.11Target is 2.11 and all newer release/master branchesbugSomething isn't workingcrashiproto

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions