box: read snapshot in a distinct thread to improve recovery performance by nshy · Pull Request #11861 · tarantool/tarantool

nshy · 2025-09-29T08:49:06Z

Validating requests MsgPack and parsing the requests body turns out to be significant part of recovery in terms of CPU. Let's move it to a distinct thread. This is similar to iproto and applier where we have a distinct thread(s) for that.

Snapshot is written in batches of about 128k
(XLOG_TX_AUTOCOMMIT_THRESHOLD). The recovery thread reads the batch, parses its requests and send to TX to process. The maximum number of read but not processed batches is limited to 2 to make a flow control.

Closes #11888

Performance test results

Recovery performance is evaluated using below perf test. The 1.10.5 performance is taken from the previous evaluations for the issue.

tarantool perf/lua/recovery.lua \
  --row_count 10000000 --column_count 10 --recovery_count 5 \
  --preload_script "require('compat').box_recovery_triggers_deprecation = 'new'"

Run                 | Perf, krps |
---------------------------------|
master, current     |    1060    |
---------------------------------|
master, patched     |    1800    |
---------------------------------|
1.10.5              |    1315    |

So introducing snapshot reader thread gives 70% boost for recovery from snapshot. Now master is 37% faster than 1.10.5.

If box_recovery_triggers_deprecation = 'old' then performance is only 990 krps.

coveralls · 2025-09-29T14:21:19Z

coverage: 87.642% (+0.01%) from 87.632%
when pulling 712d6b9 on nshy:recovery-thread
into a854a65
on tarantool:master.

src/box/xlog_reader.h

src/box/xlog_reader.c

src/box/memtx_engine.cc

src/box/xlog_reader.h

src/box/memtx_engine.cc

src/box/xlog.h

src/box/xlog_reader.c

changelogs/unreleased/gh-11888-snapshot-recovery-perf-improved.md

src/box/xlog.h

src/box/xlog.c

src/box/xlog_reader.h

nshy · 2025-10-13T08:10:25Z

I also avoid using mempool to keep batches data as discussed f2f. For that we start to keep vclock in struct synchro_request and struct raft_request right in these structures (done in distinct commit). Also we need to fix pointer to header for struct request in batch (see xlog_batch_get()).

diff

diff --git a/src/box/xlog.c b/src/box/xlog.c
index bab4bd4bb1..b1bde679aa 100644
--- a/src/box/xlog.c
+++ b/src/box/xlog.c
@@ -2245,7 +2245,6 @@ xlog_batch_destroy(struct xlog_batch *batch)
 		xlog_batch_get(batch, i, &entry);
 		if (entry->error != NULL)
 			error_unref(entry->error);
-		mempool_free(&batch->cursor->entry_pool, entry);
 	}
 	ibuf_destroy(&batch->data);
 	ibuf_destroy(&batch->entries);
@@ -2260,8 +2259,6 @@ xlog_entry_cursor_create(struct xlog_entry_cursor *cursor,
 	cursor->search_magic = false;
 	cursor->eof = false;
 	cursor->error = NULL;
-	mempool_create(&cursor->entry_pool, cord_slab_cache(),
-		       sizeof(struct xlog_entry));
 }
 
 void
@@ -2271,7 +2268,6 @@ xlog_entry_cursor_destroy(struct xlog_entry_cursor *cursor)
 		xlog_cursor_close(cursor->raw_cursor, false);
 	if (cursor->error != NULL)
 		error_unref(cursor->error);
-	mempool_destroy(&cursor->entry_pool);
 	TRASH(cursor);
 }
 
@@ -2330,19 +2326,18 @@ xlog_entry_cursor_next_batch(struct xlog_entry_cursor *cursor,
 		}
 		return 1;
 	}
-	batch->cursor = cursor;
 	batch->entry_count = 0;
 	ibuf_create(&batch->entries, &cord()->slabc,
 		    /*start_capacity=*/16 * 1024);
 	while (ibuf_used(&batch->data) != 0) {
 		/* Sic: alignment should be correct here. */
-		struct xlog_entry *entry = xmempool_alloc(&cursor->entry_pool);
+		struct xlog_entry *entry =
+			ibuf_alloc(&batch->entries, sizeof(*entry));
 		rc = xrow_decode(&entry->header,
 				 (const char **)&batch->data.rpos,
 				 (const char *)batch->data.wpos,
 				 /*end_is_exact=*/false);
 		if (rc != 0) {
-			mempool_free(&cursor->entry_pool, entry);
 			diag_log();
 			diag_set(XlogError, "can't parse row");
 			if (batch->entry_count == 0) {
@@ -2356,9 +2351,6 @@ xlog_entry_cursor_next_batch(struct xlog_entry_cursor *cursor,
 		}
 		batch->entry_count++;
 		entry->error = NULL;
-		struct xlog_entry **entry_ptr =
-			xibuf_alloc(&batch->entries, sizeof(*entry_ptr));
-		*entry_ptr = entry;
 		if (xlog_entry_decode(entry) != 0) {
 			entry->error = diag_last_error(diag_get());
 			error_ref(entry->error);
diff --git a/src/box/xlog.h b/src/box/xlog.h
index b3ead949e2..ff743cf344 100644
--- a/src/box/xlog.h
+++ b/src/box/xlog.h
@@ -42,7 +42,6 @@
 
 #include "small/ibuf.h"
 #include "small/obuf.h"
-#include "small/mempool.h"
 
 struct iovec;
 struct xrow_header;
@@ -925,13 +924,21 @@ struct xlog_entry {
 	struct error *error;
 };
 
+/** Batch of `struct xlog_entry`. */
+struct xlog_batch {
+	/** Raw batch data. */
+	struct ibuf data;
+	/** Buffer holding array of `struct xlog_entry`. */
+	struct ibuf entries;
+	/** Number of entries. */
+	size_t entry_count;
+};
+
 /** Cursor to read `struct xlog_entry`. */
 struct
 xlog_entry_cursor {
 	/** Cursor used to get raw batch data. */
 	struct xlog_cursor *raw_cursor;
-	/** Pool for `struct xlog_entry *` objects. */
-	struct mempool entry_pool;
 	/** Set if last batch reading failed. */
 	bool search_magic;
 	/** Set if EOF is reached. */
@@ -940,18 +947,6 @@ xlog_entry_cursor {
 	struct error *error;
 };
 
-/** Batch of `struct xlog_entry`. */
-struct xlog_batch {
-	/** The cursor this batch is read from. */
-	struct xlog_entry_cursor *cursor;
-	/** Raw batch data. */
-	struct ibuf data;
-	/** Buffer holding array of `struct xlog_entry *`. */
-	struct ibuf entries;
-	/** Number of entries. */
-	size_t entry_count;
-};
-
 /**
  * Get xlog entry at `index` from the `batch`.
  *
@@ -965,8 +960,14 @@ xlog_batch_get(struct xlog_batch *batch, size_t index,
 	       struct xlog_entry **entry)
 {
 	assert(index < batch->entry_count);
-	struct xlog_entry **entries = (struct xlog_entry **)batch->entries.rpos;
-	*entry = entries[index];
+	struct xlog_entry *entries = (struct xlog_entry *)batch->entries.rpos;
+	*entry = &entries[index];
+	/*
+	 * DML has header reference that may become invalid due to
+	 * reallocations during parsing the batch. Fix header reference.
+	 */
+	if (iproto_type_is_dml((*entry)->header.type))
+		(*entry)->dml.header = &(*entry)->header;
 	if ((*entry)->error != NULL) {
 		diag_set_error(diag_get(), (*entry)->error);
 		return -1;
@@ -974,12 +975,7 @@ xlog_batch_get(struct xlog_batch *batch, size_t index,
 	return 0;
 }
 
-/**
- * Destroys the batch.
- *
- * The batch should be destroyed before the cursor from which it is read from
- * is destroyed.
- */
+/** Destroys the batch. */
 void
 xlog_batch_destroy(struct xlog_batch *batch);
 
diff --git a/src/box/xlog_reader.c b/src/box/xlog_reader.c
index daf9b18403..7718b6e64e 100644
--- a/src/box/xlog_reader.c
+++ b/src/box/xlog_reader.c
@@ -303,12 +303,17 @@ xlog_reader_f(va_list ap)
 		rc = 0;
 	}
 
-	while (reader->spare_msg == NULL)
-		fiber_yield();
-
 	if (opened)
 		xlog_entry_cursor_destroy(&cursor);
 
+	/*
+	 * cbus_endpoint_destroy() below is not enough. This loop is required
+	 * to avoid race on creation. The thread may exit before even TX
+	 * is connected to its endpoint.
+	 */
+	while (reader->spare_msg == NULL)
+		fiber_yield();
+
 	cbus_endpoint_destroy(&endpoint, NULL);
 	cpipe_destroy(&reader->tx_pipe);

src/box/box.cc

src/box/xlog.h

src/box/xlog.c

src/box/xlog.h

src/box/xlog.c

src/box/xlog_reader.c

src/box/memtx_engine.cc

It is just `struct ibuf` actually. The size we add to it we use in `xlog_tx_cursor_pos()` to calculate `ibuf_pos()`. We don't need `xlog_tx_cursor_*` functions in header and can open code it as they used only once. Exception is `xlog_tx_cursor_create()` which is renamed to `xlog_read_tx()`. Part of tarantool#11888 NO_TEST=refactoring NO_CHANGELOG=refactoring NO_DOC=refactoring

src/box/applier.cc

src/box/xlog.h

sergepetrenko

Thanks for the patch!
I've only looked over the box: keep vclock in synchro and raft request commit. LGTM.

nshy · 2025-10-23T07:50:30Z

Fixed fuzzer after PR changes (commit "box: keep vclock in synchro and raft request").

--- a/test/fuzz/xrow_decode_raft_fuzzer.c
+++ b/test/fuzz/xrow_decode_raft_fuzzer.c
@@ -39,8 +39,7 @@ LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
        row.bodycnt = 1;

        struct raft_request request = {0};
-       struct vclock vclock = {0};
-       xrow_decode_raft(&row, &request, &vclock);
+       xrow_decode_raft(&row, &request);

        return 0;
 }

Currently we keep a pointer to vclock in `struct synchro_request` and `struct raft_request` unlikely to the other requests. There we keep only pointers to MsgPack data but keep parsed values right in the request struct. Thus this makes these two requests memory management different from the other requests. Let's align them with others. Part of tarantool#11888 NO_CHANGELOG=refactoring NO_DOC=refactoring

Validating requests MsgPack and parsing the requests body turns out to be significant part of recovery in terms of CPU. Let's move it to a distinct thread. This is similar to iproto and applier where we have a distinct thread(s) for that. Snapshot is written in batches of about 128k (`XLOG_TX_AUTOCOMMIT_THRESHOLD`). The recovery thread reads the batch, parses its requests and send to TX to process. The maximum number of read but not processed batches is limited to 2 to make a flow control. Closes tarantool#11888 Part of tarantool#11452 ============================ = Performance test results = ============================ Recovery performance is evaluated using below perf test. The 1.10.5 performance is taken from the previous evaluations for the issue. ``` tarantool perf/lua/recovery.lua \ --row_count 10000000 --column_count 10 --recovery_count 5 \ --preload_script "require('compat').box_recovery_triggers_deprecation = 'new'" ``` Run | Perf, krps | ---------------------------------| master, current | 1060 | ---------------------------------| master, patched | 1800 | ---------------------------------| 1.10.5 | 1315 | So introducing snapshot reader thread gives 70% boost for recovery from snapshot. Now master is 37% faster than 1.10.5. If `box_recovery_triggers_deprecation = 'old'` then performance is only 990 krps. NO_TEST=performance NO_DOC=performance

ligurio

changes in test/fuzz/xrow_decode_raft_fuzzer.c - LGTM

nshy force-pushed the recovery-thread branch 2 times, most recently from 9f5dae6 to 40d271e Compare September 29, 2025 14:04

nshy requested a review from locker September 29, 2025 14:19

nshy assigned locker Sep 29, 2025

locker reviewed Sep 30, 2025

View reviewed changes

locker assigned nshy and unassigned locker Sep 30, 2025

nshy force-pushed the recovery-thread branch from 40d271e to 52272ea Compare October 7, 2025 10:13

nshy requested a review from a team as a code owner October 7, 2025 10:13

nshy force-pushed the recovery-thread branch 2 times, most recently from c8e3679 to 48230f8 Compare October 8, 2025 07:17

nshy assigned locker and unassigned nshy Oct 8, 2025

locker reviewed Oct 8, 2025

View reviewed changes

changelogs/unreleased/gh-11888-snapshot-recovery-perf-improved.md Outdated Show resolved Hide resolved

src/box/xlog.h Outdated Show resolved Hide resolved

src/box/xlog.c Outdated Show resolved Hide resolved

src/box/xlog_reader.h Outdated Show resolved Hide resolved

locker assigned nshy and unassigned locker Oct 8, 2025

nshy force-pushed the recovery-thread branch from 48230f8 to d6fb1a9 Compare October 13, 2025 09:42

nshy requested a review from sergepetrenko October 13, 2025 10:07

nshy assigned locker and unassigned nshy Oct 13, 2025

sergepetrenko self-assigned this Oct 13, 2025

locker reviewed Oct 15, 2025

View reviewed changes

locker assigned nshy and unassigned locker Oct 15, 2025

nshy force-pushed the recovery-thread branch 2 times, most recently from 1058e93 to 6aa7bcc Compare October 16, 2025 17:41

lenkis approved these changes Oct 16, 2025

View reviewed changes

nshy assigned locker and unassigned nshy Oct 17, 2025

locker approved these changes Oct 17, 2025

View reviewed changes

src/box/applier.cc Show resolved Hide resolved

src/box/xlog.h Outdated Show resolved Hide resolved

locker assigned nshy and unassigned locker Oct 17, 2025

nshy removed their assignment Oct 20, 2025

sergepetrenko approved these changes Oct 22, 2025

View reviewed changes

sergepetrenko assigned nshy and unassigned sergepetrenko Oct 22, 2025

nshy force-pushed the recovery-thread branch from 6aa7bcc to 1122774 Compare October 23, 2025 07:30

nshy added the full-ci Enables all tests for a pull request label Oct 23, 2025

nshy added 2 commits October 23, 2025 10:50

nshy force-pushed the recovery-thread branch from 1122774 to 712d6b9 Compare October 23, 2025 07:50

nshy requested a review from ligurio as a code owner October 23, 2025 07:50

nshy assigned locker and nshy and unassigned nshy and locker Oct 23, 2025

ligurio approved these changes Oct 23, 2025

View reviewed changes

nshy assigned locker and unassigned nshy Oct 23, 2025

locker merged commit 08329dc into tarantool:master Oct 23, 2025
44 checks passed

nshy deleted the recovery-thread branch October 23, 2025 10:28

nshy mentioned this pull request Oct 23, 2025

Recovery performance regression (-1.5x on SSD) #11452

Closed

sergepetrenko mentioned this pull request Jan 13, 2026

Refactor and speed up recovery #5787

Open

10 tasks

Conversation

nshy commented Sep 29, 2025 • edited by locker Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

coveralls commented Sep 29, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

nshy commented Oct 13, 2025

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

sergepetrenko left a comment

Choose a reason for hiding this comment

Uh oh!

nshy commented Oct 23, 2025

Uh oh!

ligurio left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

6 participants

nshy commented Sep 29, 2025 •

edited by locker

Loading

coveralls commented Sep 29, 2025 •

edited

Loading