Skip to content

Commit a544a18

Browse files
committed
[MOD-12069] Add *_pending_jobs metrics (#7556)
* align info/* to active_coord; * add APIs to get queue lengths; * add to info; * fix; * test; * fix test; * catch general error; * rename; * fix moduleArgs; * rename; * rename test_active_worker_threads; * rename to workers (cherry picked from commit ea0476a)
1 parent a8ecd32 commit a544a18

9 files changed

Lines changed: 246 additions & 26 deletions

File tree

deps/thpool/thpool.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,14 @@ size_t redisearch_thpool_get_num_threads(redisearch_thpool_t *thpool_p) {
539539
return thpool_p->n_threads;
540540
}
541541

542+
/* Number of jobs currently waiting in the high priority queue.
 * Lock-free: the queue length is read atomically, so callers get a
 * monitoring snapshot without taking the thread-pool lock. Relaxed
 * ordering is sufficient for a statistics read. */
size_t redisearch_thpool_high_priority_pending_jobs(redisearch_thpool_t *thpool_p) {
  return __atomic_load_n(&thpool_p->jobqueues.high_priority_jobqueue.len, __ATOMIC_RELAXED);
}
545+
546+
/* Number of jobs currently waiting in the low priority queue.
 * Same lock-free, relaxed atomic read as the high priority counterpart —
 * intended for cheap monitoring, not synchronization. */
size_t redisearch_thpool_low_priority_pending_jobs(redisearch_thpool_t *thpool_p) {
  return __atomic_load_n(&thpool_p->jobqueues.low_priority_jobqueue.len, __ATOMIC_RELAXED);
}
549+
542550
thpool_stats redisearch_thpool_get_stats(redisearch_thpool_t *thpool_p) {
543551
/* Locking must be done in the following order to prevent deadlocks. */
544552
redisearch_thpool_lock(thpool_p);

deps/thpool/thpool.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,10 @@ thpool_stats redisearch_thpool_get_stats(redisearch_thpool_t *);
275275

276276
size_t redisearch_thpool_get_num_threads(redisearch_thpool_t *);
277277

278+
size_t redisearch_thpool_high_priority_pending_jobs(redisearch_thpool_t *);
279+
280+
size_t redisearch_thpool_low_priority_pending_jobs(redisearch_thpool_t *);
281+
278282
/**
279283
* @brief Schedule a job to reduce the number of threads in the threadpool in an asynchronous manner.
280284
*

src/info/global_stats.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,14 @@ void GlobalStats_UpdateActiveIoThreads(int toAdd) {
141141
MultiThreadingStats GlobalStats_GetMultiThreadingStats() {
142142
MultiThreadingStats stats;
143143
stats.active_io_threads = READ(RSGlobalStats.totalStats.multi_threading.active_io_threads);
144+
145+
// Workers stats
146+
// We don't use workersThreadPool_getStats here to avoid the overhead of locking the thread pool.
144147
stats.active_worker_threads = workersThreadPool_WorkingThreadCount();
148+
stats.workers_low_priority_pending_jobs = workersThreadPool_LowPriorityPendingJobsCount();
149+
stats.workers_high_priority_pending_jobs = workersThreadPool_HighPriorityPendingJobsCount();
150+
151+
// Coordinator stats
145152
stats.active_coord_threads = ConcurrentSearchPool_WorkingThreadCount();
146153
return stats;
147154
}

src/info/global_stats.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ typedef struct {
6565
size_t active_io_threads; // number of I/O thread callbacks currently executing
6666
size_t active_worker_threads; // number of worker threads currently executing jobs
6767
size_t active_coord_threads; // number of coordinator threads currently executing jobs
68+
size_t workers_low_priority_pending_jobs; // number of low priority jobs waiting to be executed (currently only vecsim background indexing)
69+
size_t workers_high_priority_pending_jobs; // number of high priority jobs waiting to be executed (currently only queries)
6870
} MultiThreadingStats;
6971

7072
typedef struct {

src/info/info_redis/info_redis.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,8 @@ void AddToInfo_MultiThreading(RedisModuleInfoCtx *ctx, TotalIndexesInfo *total_i
269269
RedisModule_InfoAddFieldULongLong(ctx, "active_io_threads", stats.active_io_threads);
270270
RedisModule_InfoAddFieldULongLong(ctx, "active_worker_threads", stats.active_worker_threads);
271271
RedisModule_InfoAddFieldULongLong(ctx, "active_coord_threads", stats.active_coord_threads);
272+
RedisModule_InfoAddFieldULongLong(ctx, "workers_low_priority_pending_jobs", stats.workers_low_priority_pending_jobs);
273+
RedisModule_InfoAddFieldULongLong(ctx, "workers_high_priority_pending_jobs", stats.workers_high_priority_pending_jobs);
272274
}
273275

274276
void AddToInfo_Dialects(RedisModuleInfoCtx *ctx) {

src/util/workers.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,18 @@ size_t workersThreadPool_WorkingThreadCount(void) {
111111
return redisearch_thpool_num_jobs_in_progress(_workers_thpool);
112112
}
113113

114+
// Return the number of low priority jobs waiting to be executed.
// Asserts that the workers thread pool has been initialized.
size_t workersThreadPool_LowPriorityPendingJobsCount(void) {
  RS_ASSERT(_workers_thpool != NULL);
  return redisearch_thpool_low_priority_pending_jobs(_workers_thpool);
}
119+
120+
// Return the number of high priority jobs waiting to be executed.
// Asserts that the workers thread pool has been initialized.
size_t workersThreadPool_HighPriorityPendingJobsCount(void) {
  RS_ASSERT(_workers_thpool != NULL);
  return redisearch_thpool_high_priority_pending_jobs(_workers_thpool);
}
125+
114126
// return n_threads value.
115127
size_t workersThreadPool_NumThreads(void) {
116128
RS_ASSERT(_workers_thpool);

src/util/workers.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,12 @@ void workersThreadPool_SetNumWorkers(void);
2525
// return number of currently working threads
2626
size_t workersThreadPool_WorkingThreadCount(void);
2727

28+
// Return the number of low priority jobs waiting to be executed.
29+
size_t workersThreadPool_LowPriorityPendingJobsCount(void);
30+
31+
// Return the number of high priority jobs waiting to be executed.
32+
size_t workersThreadPool_HighPriorityPendingJobsCount(void);
33+
2834
// return n_threads value.
2935
size_t workersThreadPool_NumThreads(void);
3036

tests/pytests/common.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,33 @@ def __exit__(self, exc_type, exc_value, traceback):
5252
def handler(self, signum, frame):
5353
raise Exception(f'Timeout: {self.message}')
5454

55+
def wait_for_condition(check_fn, message, timeout=120):
    """
    Poll `check_fn` until it reports success, raising with context on failure.

    Parameters:
    - check_fn: zero-argument callable returning (done: bool, state: dict),
      where `state` is a dict describing the current state (included in the
      failure message for diagnosability)
    - message: message prefix for the exception raised on timeout/error
    - timeout: maximum time to wait, in seconds (default: 120)

    Raises:
    - Exception wrapping the underlying error (timeout or a failure inside
      `check_fn`), annotated with the iteration count and last observed state.
    """
    # `iterations` instead of `iter` — avoid shadowing the builtin.
    iterations = 0
    timeout_msg = {}

    try:
        with TimeLimit(timeout):
            while True:
                done, state = check_fn()
                if done:
                    break
                time.sleep(0.01)
                iterations += 1
                # Record progress so a timeout failure reports what we last saw.
                timeout_msg['iter'] = iterations
                timeout_msg['state'] = state
    except Exception as e:
        # Intentionally broad: wrap any error (TimeLimit timeout or a failure
        # raised by check_fn itself) with the collected progress info.
        log = f"{message}: {timeout_msg}"
        raise Exception(f'Error: {e}, log: {log}')
81+
5582
class DialectEnv(Env):
5683
def __init__(self, *args, **kwargs):
5784
super().__init__(*args, **kwargs)

tests/pytests/test_info_modules.py

Lines changed: 178 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,10 @@
33
import redis
44
from inspect import currentframe
55
import numpy as np
6-
6+
from vecsim_utils import (
7+
DEFAULT_FIELD_NAME,
8+
set_up_database_with_vectors,
9+
)
710

811
def info_modules_to_dict(conn):
912
res = conn.execute_command('INFO MODULES')
@@ -881,6 +884,17 @@ def test_errors_and_warnings_init(env):
881884
for field in info_dict[metric]:
882885
env.assertEqual(info_dict[metric][field], '0')
883886

887+
########
888+
# Multi Threaded Stats tests
889+
########
890+
891+
MULTI_THREADING_SECTION = f'{SEARCH_PREFIX}multi_threading'
892+
ACTIVE_IO_THREADS_METRIC = f'{SEARCH_PREFIX}active_io_threads'
893+
ACTIVE_WORKER_THREADS_METRIC = f'{SEARCH_PREFIX}active_worker_threads'
894+
ACTIVE_COORD_THREADS_METRIC = f'{SEARCH_PREFIX}active_coord_threads'
895+
WORKERS_LOW_PRIORITY_PENDING_JOBS_METRIC = f'{SEARCH_PREFIX}workers_low_priority_pending_jobs'
896+
WORKERS_HIGH_PRIORITY_PENDING_JOBS_METRIC = f'{SEARCH_PREFIX}workers_high_priority_pending_jobs'
897+
884898
def test_active_io_threads_stats(env):
885899
conn = getConnectionByEnv(env)
886900
# Setup: Create index with some data
@@ -892,17 +906,16 @@ def test_active_io_threads_stats(env):
892906
info_dict = info_modules_to_dict(env)
893907

894908
# Verify multi_threading section exists
895-
multi_threading_section = f'{SEARCH_PREFIX}multi_threading'
896-
env.assertTrue(multi_threading_section in info_dict,
909+
env.assertTrue(MULTI_THREADING_SECTION in info_dict,
897910
message="multi_threading section should exist in INFO MODULES")
898911

899912
# Verify all expected fields exist
900-
env.assertTrue(f'{SEARCH_PREFIX}active_io_threads' in info_dict[multi_threading_section],
901-
message="active_io_threads field should exist in multi_threading section")
913+
env.assertTrue(ACTIVE_IO_THREADS_METRIC in info_dict[MULTI_THREADING_SECTION],
914+
message=f"{ACTIVE_IO_THREADS_METRIC} field should exist in multi_threading section")
902915

903916
# Verify all fields initialized to 0.
904-
env.assertEqual(info_dict[multi_threading_section][f'{SEARCH_PREFIX}active_io_threads'], '0',
905-
message="active_io_threads should be 0 when idle")
917+
env.assertEqual(info_dict[MULTI_THREADING_SECTION][ACTIVE_IO_THREADS_METRIC], '0',
918+
message=f"{ACTIVE_IO_THREADS_METRIC} should be 0 when idle")
906919
# There's no deterministic way to test active_io_threads increases while a query is running,
907920
# we test it in unit tests.
908921

@@ -929,13 +942,12 @@ def _test_active_worker_threads(env, num_queries):
929942
conn.execute_command('HSET', f'doc{i}', 'n', i)
930943

931944
# Verify active_worker_threads and coord threads start at 0
932-
multi_threading_section = f'{SEARCH_PREFIX}multi_threading'
933945
for i, con in enumerate(env.getOSSMasterNodesConnectionList()):
934946
info_dict = info_modules_to_dict(con)
935-
env.assertEqual(info_dict[multi_threading_section][f'{SEARCH_PREFIX}active_worker_threads'], '0',
936-
message=f"shard {i}: active_worker_threads should be 0 when idle")
937-
env.assertEqual(info_dict[multi_threading_section][f'{SEARCH_PREFIX}active_coord_threads'], '0',
938-
message=f"shard {i}: active_coord_threads should be 0 when idle")
947+
env.assertEqual(info_dict[MULTI_THREADING_SECTION][ACTIVE_WORKER_THREADS_METRIC], '0',
948+
message=f"shard {i}: {ACTIVE_WORKER_THREADS_METRIC} should be 0 when idle")
949+
env.assertEqual(info_dict[MULTI_THREADING_SECTION][ACTIVE_COORD_THREADS_METRIC], '0',
950+
message=f"shard {i}: {ACTIVE_COORD_THREADS_METRIC} should be 0 when idle")
939951

940952
# Define callback for testing a specific query type
941953
def _test_query_type(query_type):
@@ -964,14 +976,14 @@ def _test_query_type(query_type):
964976
# Verify active_worker_threads == num_queries
965977
for i, con in enumerate(env.getOSSMasterNodesConnectionList()):
966978
info_dict = info_modules_to_dict(con)
967-
env.assertEqual(info_dict[multi_threading_section][f'{SEARCH_PREFIX}active_worker_threads'], str(num_queries),
968-
message=f"shard {i}: {query_type}: active_worker_threads should be {num_queries} when {num_queries} queries are paused")
979+
env.assertEqual(info_dict[MULTI_THREADING_SECTION][ACTIVE_WORKER_THREADS_METRIC], str(num_queries),
980+
message=f"shard {i}: {query_type}: {ACTIVE_WORKER_THREADS_METRIC} should be {num_queries} when {num_queries} queries are paused")
969981

970982
# If this is cluster, and FT.AGGREGATE, verify active_coord_threads == num_queries
971983
if env.isCluster() and query_type == 'FT.AGGREGATE':
972984
info_dict = info_modules_to_dict(env)
973-
env.assertEqual(info_dict[multi_threading_section][f'{SEARCH_PREFIX}active_coord_threads'], str(num_queries),
974-
message=f"coordinator: {query_type}: active_coord_threads should be {num_queries} when {num_queries} queries are paused")
985+
env.assertEqual(info_dict[MULTI_THREADING_SECTION][ACTIVE_COORD_THREADS_METRIC], str(num_queries),
986+
message=f"coordinator: {query_type}: {ACTIVE_COORD_THREADS_METRIC} should be {num_queries} when {num_queries} queries are paused")
975987

976988
# Resume all queries
977989
allShards_setPauseRPResume(env)
@@ -986,21 +998,161 @@ def _test_query_type(query_type):
986998
# Verify active_worker_threads returns to 0
987999
for i, con in enumerate(env.getOSSMasterNodesConnectionList()):
9881000
info_dict = info_modules_to_dict(con)
989-
env.assertEqual(info_dict[multi_threading_section][f'{SEARCH_PREFIX}active_worker_threads'], '0',
990-
message=f"shard {i}: {query_type}: active_worker_threads should return to 0 after queries complete")
1001+
env.assertEqual(info_dict[MULTI_THREADING_SECTION][ACTIVE_WORKER_THREADS_METRIC], '0',
1002+
message=f"shard {i}: {query_type}: {ACTIVE_WORKER_THREADS_METRIC} should return to 0 after queries complete")
9911003

9921004
# Test both query types
9931005
_test_query_type('FT.SEARCH')
9941006
_test_query_type('FT.AGGREGATE')
9951007

996-
# --- Test 1: Standalone Mode ---
997-
@skip(cluster=True) # Only run in standalone mode
998-
def test_active_worker_threads_SA(env):
1008+
def test_active_worker_threads(env):
    # A single paused query is enough to observe the active-thread counters.
    _test_active_worker_threads(env, num_queries=1)
10011011

1002-
# --- Test 2: Cluster Mode ---
1003-
@skip(cluster=False) # Only run in cluster mode
1004-
def test_active_worker_threads_cluster(env):
1005-
num_queries = 1
1006-
_test_active_worker_threads(env, num_queries)
1012+
def _test_pending_jobs_metrics(env, command_type):
    """
    Exercise the workers_{low,high}_priority_pending_jobs INFO metrics.

    Parameters:
    - env: Test environment (works for both SA and cluster)
    - command_type: which FT command to queue ('SEARCH' or 'AGGREGATE')
    """

    # --- STEP 1: SETUP ---
    # Enable the workers thread pool (2 workers is enough for this test).
    run_command_on_all_shards(env, config_cmd(), 'SET', 'WORKERS', '2')

    num_vectors = 10 * env.shardsCount  # vectors to index -> low priority jobs
    num_queries = 3                     # queries to run -> high priority jobs
    dim = 4
    vector_field = DEFAULT_FIELD_NAME
    index_name = 'idx'

    def read_pending(shard, metric):
        # Read a pending-jobs metric from a single shard as an int.
        return int(info_modules_to_dict(shard)[MULTI_THREADING_SECTION][metric])

    # --- STEP 2: VERIFY INITIAL STATE (metrics = 0) ---
    for conn in env.getOSSMasterNodesConnectionList():
        info_dict = info_modules_to_dict(conn)
        env.assertEqual(info_dict[MULTI_THREADING_SECTION][WORKERS_LOW_PRIORITY_PENDING_JOBS_METRIC], '0')
        env.assertEqual(info_dict[MULTI_THREADING_SECTION][WORKERS_HIGH_PRIORITY_PENDING_JOBS_METRIC], '0')

    # --- STEP 3: PAUSE WORKERS THREAD POOL ---
    # Paused workers cannot pop jobs, so queued jobs stay visible as pending.
    run_command_on_all_shards(env, debug_cmd(), 'WORKERS', 'PAUSE')

    # --- STEP 4: CREATE INDEX AND INDEX VECTORS (creates workers_low_priority_pending_jobs) ---
    # HNSW background indexing queues one low-priority job per vector.
    set_up_database_with_vectors(env, dim, num_vectors, index_name=index_name,
                                 field_name=vector_field, datatype='FLOAT32',
                                 metric='L2', alg='HNSW')

    def check_indexing_jobs_pending():
        n_shards = env.shardsCount
        ready = [False] * n_shards
        state = {
            'indexing_jobs_pending': [0] * n_shards,
            'expected_indexing_jobs': [0] * n_shards,
        }
        for i, shard in enumerate(env.getOSSMasterNodesConnectionList()):
            # Expect one pending low-priority job per vector on this shard.
            expected = shard.execute_command('DBSIZE')
            pending = read_pending(shard, WORKERS_LOW_PRIORITY_PENDING_JOBS_METRIC)
            ready[i] = (expected == pending)
            state['expected_indexing_jobs'][i] = expected
            state['indexing_jobs_pending'][i] = pending
        return all(ready), state

    wait_for_condition(check_indexing_jobs_pending, "wait_for_workers_low_priority_jobs_pending")

    # --- STEP 5: EXECUTE QUERIES (creates high_priority_pending_jobs) ---
    # Launch queries in background threads; they queue as high-priority jobs
    # but cannot execute while the workers are paused.
    query_threads = []
    query_results = []

    def run_query(query_id):
        conn = getConnectionByEnv(env)
        try:
            result = conn.execute_command(f'FT.{command_type}', index_name, '*')
            query_results.append((query_id, 'success', result))
        except Exception as e:
            query_results.append((query_id, 'error', e))

    for qid in range(num_queries):
        worker = threading.Thread(target=run_query, args=(qid,))
        query_threads.append(worker)
        worker.start()

    # Give threads a moment to start and attempt to queue their queries.
    time.sleep(0.1)

    # Fail fast if any query errored before being queued.
    for query_id, status, result in query_results:
        if status == 'error':
            env.assertTrue(False, message=f"Query {query_id} failed immediately: {result}")

    # --- STEP 6: WAIT FOR THREADPOOL STATS TO UPDATE (jobs queued) ---
    def check_queries_jobs_pending():
        n_shards = env.shardsCount
        ready = [False] * n_shards
        state = {
            'queries_jobs_pending': [0] * n_shards,
            'expected_queries_jobs': [num_queries] * n_shards,
        }
        for i, shard in enumerate(env.getOSSMasterNodesConnectionList()):
            pending = read_pending(shard, WORKERS_HIGH_PRIORITY_PENDING_JOBS_METRIC)
            ready[i] = (num_queries == pending)
            state['queries_jobs_pending'][i] = pending
            state['expected_queries_jobs'][i] = num_queries
        return all(ready), state

    wait_for_condition(check_queries_jobs_pending, "wait_for_high_priority_jobs_pending")

    # --- STEP 7: RESUME WORKERS AND DRAIN ---
    run_command_on_all_shards(env, debug_cmd(), 'WORKERS', 'RESUME')

    # Wait for all query threads to complete.
    for worker in query_threads:
        worker.join(timeout=30)

    # Drain the worker thread pool to ensure all jobs complete.
    run_command_on_all_shards(env, debug_cmd(), 'WORKERS', 'DRAIN')

    # --- STEP 8: VERIFY METRICS RETURN TO 0 ---
    def check_reset_metrics():
        n_shards = env.shardsCount
        ready = [False] * n_shards
        state = {
            'workers_low_priority_jobs_pending': [-1] * n_shards,
            'workers_high_priority_jobs_pending': [-1] * n_shards,
        }
        for i, shard in enumerate(env.getOSSMasterNodesConnectionList()):
            high_pending = read_pending(shard, WORKERS_HIGH_PRIORITY_PENDING_JOBS_METRIC)
            low_pending = read_pending(shard, WORKERS_LOW_PRIORITY_PENDING_JOBS_METRIC)
            ready[i] = (high_pending == 0 and low_pending == 0)
            state['workers_low_priority_jobs_pending'][i] = low_pending
            state['workers_high_priority_jobs_pending'][i] = high_pending
        return all(ready), state

    wait_for_condition(check_reset_metrics, "wait_for_workers_pending_jobs_metric_reset")
1151+
1152+
def test_pending_jobs_metrics_search():
    # Pending-jobs metrics driven by FT.SEARCH queries.
    env = Env(moduleArgs='DEFAULT_DIALECT 2')
    _test_pending_jobs_metrics(env, 'SEARCH')
1155+
1156+
def test_pending_jobs_metrics_aggregate():
    # Pending-jobs metrics driven by FT.AGGREGATE queries.
    env = Env(moduleArgs='DEFAULT_DIALECT 2')
    _test_pending_jobs_metrics(env, 'AGGREGATE')

0 commit comments

Comments
 (0)