Fix flaky test_overcommit_tracker/test_user_overcommit

alexey-milovidov · claude · alexey-milovidov · commit edf15b492f2a · 2026-03-24T12:16:32.000+01:00
The test is probabilistic: it relies on memory pressure to kill queries with a low `memory_overcommit_ratio_denominator` while sparing those with a high one. Under MSan (which has ~3x memory overhead), a single attempt could kill all queries, causing the assertion to fail.

Two changes:
- Reduce `numbers(2500000)` to `numbers(1000000)` to lower per-query memory usage, making it more likely that B queries survive.
- Add a retry loop (up to 5 attempts), since the test is inherently probabilistic.

CI report: https://s3.amazonaws.com/clickhouse-test-reports/json.html?PR=100404&sha=4e239671fdc3523725369f5ad6028b4a52c43b45&name_0=PR&name_1=Integration%20tests%20%28amd_msan%2C%204%2F6%29

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/tests/integration/test_overcommit_tracker/test.py b/tests/integration/test_overcommit_tracker/test.py
@@ -22,32 +22,41 @@ def start_cluster():
         cluster.shutdown()
 
 
-USER_TEST_QUERY_A = "SELECT groupArray(number) FROM numbers(2500000) SETTINGS max_memory_usage_for_user=2000000000,memory_overcommit_ratio_denominator=1"
-USER_TEST_QUERY_B = "SELECT groupArray(number) FROM numbers(2500000) SETTINGS max_memory_usage_for_user=2000000000,memory_overcommit_ratio_denominator=80000000"
+USER_TEST_QUERY_A = "SELECT groupArray(number) FROM numbers(1000000) SETTINGS max_memory_usage_for_user=2000000000,memory_overcommit_ratio_denominator=1"
+USER_TEST_QUERY_B = "SELECT groupArray(number) FROM numbers(1000000) SETTINGS max_memory_usage_for_user=2000000000,memory_overcommit_ratio_denominator=80000000"
 
 
 def test_user_overcommit():
     node.query("CREATE USER IF NOT EXISTS A")
     node.query("GRANT ALL ON *.* TO A")
 
-    responses_A = list()
-    responses_B = list()
-    for i in range(100):
-        if i % 2 == 0:
-            responses_A.append(node.get_query_request(USER_TEST_QUERY_A, user="A"))
-        else:
-            responses_B.append(node.get_query_request(USER_TEST_QUERY_B, user="A"))
-
-    overcommited_killed = False
-    for response in responses_A:
-        _, err = response.get_answer_and_error()
-        if "MEMORY_LIMIT_EXCEEDED" in err:
-            overcommited_killed = True
+    # The test is probabilistic: it relies on memory pressure to kill queries
+    # with a low memory_overcommit_ratio_denominator (A) while sparing those
+    # with a high one (B). Under sanitizers with higher memory overhead, one
+    # attempt may leave no B query alive, so we make up to 5 attempts.
     finished = False
-    for response in responses_B:
-        _, err = response.get_answer_and_error()
-        if err == "":
-            finished = True
+    for attempt in range(5):
+        responses_A = list()
+        responses_B = list()
+        for i in range(100):
+            if i % 2 == 0:
+                responses_A.append(
+                    node.get_query_request(USER_TEST_QUERY_A, user="A")
+                )
+            else:
+                responses_B.append(
+                    node.get_query_request(USER_TEST_QUERY_B, user="A")
+                )
+
+        for response in responses_A:
+            response.get_answer_and_error()
+        for response in responses_B:
+            _, err = response.get_answer_and_error()
+            if err == "":
+                finished = True
+
+        if finished:
+            break
 
     assert finished, "all tasks are killed"