[CI] Allow 1% tolerance on vllm_bench total_input_tokens check

Samuel Shen · Samuel Shen · commit a9f407e09b71 · 2026-04-14T18:36:31.000-04:00
vLLM's RandomDataset decodes and re-encodes generated token sequences
(vllm/benchmarks/datasets.py) to avoid string-level drift, but the
roundtrip is not guaranteed to preserve exact token counts — the
benchmark itself only warns when token_mismatch != 0. The strict -eq
assertion against NUM_PROMPTS * RANDOM_INPUT_LEN was failing with a
0.08% overage (500400 vs 500000) on Qwen3-14B after a vLLM upgrade.

Switch to a ±1% tolerance check, which matches the benchmark's own
semantics while still catching real workload-size regressions.

Signed-off-by: Samuel Shen <slshen@uchciago.edu>
diff --git a/.buildkite/k3_tests/multiprocess/scripts/run-vllm-bench.sh b/.buildkite/k3_tests/multiprocess/scripts/run-vllm-bench.sh
@@ -121,19 +121,26 @@ verify_results() {
 
     echo "=== Verification ==="
 
-    if [ "$lmcache_total_input_tokens" -eq "$EXPECTED_TOTAL_INPUT_TOKENS" ] 2>/dev/null; then
-        echo "LMCache total_input_tokens: $lmcache_total_input_tokens (expected: $EXPECTED_TOTAL_INPUT_TOKENS) PASS"
-    else
-        echo "LMCache total_input_tokens: $lmcache_total_input_tokens (expected: $EXPECTED_TOTAL_INPUT_TOKENS) FAIL"
-        failed=1
-    fi
-
-    if [ "$baseline_total_input_tokens" -eq "$EXPECTED_TOTAL_INPUT_TOKENS" ] 2>/dev/null; then
-        echo "Baseline total_input_tokens: $baseline_total_input_tokens (expected: $EXPECTED_TOTAL_INPUT_TOKENS) PASS"
-    else
-        echo "Baseline total_input_tokens: $baseline_total_input_tokens (expected: $EXPECTED_TOTAL_INPUT_TOKENS) FAIL"
-        failed=1
-    fi
+    # vLLM's random dataset decodes and re-encodes token sequences, which can
+    # drift slightly from the requested length (see RandomDataset in
+    # vllm/benchmarks/datasets.py). Allow 1% tolerance.
+    local token_tolerance=$((EXPECTED_TOTAL_INPUT_TOKENS / 100))
+
+    check_input_tokens() {
+        # $1 = label, $2 = reported token count; FAIL (failed=1) unless within ±token_tolerance.
+        local label="$1" actual="$2" verdict="FAIL" diff
+        # Digits only: $(( )) evaluates other text ("null", "1.5") as an expression and can abort.
+        case "$actual" in
+            ''|*[!0-9]*) ;;
+            *)  diff=$((actual - EXPECTED_TOTAL_INPUT_TOKENS))
+                if [ "${diff#-}" -le "$token_tolerance" ] 2>/dev/null; then verdict="PASS"; fi ;;
+        esac
+        echo "$label total_input_tokens: $actual (expected: $EXPECTED_TOTAL_INPUT_TOKENS ±$token_tolerance) $verdict"
+        [ "$verdict" = "PASS" ] || failed=1
+    }
+
+    check_input_tokens "LMCache" "$lmcache_total_input_tokens"
+    check_input_tokens "Baseline" "$baseline_total_input_tokens"
 
     if [ "$lmcache_completed" -eq "$EXPECTED_COMPLETED" ] 2>/dev/null; then
         echo "LMCache completed: $lmcache_completed (expected: $EXPECTED_COMPLETED) PASS"
diff --git a/.buildkite/scripts/multiprocessing-test/run-vllm-bench.sh b/.buildkite/scripts/multiprocessing-test/run-vllm-bench.sh
@@ -137,21 +137,26 @@ verify_results() {
     
     echo "=== Verification ==="
     
-    # Check total_input_tokens for LMCache
-    if [ "$lmcache_total_input_tokens" -eq "$EXPECTED_TOTAL_INPUT_TOKENS" ] 2>/dev/null; then
-        echo "✅ LMCache total_input_tokens: $lmcache_total_input_tokens (expected: $EXPECTED_TOTAL_INPUT_TOKENS)"
-    else
-        echo "❌ LMCache total_input_tokens: $lmcache_total_input_tokens (expected: $EXPECTED_TOTAL_INPUT_TOKENS)"
-        failed=1
-    fi
-    
-    # Check total_input_tokens for baseline
-    if [ "$baseline_total_input_tokens" -eq "$EXPECTED_TOTAL_INPUT_TOKENS" ] 2>/dev/null; then
-        echo "✅ Baseline total_input_tokens: $baseline_total_input_tokens (expected: $EXPECTED_TOTAL_INPUT_TOKENS)"
-    else
-        echo "❌ Baseline total_input_tokens: $baseline_total_input_tokens (expected: $EXPECTED_TOTAL_INPUT_TOKENS)"
-        failed=1
-    fi
+    # vLLM's random dataset decodes and re-encodes token sequences, which can
+    # drift slightly from the requested length (see RandomDataset in
+    # vllm/benchmarks/datasets.py). Allow 1% tolerance.
+    local token_tolerance=$((EXPECTED_TOTAL_INPUT_TOKENS / 100))
+
+    check_input_tokens() {
+        # $1 = label, $2 = reported token count; ❌ (failed=1) unless within ±token_tolerance.
+        local label="$1" actual="$2" mark="❌" diff
+        # Digits only: $(( )) evaluates other text ("null", "1.5") as an expression and can abort.
+        case "$actual" in
+            ''|*[!0-9]*) ;;
+            *)  diff=$((actual - EXPECTED_TOTAL_INPUT_TOKENS))
+                if [ "${diff#-}" -le "$token_tolerance" ] 2>/dev/null; then mark="✅"; fi ;;
+        esac
+        echo "$mark $label total_input_tokens: $actual (expected: $EXPECTED_TOTAL_INPUT_TOKENS ±$token_tolerance)"
+        [ "$mark" = "✅" ] || failed=1
+    }
+
+    check_input_tokens "LMCache" "$lmcache_total_input_tokens"
+    check_input_tokens "Baseline" "$baseline_total_input_tokens"
     
     # Check completed for LMCache
     if [ "$lmcache_completed" -eq "$EXPECTED_COMPLETED" ] 2>/dev/null; then