Skip to content

Commit a9f407e

Browse files
author
Samuel Shen
committed
[CI] Allow 1% tolerance on vllm_bench total_input_tokens check
vLLM's RandomDataset decodes and re-encodes generated token sequences (vllm/benchmarks/datasets.py) to avoid string-level drift, but the roundtrip is not guaranteed to preserve exact token counts — the benchmark itself only warns when token_mismatch != 0. The strict -eq assertion against NUM_PROMPTS * RANDOM_INPUT_LEN was failing with a 0.08% overage (500400 vs 500000) on Qwen3-14B after a vLLM upgrade. Switch to a ±1% tolerance check, which matches the benchmark's own semantics while still catching real workload-size regressions. Signed-off-by: Samuel Shen <slshen@uchicago.edu>
1 parent cfb5c52 commit a9f407e

2 files changed

Lines changed: 40 additions & 28 deletions

File tree

.buildkite/k3_tests/multiprocess/scripts/run-vllm-bench.sh

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -121,19 +121,26 @@ verify_results() {
121121

122122
echo "=== Verification ==="
123123

124-
if [ "$lmcache_total_input_tokens" -eq "$EXPECTED_TOTAL_INPUT_TOKENS" ] 2>/dev/null; then
125-
echo "LMCache total_input_tokens: $lmcache_total_input_tokens (expected: $EXPECTED_TOTAL_INPUT_TOKENS) PASS"
126-
else
127-
echo "LMCache total_input_tokens: $lmcache_total_input_tokens (expected: $EXPECTED_TOTAL_INPUT_TOKENS) FAIL"
128-
failed=1
129-
fi
130-
131-
if [ "$baseline_total_input_tokens" -eq "$EXPECTED_TOTAL_INPUT_TOKENS" ] 2>/dev/null; then
132-
echo "Baseline total_input_tokens: $baseline_total_input_tokens (expected: $EXPECTED_TOTAL_INPUT_TOKENS) PASS"
133-
else
134-
echo "Baseline total_input_tokens: $baseline_total_input_tokens (expected: $EXPECTED_TOTAL_INPUT_TOKENS) FAIL"
135-
failed=1
136-
fi
124+
# vLLM's random dataset decodes and re-encodes token sequences, which can
125+
# drift slightly from the requested length (see RandomDataset in
126+
# vllm/benchmarks/datasets.py). Allow 1% tolerance.
127+
local token_tolerance=$((EXPECTED_TOTAL_INPUT_TOKENS / 100))
128+
129+
# Check one run's total_input_tokens against the expected value, allowing
# an absolute deviation of up to token_tolerance in either direction.
#
# Globals:
#   EXPECTED_TOTAL_INPUT_TOKENS (read)    - expected token count
#   token_tolerance             (read)    - largest accepted |actual - expected|
#   failed                      (written) - set to 1 when the check fails
# Arguments:
#   $1 - label printed at the start of the report line (e.g. "LMCache")
#   $2 - total_input_tokens value to check
# Outputs:
#   One report line on stdout ending in PASS or FAIL.
check_input_tokens() {
  local label="$1"
  local actual="$2"
  local diff abs_diff
  local verdict="FAIL"

  case "$actual" in
    '' | *[!0-9]*)
      # Not a plain non-negative integer (empty, "N/A", "1.5", ...).
      # Feeding such a value to $(( )) raises an arithmetic error that
      # abandons the enclosing command before failed=1 is recorded, so
      # report it as a failed check without doing any arithmetic.
      ;;
    *)
      diff=$((actual - EXPECTED_TOTAL_INPUT_TOKENS))
      # Dropping a leading "-" yields the absolute value.
      abs_diff=${diff#-}
      if [ "$abs_diff" -le "$token_tolerance" ] 2>/dev/null; then
        verdict="PASS"
      fi
      ;;
  esac

  echo "$label total_input_tokens: $actual (expected: $EXPECTED_TOTAL_INPUT_TOKENS ±$token_tolerance) $verdict"
  if [ "$verdict" = "FAIL" ]; then
    failed=1
  fi
}
141+
142+
check_input_tokens "LMCache" "$lmcache_total_input_tokens"
143+
check_input_tokens "Baseline" "$baseline_total_input_tokens"
137144

138145
if [ "$lmcache_completed" -eq "$EXPECTED_COMPLETED" ] 2>/dev/null; then
139146
echo "LMCache completed: $lmcache_completed (expected: $EXPECTED_COMPLETED) PASS"

.buildkite/scripts/multiprocessing-test/run-vllm-bench.sh

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -137,21 +137,26 @@ verify_results() {
137137

138138
echo "=== Verification ==="
139139

140-
# Check total_input_tokens for LMCache
141-
if [ "$lmcache_total_input_tokens" -eq "$EXPECTED_TOTAL_INPUT_TOKENS" ] 2>/dev/null; then
142-
echo "✅ LMCache total_input_tokens: $lmcache_total_input_tokens (expected: $EXPECTED_TOTAL_INPUT_TOKENS)"
143-
else
144-
echo "❌ LMCache total_input_tokens: $lmcache_total_input_tokens (expected: $EXPECTED_TOTAL_INPUT_TOKENS)"
145-
failed=1
146-
fi
147-
148-
# Check total_input_tokens for baseline
149-
if [ "$baseline_total_input_tokens" -eq "$EXPECTED_TOTAL_INPUT_TOKENS" ] 2>/dev/null; then
150-
echo "✅ Baseline total_input_tokens: $baseline_total_input_tokens (expected: $EXPECTED_TOTAL_INPUT_TOKENS)"
151-
else
152-
echo "❌ Baseline total_input_tokens: $baseline_total_input_tokens (expected: $EXPECTED_TOTAL_INPUT_TOKENS)"
153-
failed=1
154-
fi
140+
# vLLM's random dataset decodes and re-encodes token sequences, which can
141+
# drift slightly from the requested length (see RandomDataset in
142+
# vllm/benchmarks/datasets.py). Allow 1% tolerance.
143+
local token_tolerance=$((EXPECTED_TOTAL_INPUT_TOKENS / 100))
144+
145+
# Check one run's total_input_tokens against the expected value, allowing
# an absolute deviation of up to token_tolerance in either direction.
#
# Globals:
#   EXPECTED_TOTAL_INPUT_TOKENS (read)    - expected token count
#   token_tolerance             (read)    - largest accepted |actual - expected|
#   failed                      (written) - set to 1 when the check fails
# Arguments:
#   $1 - label printed in the report line (e.g. "LMCache")
#   $2 - total_input_tokens value to check
# Outputs:
#   One report line on stdout, prefixed with ✅ on success or ❌ on failure.
check_input_tokens() {
  local label="$1"
  local actual="$2"
  local diff abs_diff
  # Start from the failure marker; only a successful comparison flips it.
  # Without distinct markers the passing and failing lines are identical
  # and the log does not show which check tripped. These are the same
  # markers the exact-match checks for this value printed before.
  local marker="❌"

  case "$actual" in
    '' | *[!0-9]*)
      # Not a plain non-negative integer (empty, "N/A", "1.5", ...).
      # Feeding such a value to $(( )) raises an arithmetic error that
      # abandons the enclosing command before failed=1 is recorded, so
      # report it as a failed check without doing any arithmetic.
      ;;
    *)
      diff=$((actual - EXPECTED_TOTAL_INPUT_TOKENS))
      # Dropping a leading "-" yields the absolute value.
      abs_diff=${diff#-}
      if [ "$abs_diff" -le "$token_tolerance" ] 2>/dev/null; then
        marker="✅"
      fi
      ;;
  esac

  echo "$marker $label total_input_tokens: $actual (expected: $EXPECTED_TOTAL_INPUT_TOKENS ±$token_tolerance)"
  if [ "$marker" = "❌" ]; then
    failed=1
  fi
}
157+
158+
check_input_tokens "LMCache" "$lmcache_total_input_tokens"
159+
check_input_tokens "Baseline" "$baseline_total_input_tokens"
155160

156161
# Check completed for LMCache
157162
if [ "$lmcache_completed" -eq "$EXPECTED_COMPLETED" ] 2>/dev/null; then

0 commit comments

Comments
 (0)