Skip to content

Commit 5c72a69

Browse files
committed
rebase again
Signed-off-by: Samuel Shen <slshen@tensormesh.ai>
2 parents e700def + e2cc19b commit 5c72a69

49 files changed

Lines changed: 6339 additions & 447 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.buildkite/k3_tests/multiprocess/scripts/launch-processes.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ VLLM_SERVER_DEV_MODE=1 \
7676
VLLM_BATCH_INVARIANT=1 \
7777
PYTHONHASHSEED=0 \
7878
vllm serve "$MODEL" \
79-
--kv-transfer-config "{\"kv_connector\":\"LMCacheMPConnector\", \"kv_role\":\"kv_both\", \"kv_connector_extra_config\": {\"lmcache.mp.port\": $LMCACHE_PORT}}" \
79+
--kv-transfer-config "{\"kv_connector\":\"LMCacheMPConnector\", \"kv_role\":\"kv_both\", \"kv_load_failure_policy\": \"recompute\", \"kv_connector_extra_config\": {\"lmcache.mp.port\": $LMCACHE_PORT, \"lmcache.mp.mq_timeout\": 10}}" \
8080
--attention-backend FLASH_ATTN \
8181
--port "$vllm_port" \
8282
--no-async-scheduling \
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
#!/usr/bin/env bash
# Test LMCache fault tolerance: verify vLLM requests complete after
# the LMCache MP server is killed mid-flight.
#
# Flow:
#   1. Run a warmup bench (measures baseline timing)
#   2. Run bench again, killing LMCache server mid-flight
#   3. Run bench fully without LMCache server
#   4. Verify all prompts completed in every phase
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)"

source "${REPO_ROOT}/.buildkite/k3_tests/common_scripts/helpers.sh"

# Configuration (inherited from run-mp-test.sh); every value may be
# overridden from the environment, defaults apply otherwise.
VLLM_PORT="${VLLM_PORT:-8000}"
MODEL="${MODEL:-Qwen/Qwen3-14B}"
BUILD_ID="${BUILD_ID:-local_$$}"
RESULTS_DIR="${RESULTS_DIR:-/tmp/lmcache_ci_results_${BUILD_ID}}"
LMCACHE_PORT="${LMCACHE_PORT:-6555}"

# Bench parameters
NUM_PROMPTS="${NUM_PROMPTS:-50}"
RANDOM_INPUT_LEN="${RANDOM_INPUT_LEN:-10000}"
RANDOM_OUTPUT_LEN="${RANDOM_OUTPUT_LEN:-1}"
RANDOM_SEED="${RANDOM_SEED:-42}"

# All result JSON files for this test land in one dedicated subdirectory.
FT_DIR="$RESULTS_DIR/fault_tolerance"
mkdir -p "$FT_DIR"

# Banner: one printf, one line of output per argument (same bytes as
# the equivalent sequence of echo calls).
printf '%s\n' \
  "=== Fault Tolerance Test ===" \
  "Model: $MODEL" \
  "vLLM Port: $VLLM_PORT" \
  "LMCache Port: $LMCACHE_PORT" \
  "Bench: $NUM_PROMPTS prompts, input_len=$RANDOM_INPUT_LEN, output_len=$RANDOM_OUTPUT_LEN" \
  "Results dir: $FT_DIR" \
  ""

# ── Helpers ──────────────────────────────────────────────────
43+
44+
# Run `vllm bench serve` once and verify every prompt completed.
# Globals:   RANDOM_SEED, VLLM_PORT, MODEL, RANDOM_INPUT_LEN,
#            RANDOM_OUTPUT_LEN, NUM_PROMPTS, FT_DIR (all read)
# Arguments: $1 - human-readable phase description (log output only)
#            $2 - result JSON filename, written under $FT_DIR
# Returns:   0 when all $NUM_PROMPTS prompts completed, 1 otherwise
run_bench() {
  local description="$1"
  local result_file="$2"

  echo ""
  echo "--- $description ---"

  vllm bench serve \
    --seed "$RANDOM_SEED" \
    --port "$VLLM_PORT" \
    --model "$MODEL" \
    --dataset-name random \
    --random-input-len "$RANDOM_INPUT_LEN" \
    --random-output-len "$RANDOM_OUTPUT_LEN" \
    --num-prompts "$NUM_PROMPTS" \
    --ignore-eos \
    --backend openai-chat \
    --endpoint /v1/chat/completions \
    --result-dir "$FT_DIR" \
    --result-filename "$result_file" \
    --save-result

  # Read the 'completed' count from the result JSON. The path is passed
  # via sys.argv instead of being interpolated into the Python source, so
  # quotes/backslashes in $FT_DIR can neither break nor inject code.
  local completed
  completed=$(python3 -c '
import json, sys
with open(sys.argv[1]) as f:
    data = json.load(f)
print(data.get("completed", 0))
' "$FT_DIR/$result_file")

  echo "$description: $completed / $NUM_PROMPTS completed"

  # Guard against a non-numeric count (missing/corrupt JSON would
  # otherwise make the arithmetic test itself error out).
  if ! [[ "$completed" =~ ^[0-9]+$ ]] || [ "$completed" -ne "$NUM_PROMPTS" ]; then
    echo "FAIL: Expected $NUM_PROMPTS completed, got $completed"
    return 1
  fi

  echo "PASS: All $NUM_PROMPTS prompts completed"
  return 0
}
84+
85+
# Print the PID of the LMCache MP server: the first line of its PID
# file (/tmp/lmcache_mp_pids_$BUILD_ID). Prints nothing when the file
# does not exist.
get_lmcache_pid() {
  local pidfile
  pidfile="/tmp/lmcache_mp_pids_${BUILD_ID}"
  if [[ -f "$pidfile" ]]; then
    head -n 1 "$pidfile"
  fi
}
91+
92+
# ── Step 1: Warmup bench ─────────────────────────────────────
echo "============================================"
echo "=== Fault Tolerance Step 1: Warmup bench ==="
echo "============================================"

if ! run_bench "Warmup (with LMCache)" "ft_warmup.json"; then
  echo "FAIL: Warmup bench failed"
  exit 1
fi

# Extract duration to calibrate kill timing. Values are passed to the
# embedded Python via argv, never interpolated into the source.
WARMUP_DURATION=$(python3 -c "import json, sys; print(json.load(open(sys.argv[1]))['duration'])" "$FT_DIR/ft_warmup.json")
KILL_DELAY=$(python3 -c "import sys; print(max(3, int(float(sys.argv[1]) * 0.4)))" "$WARMUP_DURATION")
echo "Warmup took ${WARMUP_DURATION}s. Will kill LMCache after ${KILL_DELAY}s in next run."

# ── Step 2: Bench with mid-flight LMCache kill ───────────────
echo ""
echo "============================================"
echo "=== Fault Tolerance Step 2: Mid-flight kill ==="
echo "============================================"

LMCACHE_PID=$(get_lmcache_pid)
if [ -z "$LMCACHE_PID" ] || ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
  echo "FAIL: LMCache server not running (PID=$LMCACHE_PID)"
  exit 1
fi

echo "LMCache server PID: $LMCACHE_PID"
echo "Will kill after ${KILL_DELAY}s into bench."

# Start bench in background
run_bench "Mid-flight kill" "ft_midflight.json" &
BENCH_PID=$!

# Wait, then kill LMCache
sleep "$KILL_DELAY"
echo "Killing LMCache server (PID: $LMCACHE_PID)..."
kill "$LMCACHE_PID" 2>/dev/null
# BUGFIX: the server is NOT a child of this shell (its PID comes from a
# file), so `wait "$LMCACHE_PID"` always failed instantly and the kill
# was never actually confirmed. Poll until the process is really gone
# (up to ~10s), escalating to SIGKILL if plain TERM is not enough.
for _ in {1..20}; do
  kill -0 "$LMCACHE_PID" 2>/dev/null || break
  sleep 0.5
done
if kill -0 "$LMCACHE_PID" 2>/dev/null; then
  echo "LMCache server survived TERM; sending KILL..."
  kill -9 "$LMCACHE_PID" 2>/dev/null || true
fi
echo "LMCache server killed at +${KILL_DELAY}s."

# Wait for bench to finish (BENCH_PID *is* a child, so wait works here
# and propagates run_bench's exit status).
echo "Waiting for bench to complete..."
if ! wait "$BENCH_PID"; then
  echo "FAIL: Bench did not complete after mid-flight LMCache kill."
  echo "--- vLLM log (last 50 lines) ---"
  tail -50 "/tmp/build_${BUILD_ID}_vllm.log" 2>/dev/null || true
  exit 1
fi

# ── Step 3: Bench fully without LMCache server ───────────────
echo ""
echo "============================================"
echo "=== Fault Tolerance Step 3: Without LMCache ==="
echo "============================================"

if ! run_bench "Without LMCache" "ft_without_lmcache.json"; then
  echo "FAIL: Bench failed without LMCache server."
  echo "--- vLLM log (last 50 lines) ---"
  tail -50 "/tmp/build_${BUILD_ID}_vllm.log" 2>/dev/null || true
  exit 1
fi

# ── Summary ──────────────────────────────────────────────────
echo ""
echo "============================================"
echo "=== Fault Tolerance Test PASSED ==="
echo "============================================"

# Read one key from a result JSON; durations are formatted to 1 decimal.
# Path and key are argv-passed, so odd characters in $FT_DIR are safe.
read_result() {
  python3 -c "import json, sys; v = json.load(open(sys.argv[1]))[sys.argv[2]]; print(f'{v:.1f}' if sys.argv[2] == 'duration' else v)" "$1" "$2"
}

warmup_completed=$(read_result "$FT_DIR/ft_warmup.json" completed)
warmup_duration=$(read_result "$FT_DIR/ft_warmup.json" duration)
midflight_completed=$(read_result "$FT_DIR/ft_midflight.json" completed)
midflight_duration=$(read_result "$FT_DIR/ft_midflight.json" duration)
without_completed=$(read_result "$FT_DIR/ft_without_lmcache.json" completed)
without_duration=$(read_result "$FT_DIR/ft_without_lmcache.json" duration)

echo " Warmup (with LMCache): $warmup_completed/$NUM_PROMPTS in ${warmup_duration}s"
echo " Mid-flight kill: $midflight_completed/$NUM_PROMPTS in ${midflight_duration}s (killed at +${KILL_DELAY}s)"
echo " Without LMCache: $without_completed/$NUM_PROMPTS in ${without_duration}s"
echo ""

.buildkite/k3_tests/multiprocess/scripts/run-mp-test.sh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,17 @@ if ! "${SCRIPT_DIR}/run-long-doc-qa.sh"; then
9595
fi
9696
echo ""
9797

98+
# Step 6: Fault tolerance test (kills LMCache server -- must be last)
99+
echo "============================================"
100+
echo "=== Step 6: Running fault tolerance test ==="
101+
echo "============================================"
102+
if ! "${SCRIPT_DIR}/run-fault-tolerance.sh"; then
103+
echo "fault tolerance test failed"
104+
TEST_RESULT=1
105+
exit 1
106+
fi
107+
echo ""
108+
98109
echo "============================================"
99110
echo "=== All tests passed! ==="
100111
echo "============================================"

.buildkite/scripts/multiprocessing-test/launch-containers.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ docker run -d \
9292
--env PYTHONHASHSEED=0 \
9393
lmcache/vllm-openai:test \
9494
"$MODEL" \
95-
--kv-transfer-config "{\"kv_connector\":\"LMCacheMPConnector\", \"kv_role\":\"kv_both\", \"kv_connector_extra_config\": {\"lmcache.mp.port\": $LMCACHE_PORT}}" \
95+
--kv-transfer-config "{\"kv_connector\":\"LMCacheMPConnector\", \"kv_role\":\"kv_both\", \"kv_load_failure_policy\": \"recompute\", \"kv_connector_extra_config\": {\"lmcache.mp.port\": $LMCACHE_PORT, \"lmcache.mp.mq_timeout\": 10}}" \
9696
--attention-backend FLASH_ATTN \
9797
--port "$VLLM_PORT" \
9898
$GPU_MEMORY_UTIL_ARG
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
#!/bin/bash
# Test fault tolerance: verify vLLM requests complete after LMCache server dies.
#
# Flow:
#   1. Warmup bench (calibrate timing)
#   2. Start bench in background, kill LMCache container mid-flight
#   3. Verify all prompts completed
#   4. Quick curl health check (vLLM still alive)
#
# NOTE: This test is destructive — it kills the LMCache container.
#       Run it as the LAST test step.

set -e
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/common.sh"

# Configuration: every value reuses the variables exported by
# run-mp-test.sh when present, falling back to these defaults.
VLLM_PORT="${VLLM_PORT:-8000}"
LMCACHE_CONTAINER_NAME="${LMCACHE_CONTAINER_NAME:-lmcache-mp-test}"
VLLM_CONTAINER_NAME="${VLLM_CONTAINER_NAME:-vllm-mp-test}"
MODEL="${MODEL:-Qwen/Qwen3-14B}"
NUM_PROMPTS="${NUM_PROMPTS:-50}"
RANDOM_INPUT_LEN="${RANDOM_INPUT_LEN:-10000}"
RANDOM_OUTPUT_LEN="${RANDOM_OUTPUT_LEN:-1}"
RANDOM_SEED="${RANDOM_SEED:-42}"

# Dedicated output directory for this test's result JSON files.
FT_RESULTS_DIR="$RESULTS_DIR/fault_tolerance"
mkdir -p "$FT_RESULTS_DIR"

# Banner: single printf, one output line per argument (byte-identical to
# the equivalent echo sequence).
printf '%s\n' \
  "=== Fault Tolerance Test ===" \
  "vLLM port: $VLLM_PORT" \
  "LMCache container: $LMCACHE_CONTAINER_NAME" \
  "Bench: $NUM_PROMPTS prompts, input_len=$RANDOM_INPUT_LEN" \
  ""
38+
39+
# Helper: run vllm bench serve
40+
# Run `vllm bench serve` once and verify every prompt completed.
# Globals:   RANDOM_SEED, VLLM_PORT, MODEL, RANDOM_INPUT_LEN,
#            RANDOM_OUTPUT_LEN, NUM_PROMPTS, FT_RESULTS_DIR (all read)
# Arguments: $1 - phase description (log output only)
#            $2 - result JSON filename, written under $FT_RESULTS_DIR
# Returns:   0 when all $NUM_PROMPTS prompts completed, 1 otherwise
run_bench() {
  local description="$1"
  local result_file="$2"

  echo "--- $description ---"
  vllm bench serve \
    --seed "$RANDOM_SEED" \
    --port "$VLLM_PORT" \
    --model "$MODEL" \
    --dataset-name random \
    --random-input-len "$RANDOM_INPUT_LEN" \
    --random-output-len "$RANDOM_OUTPUT_LEN" \
    --num-prompts "$NUM_PROMPTS" \
    --ignore-eos \
    --backend openai-chat \
    --endpoint /v1/chat/completions \
    --result-dir "$FT_RESULTS_DIR" \
    --result-filename "$result_file" \
    --save-result

  # Path is argv-passed (not interpolated into the Python source) so
  # special characters in $FT_RESULTS_DIR can neither break nor inject.
  local completed
  completed=$(python3 -c '
import json, sys
with open(sys.argv[1]) as f:
    data = json.load(f)
print(data.get("completed", 0))
' "$FT_RESULTS_DIR/$result_file")

  echo "$description: $completed / $NUM_PROMPTS completed"
  # Guard: under `set -e` suppression (if-context) a failed python3 read
  # leaves $completed empty; a non-numeric value must fail cleanly
  # instead of crashing the arithmetic test.
  if ! [[ "$completed" =~ ^[0-9]+$ ]] || [ "$completed" -ne "$NUM_PROMPTS" ]; then
    echo "ERROR: Expected $NUM_PROMPTS completed, got $completed"
    return 1
  fi
  echo "All $NUM_PROMPTS prompts completed"
}
75+
76+
# Orchestrate the fault-tolerance test: warmup bench, mid-flight kill of
# the LMCache container, then a curl health check against vLLM.
# Globals: all configuration variables defined at the top of this file.
main() {
  setup_venv vllm openai

  # Step 1: Warmup bench (measure timing for kill calibration)
  echo "============================================"
  echo "=== Step 1: Warmup bench ==="
  echo "============================================"
  if ! run_bench "Warmup (with LMCache)" "bench_warmup.json"; then
    echo "Warmup bench failed"
    exit 1
  fi

  # Values are argv-passed to the embedded Python, never interpolated.
  WARMUP_DURATION=$(python3 -c "import json, sys; print(json.load(open(sys.argv[1]))['duration'])" "$FT_RESULTS_DIR/bench_warmup.json")
  KILL_DELAY=$(python3 -c "import sys; print(max(3, int(float(sys.argv[1]) * 0.4)))" "$WARMUP_DURATION")
  echo "Warmup took ${WARMUP_DURATION}s. Will kill LMCache after ${KILL_DELAY}s."
  echo ""

  # Step 2: Bench with mid-flight LMCache kill
  echo "============================================"
  echo "=== Step 2: Bench with mid-flight LMCache kill ==="
  echo "============================================"

  BENCH_START=$(date +%s)
  run_bench "Mid-flight kill" "bench_midflight.json" &
  BENCH_PID=$!

  echo "Waiting ${KILL_DELAY}s before killing LMCache container..."
  sleep "$KILL_DELAY"

  echo "Killing LMCache container: $LMCACHE_CONTAINER_NAME"
  docker kill "$LMCACHE_CONTAINER_NAME" 2>/dev/null || true
  docker rm -f "$LMCACHE_CONTAINER_NAME" 2>/dev/null || true
  echo "LMCache container killed."

  echo "Waiting for bench to complete..."
  if ! wait "$BENCH_PID"; then
    echo "Bench did not complete after mid-flight LMCache kill"
    echo "--- vLLM container logs ---"
    docker logs --tail 50 "$VLLM_CONTAINER_NAME" 2>&1 || true
    exit 1
  fi
  # BUGFIX: the second bench is cache-warm, so it can finish faster than
  # KILL_DELAY (which was calibrated against the cold warmup run). In
  # that case the kill landed after completion and the "mid-flight" path
  # was never exercised — surface it instead of passing silently.
  BENCH_ELAPSED=$(( $(date +%s) - BENCH_START ))
  if (( BENCH_ELAPSED <= KILL_DELAY )); then
    echo "WARNING: bench finished in ${BENCH_ELAPSED}s, before the +${KILL_DELAY}s kill — mid-flight path not exercised."
  fi
  echo ""

  # Step 3: Quick curl health check — vLLM itself must survive the kill.
  echo "============================================"
  echo "=== Step 3: Quick curl health check ==="
  echo "============================================"
  for i in 1 2 3; do
    if ! curl -sf --max-time 120 \
      "http://localhost:${VLLM_PORT}/v1/completions" \
      -H "Content-Type: application/json" \
      -d "{
        \"model\": \"$MODEL\",
        \"prompt\": \"Question: What is $i + $i?\\nAnswer:\",
        \"max_tokens\": 32,
        \"temperature\": 0
      }" > /dev/null 2>&1; then
      echo "Request $i failed - vLLM became unresponsive"
      exit 1
    fi
    echo "  Request $i: OK"
  done
  echo "vLLM still responsive"
  echo ""

  # Summary (durations formatted to one decimal; paths argv-passed)
  warmup_dur=$(python3 -c "import json, sys; print(f\"{json.load(open(sys.argv[1]))['duration']:.1f}\")" "$FT_RESULTS_DIR/bench_warmup.json")
  midflight_dur=$(python3 -c "import json, sys; print(f\"{json.load(open(sys.argv[1]))['duration']:.1f}\")" "$FT_RESULTS_DIR/bench_midflight.json")

  echo "============================================"
  echo "=== Fault Tolerance Test PASSED ==="
  echo "============================================"
  echo " Warmup: $NUM_PROMPTS/$NUM_PROMPTS in ${warmup_dur}s"
  echo " Mid-flight kill: $NUM_PROMPTS/$NUM_PROMPTS in ${midflight_dur}s (killed at +${KILL_DELAY}s)"
  echo " Results: $FT_RESULTS_DIR/"
}

main "$@"

.buildkite/scripts/multiprocessing-test/run-mp-test.sh

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ export VLLM_BASELINE_CONTAINER_NAME="vllm-baseline-test-$$"
1313
export LMCACHE_PORT="${LMCACHE_PORT:-6555}"
1414
export VLLM_PORT="${VLLM_PORT:-8000}"
1515
export VLLM_BASELINE_PORT="${VLLM_BASELINE_PORT:-9000}"
16-
export LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-6556}"
1716
export MAX_WAIT_SECONDS="${MAX_WAIT_SECONDS:-300}"
1817
export BUILD_ID="${BUILD_ID:-local_$$}"
1918

@@ -110,12 +109,15 @@ if ! "$SCRIPT_DIR/run-long-doc-qa.sh"; then
110109
fi
111110
echo ""
112111

113-
114-
# Step 7: Query LMCache server status
112+
# Step 7: Run fault tolerance test (destructive — kills LMCache container)
115113
echo "============================================"
116-
echo "=== Step 7: LMCache server status ==="
114+
echo "=== Step 7: Running fault tolerance test ==="
117115
echo "============================================"
118-
curl -s "http://localhost:${LMCACHE_HTTP_PORT}/api/status" | python3 -m json.tool || echo "⚠️ Failed to query LMCache status"
116+
if ! "$SCRIPT_DIR/run-fault-tolerance.sh"; then
117+
echo "❌ fault tolerance test failed"
118+
TEST_RESULT=1
119+
exit 1
120+
fi
119121
echo ""
120122

121123
echo "============================================"

.buildkite/scripts/multiprocessing-test/test-launch.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ docker run -d \
7070
--env PYTHONHASHSEED=0 \
7171
lmcache/vllm-openai:test \
7272
"$MODEL" \
73-
--kv-transfer-config "{\"kv_connector\":\"LMCacheMPConnector\", \"kv_role\":\"kv_both\", \"kv_connector_extra_config\": {\"lmcache.mp.port\": $LMCACHE_PORT}}" \
73+
--kv-transfer-config "{\"kv_connector\":\"LMCacheMPConnector\", \"kv_role\":\"kv_both\", \"kv_load_failure_policy\": \"recompute\", \"kv_connector_extra_config\": {\"lmcache.mp.port\": $LMCACHE_PORT, \"lmcache.mp.mq_timeout\": 10}}" \
7474
--port "$VLLM_PORT" \
7575
--no-async-scheduling \
7676
$GPU_MEMORY_UTIL_ARG

0 commit comments

Comments
 (0)