Skip to content

Commit 5c72a69

Browse files
committed
rebase again
Signed-off-by: Samuel Shen <slshen@tensormesh.ai>
2 parents e700def + e2cc19b commit 5c72a69

49 files changed

Lines changed: 6339 additions & 447 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.buildkite/k3_tests/multiprocess/scripts/launch-processes.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ VLLM_SERVER_DEV_MODE=1 \
7676
VLLM_BATCH_INVARIANT=1 \
7777
PYTHONHASHSEED=0 \
7878
vllm serve "$MODEL" \
79-
--kv-transfer-config "{\"kv_connector\":\"LMCacheMPConnector\", \"kv_role\":\"kv_both\", \"kv_connector_extra_config\": {\"lmcache.mp.port\": $LMCACHE_PORT}}" \
79+
--kv-transfer-config "{\"kv_connector\":\"LMCacheMPConnector\", \"kv_role\":\"kv_both\", \"kv_load_failure_policy\": \"recompute\", \"kv_connector_extra_config\": {\"lmcache.mp.port\": $LMCACHE_PORT, \"lmcache.mp.mq_timeout\": 10}}" \
8080
--attention-backend FLASH_ATTN \
8181
--port "$vllm_port" \
8282
--no-async-scheduling \
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
#!/usr/bin/env bash
# Test LMCache fault tolerance: verify vLLM requests complete after
# the LMCache MP server is killed mid-flight.
#
# Flow:
#   1. Run a warmup bench (measures baseline timing)
#   2. Run bench again, killing LMCache server mid-flight
#   3. Run bench fully without LMCache server
#   4. Verify all prompts completed in every phase
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)"

source "${REPO_ROOT}/.buildkite/k3_tests/common_scripts/helpers.sh"

# Configuration (inherited from run-mp-test.sh); every value may be
# overridden from the environment, defaults apply otherwise.
VLLM_PORT="${VLLM_PORT:-8000}"
MODEL="${MODEL:-Qwen/Qwen3-14B}"
BUILD_ID="${BUILD_ID:-local_$$}"
RESULTS_DIR="${RESULTS_DIR:-/tmp/lmcache_ci_results_${BUILD_ID}}"
LMCACHE_PORT="${LMCACHE_PORT:-6555}"

# Bench parameters
NUM_PROMPTS="${NUM_PROMPTS:-50}"
RANDOM_INPUT_LEN="${RANDOM_INPUT_LEN:-10000}"
RANDOM_OUTPUT_LEN="${RANDOM_OUTPUT_LEN:-1}"
RANDOM_SEED="${RANDOM_SEED:-42}"

# All result JSON files for this test land in one dedicated subdirectory.
FT_DIR="$RESULTS_DIR/fault_tolerance"
mkdir -p "$FT_DIR"

# Banner: one printf, one line of output per argument (same bytes as
# the equivalent sequence of echo calls).
printf '%s\n' \
  "=== Fault Tolerance Test ===" \
  "Model: $MODEL" \
  "vLLM Port: $VLLM_PORT" \
  "LMCache Port: $LMCACHE_PORT" \
  "Bench: $NUM_PROMPTS prompts, input_len=$RANDOM_INPUT_LEN, output_len=$RANDOM_OUTPUT_LEN" \
  "Results dir: $FT_DIR" \
  ""

# ── Helpers ──────────────────────────────────────────────────
43+
44+
# Run `vllm bench serve` once and verify every prompt completed.
# Globals:   RANDOM_SEED, VLLM_PORT, MODEL, RANDOM_INPUT_LEN,
#            RANDOM_OUTPUT_LEN, NUM_PROMPTS, FT_DIR (all read)
# Arguments: $1 - human-readable phase description (log output only)
#            $2 - result JSON filename, written under $FT_DIR
# Returns:   0 when all $NUM_PROMPTS prompts completed, 1 otherwise
run_bench() {
  local description="$1"
  local result_file="$2"

  echo ""
  echo "--- $description ---"

  vllm bench serve \
    --seed "$RANDOM_SEED" \
    --port "$VLLM_PORT" \
    --model "$MODEL" \
    --dataset-name random \
    --random-input-len "$RANDOM_INPUT_LEN" \
    --random-output-len "$RANDOM_OUTPUT_LEN" \
    --num-prompts "$NUM_PROMPTS" \
    --ignore-eos \
    --backend openai-chat \
    --endpoint /v1/chat/completions \
    --result-dir "$FT_DIR" \
    --result-filename "$result_file" \
    --save-result

  # Read the 'completed' count from the result JSON. The path is passed
  # via sys.argv instead of being interpolated into the Python source, so
  # quotes/backslashes in $FT_DIR can neither break nor inject code.
  local completed
  completed=$(python3 -c '
import json, sys
with open(sys.argv[1]) as f:
    data = json.load(f)
print(data.get("completed", 0))
' "$FT_DIR/$result_file")

  echo "$description: $completed / $NUM_PROMPTS completed"

  # Guard against a non-numeric count (missing/corrupt JSON would
  # otherwise make the arithmetic test itself error out).
  if ! [[ "$completed" =~ ^[0-9]+$ ]] || [ "$completed" -ne "$NUM_PROMPTS" ]; then
    echo "FAIL: Expected $NUM_PROMPTS completed, got $completed"
    return 1
  fi

  echo "PASS: All $NUM_PROMPTS prompts completed"
  return 0
}
84+
85+
# Print the PID of the LMCache MP server: the first line of its PID
# file (/tmp/lmcache_mp_pids_$BUILD_ID). Prints nothing when the file
# does not exist.
get_lmcache_pid() {
  local pidfile
  pidfile="/tmp/lmcache_mp_pids_${BUILD_ID}"
  if [[ -f "$pidfile" ]]; then
    head -n 1 "$pidfile"
  fi
}
91+
92+
# ── Step 1: Warmup bench ─────────────────────────────────────
echo "============================================"
echo "=== Fault Tolerance Step 1: Warmup bench ==="
echo "============================================"

if ! run_bench "Warmup (with LMCache)" "ft_warmup.json"; then
  echo "FAIL: Warmup bench failed"
  exit 1
fi

# Extract duration to calibrate kill timing. Values are passed to the
# embedded Python via argv, never interpolated into the source.
WARMUP_DURATION=$(python3 -c "import json, sys; print(json.load(open(sys.argv[1]))['duration'])" "$FT_DIR/ft_warmup.json")
KILL_DELAY=$(python3 -c "import sys; print(max(3, int(float(sys.argv[1]) * 0.4)))" "$WARMUP_DURATION")
echo "Warmup took ${WARMUP_DURATION}s. Will kill LMCache after ${KILL_DELAY}s in next run."

# ── Step 2: Bench with mid-flight LMCache kill ───────────────
echo ""
echo "============================================"
echo "=== Fault Tolerance Step 2: Mid-flight kill ==="
echo "============================================"

LMCACHE_PID=$(get_lmcache_pid)
if [ -z "$LMCACHE_PID" ] || ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
  echo "FAIL: LMCache server not running (PID=$LMCACHE_PID)"
  exit 1
fi

echo "LMCache server PID: $LMCACHE_PID"
echo "Will kill after ${KILL_DELAY}s into bench."

# Start bench in background
run_bench "Mid-flight kill" "ft_midflight.json" &
BENCH_PID=$!

# Wait, then kill LMCache
sleep "$KILL_DELAY"
echo "Killing LMCache server (PID: $LMCACHE_PID)..."
kill "$LMCACHE_PID" 2>/dev/null
# BUGFIX: the server is NOT a child of this shell (its PID comes from a
# file), so `wait "$LMCACHE_PID"` always failed instantly and the kill
# was never actually confirmed. Poll until the process is really gone
# (up to ~10s), escalating to SIGKILL if plain TERM is not enough.
for _ in {1..20}; do
  kill -0 "$LMCACHE_PID" 2>/dev/null || break
  sleep 0.5
done
if kill -0 "$LMCACHE_PID" 2>/dev/null; then
  echo "LMCache server survived TERM; sending KILL..."
  kill -9 "$LMCACHE_PID" 2>/dev/null || true
fi
echo "LMCache server killed at +${KILL_DELAY}s."

# Wait for bench to finish (BENCH_PID *is* a child, so wait works here
# and propagates run_bench's exit status).
echo "Waiting for bench to complete..."
if ! wait "$BENCH_PID"; then
  echo "FAIL: Bench did not complete after mid-flight LMCache kill."
  echo "--- vLLM log (last 50 lines) ---"
  tail -50 "/tmp/build_${BUILD_ID}_vllm.log" 2>/dev/null || true
  exit 1
fi

# ── Step 3: Bench fully without LMCache server ───────────────
echo ""
echo "============================================"
echo "=== Fault Tolerance Step 3: Without LMCache ==="
echo "============================================"

if ! run_bench "Without LMCache" "ft_without_lmcache.json"; then
  echo "FAIL: Bench failed without LMCache server."
  echo "--- vLLM log (last 50 lines) ---"
  tail -50 "/tmp/build_${BUILD_ID}_vllm.log" 2>/dev/null || true
  exit 1
fi

# ── Summary ──────────────────────────────────────────────────
echo ""
echo "============================================"
echo "=== Fault Tolerance Test PASSED ==="
echo "============================================"

# Read one key from a result JSON; durations are formatted to 1 decimal.
# Path and key are argv-passed, so odd characters in $FT_DIR are safe.
read_result() {
  python3 -c "import json, sys; v = json.load(open(sys.argv[1]))[sys.argv[2]]; print(f'{v:.1f}' if sys.argv[2] == 'duration' else v)" "$1" "$2"
}

warmup_completed=$(read_result "$FT_DIR/ft_warmup.json" completed)
warmup_duration=$(read_result "$FT_DIR/ft_warmup.json" duration)
midflight_completed=$(read_result "$FT_DIR/ft_midflight.json" completed)
midflight_duration=$(read_result "$FT_DIR/ft_midflight.json" duration)
without_completed=$(read_result "$FT_DIR/ft_without_lmcache.json" completed)
without_duration=$(read_result "$FT_DIR/ft_without_lmcache.json" duration)

echo " Warmup (with LMCache): $warmup_completed/$NUM_PROMPTS in ${warmup_duration}s"
echo " Mid-flight kill: $midflight_completed/$NUM_PROMPTS in ${midflight_duration}s (killed at +${KILL_DELAY}s)"
echo " Without LMCache: $without_completed/$NUM_PROMPTS in ${without_duration}s"
echo ""

.buildkite/k3_tests/multiprocess/scripts/run-mp-test.sh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,17 @@ if ! "${SCRIPT_DIR}/run-long-doc-qa.sh"; then
9595
fi
9696
echo ""
9797

98+
# Step 6: Fault tolerance test (kills LMCache server -- must be last)
99+
echo "============================================"
100+
echo "=== Step 6: Running fault tolerance test ==="
101+
echo "============================================"
102+
if ! "${SCRIPT_DIR}/run-fault-tolerance.sh"; then
103+
echo "fault tolerance test failed"
104+
TEST_RESULT=1
105+
exit 1
106+
fi
107+
echo ""
108+
98109
echo "============================================"
99110
echo "=== All tests passed! ==="
100111
echo "============================================"

.buildkite/scripts/multiprocessing-test/launch-containers.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ docker run -d \
9292
--env PYTHONHASHSEED=0 \
9393
lmcache/vllm-openai:test \
9494
"$MODEL" \
95-
--kv-transfer-config "{\"kv_connector\":\"LMCacheMPConnector\", \"kv_role\":\"kv_both\", \"kv_connector_extra_config\": {\"lmcache.mp.port\": $LMCACHE_PORT}}" \
95+
--kv-transfer-config "{\"kv_connector\":\"LMCacheMPConnector\", \"kv_role\":\"kv_both\", \"kv_load_failure_policy\": \"recompute\", \"kv_connector_extra_config\": {\"lmcache.mp.port\": $LMCACHE_PORT, \"lmcache.mp.mq_timeout\": 10}}" \
9696
--attention-backend FLASH_ATTN \
9797
--port "$VLLM_PORT" \
9898
$GPU_MEMORY_UTIL_ARG
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
#!/bin/bash
# Test fault tolerance: verify vLLM requests complete after LMCache server dies.
#
# Flow:
#   1. Warmup bench (calibrate timing)
#   2. Start bench in background, kill LMCache container mid-flight
#   3. Verify all prompts completed
#   4. Quick curl health check (vLLM still alive)
#
# NOTE: This test is destructive — it kills the LMCache container.
#       Run it as the LAST test step.

set -e
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/common.sh"

# Configuration: every value reuses the variables exported by
# run-mp-test.sh when present, falling back to these defaults.
VLLM_PORT="${VLLM_PORT:-8000}"
LMCACHE_CONTAINER_NAME="${LMCACHE_CONTAINER_NAME:-lmcache-mp-test}"
VLLM_CONTAINER_NAME="${VLLM_CONTAINER_NAME:-vllm-mp-test}"
MODEL="${MODEL:-Qwen/Qwen3-14B}"
NUM_PROMPTS="${NUM_PROMPTS:-50}"
RANDOM_INPUT_LEN="${RANDOM_INPUT_LEN:-10000}"
RANDOM_OUTPUT_LEN="${RANDOM_OUTPUT_LEN:-1}"
RANDOM_SEED="${RANDOM_SEED:-42}"

# Dedicated output directory for this test's result JSON files.
FT_RESULTS_DIR="$RESULTS_DIR/fault_tolerance"
mkdir -p "$FT_RESULTS_DIR"

# Banner: single printf, one output line per argument (byte-identical to
# the equivalent echo sequence).
printf '%s\n' \
  "=== Fault Tolerance Test ===" \
  "vLLM port: $VLLM_PORT" \
  "LMCache container: $LMCACHE_CONTAINER_NAME" \
  "Bench: $NUM_PROMPTS prompts, input_len=$RANDOM_INPUT_LEN" \
  ""
38+
39+
# Helper: run vllm bench serve
40+
# Run `vllm bench serve` once and verify every prompt completed.
# Globals:   RANDOM_SEED, VLLM_PORT, MODEL, RANDOM_INPUT_LEN,
#            RANDOM_OUTPUT_LEN, NUM_PROMPTS, FT_RESULTS_DIR (all read)
# Arguments: $1 - phase description (log output only)
#            $2 - result JSON filename, written under $FT_RESULTS_DIR
# Returns:   0 when all $NUM_PROMPTS prompts completed, 1 otherwise
run_bench() {
  local description="$1"
  local result_file="$2"

  echo "--- $description ---"
  vllm bench serve \
    --seed "$RANDOM_SEED" \
    --port "$VLLM_PORT" \
    --model "$MODEL" \
    --dataset-name random \
    --random-input-len "$RANDOM_INPUT_LEN" \
    --random-output-len "$RANDOM_OUTPUT_LEN" \
    --num-prompts "$NUM_PROMPTS" \
    --ignore-eos \
    --backend openai-chat \
    --endpoint /v1/chat/completions \
    --result-dir "$FT_RESULTS_DIR" \
    --result-filename "$result_file" \
    --save-result

  # Path is argv-passed (not interpolated into the Python source) so
  # special characters in $FT_RESULTS_DIR can neither break nor inject.
  local completed
  completed=$(python3 -c '
import json, sys
with open(sys.argv[1]) as f:
    data = json.load(f)
print(data.get("completed", 0))
' "$FT_RESULTS_DIR/$result_file")

  echo "$description: $completed / $NUM_PROMPTS completed"
  # Guard: under `set -e` suppression (if-context) a failed python3 read
  # leaves $completed empty; a non-numeric value must fail cleanly
  # instead of crashing the arithmetic test.
  if ! [[ "$completed" =~ ^[0-9]+$ ]] || [ "$completed" -ne "$NUM_PROMPTS" ]; then
    echo "ERROR: Expected $NUM_PROMPTS completed, got $completed"
    return 1
  fi
  echo "All $NUM_PROMPTS prompts completed"
}
75+
76+
# Orchestrate the fault-tolerance test: warmup bench, mid-flight kill of
# the LMCache container, then a curl health check against vLLM.
# Globals: all configuration variables defined at the top of this file.
main() {
  setup_venv vllm openai

  # Step 1: Warmup bench (measure timing for kill calibration)
  echo "============================================"
  echo "=== Step 1: Warmup bench ==="
  echo "============================================"
  if ! run_bench "Warmup (with LMCache)" "bench_warmup.json"; then
    echo "Warmup bench failed"
    exit 1
  fi

  # Values are argv-passed to the embedded Python, never interpolated.
  WARMUP_DURATION=$(python3 -c "import json, sys; print(json.load(open(sys.argv[1]))['duration'])" "$FT_RESULTS_DIR/bench_warmup.json")
  KILL_DELAY=$(python3 -c "import sys; print(max(3, int(float(sys.argv[1]) * 0.4)))" "$WARMUP_DURATION")
  echo "Warmup took ${WARMUP_DURATION}s. Will kill LMCache after ${KILL_DELAY}s."
  echo ""

  # Step 2: Bench with mid-flight LMCache kill
  echo "============================================"
  echo "=== Step 2: Bench with mid-flight LMCache kill ==="
  echo "============================================"

  BENCH_START=$(date +%s)
  run_bench "Mid-flight kill" "bench_midflight.json" &
  BENCH_PID=$!

  echo "Waiting ${KILL_DELAY}s before killing LMCache container..."
  sleep "$KILL_DELAY"

  echo "Killing LMCache container: $LMCACHE_CONTAINER_NAME"
  docker kill "$LMCACHE_CONTAINER_NAME" 2>/dev/null || true
  docker rm -f "$LMCACHE_CONTAINER_NAME" 2>/dev/null || true
  echo "LMCache container killed."

  echo "Waiting for bench to complete..."
  if ! wait "$BENCH_PID"; then
    echo "Bench did not complete after mid-flight LMCache kill"
    echo "--- vLLM container logs ---"
    docker logs --tail 50 "$VLLM_CONTAINER_NAME" 2>&1 || true
    exit 1
  fi
  # BUGFIX: the second bench is cache-warm, so it can finish faster than
  # KILL_DELAY (which was calibrated against the cold warmup run). In
  # that case the kill landed after completion and the "mid-flight" path
  # was never exercised — surface it instead of passing silently.
  BENCH_ELAPSED=$(( $(date +%s) - BENCH_START ))
  if (( BENCH_ELAPSED <= KILL_DELAY )); then
    echo "WARNING: bench finished in ${BENCH_ELAPSED}s, before the +${KILL_DELAY}s kill — mid-flight path not exercised."
  fi
  echo ""

  # Step 3: Quick curl health check — vLLM itself must survive the kill.
  echo "============================================"
  echo "=== Step 3: Quick curl health check ==="
  echo "============================================"
  for i in 1 2 3; do
    if ! curl -sf --max-time 120 \
      "http://localhost:${VLLM_PORT}/v1/completions" \
      -H "Content-Type: application/json" \
      -d "{
        \"model\": \"$MODEL\",
        \"prompt\": \"Question: What is $i + $i?\\nAnswer:\",
        \"max_tokens\": 32,
        \"temperature\": 0
      }" > /dev/null 2>&1; then
      echo "Request $i failed - vLLM became unresponsive"
      exit 1
    fi
    echo "  Request $i: OK"
  done
  echo "vLLM still responsive"
  echo ""

  # Summary (durations formatted to one decimal; paths argv-passed)
  warmup_dur=$(python3 -c "import json, sys; print(f\"{json.load(open(sys.argv[1]))['duration']:.1f}\")" "$FT_RESULTS_DIR/bench_warmup.json")
  midflight_dur=$(python3 -c "import json, sys; print(f\"{json.load(open(sys.argv[1]))['duration']:.1f}\")" "$FT_RESULTS_DIR/bench_midflight.json")

  echo "============================================"
  echo "=== Fault Tolerance Test PASSED ==="
  echo "============================================"
  echo " Warmup: $NUM_PROMPTS/$NUM_PROMPTS in ${warmup_dur}s"
  echo " Mid-flight kill: $NUM_PROMPTS/$NUM_PROMPTS in ${midflight_dur}s (killed at +${KILL_DELAY}s)"
  echo " Results: $FT_RESULTS_DIR/"
}

main "$@"

.buildkite/scripts/multiprocessing-test/run-mp-test.sh

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ export VLLM_BASELINE_CONTAINER_NAME="vllm-baseline-test-$$"
1313
export LMCACHE_PORT="${LMCACHE_PORT:-6555}"
1414
export VLLM_PORT="${VLLM_PORT:-8000}"
1515
export VLLM_BASELINE_PORT="${VLLM_BASELINE_PORT:-9000}"
16-
export LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-6556}"
1716
export MAX_WAIT_SECONDS="${MAX_WAIT_SECONDS:-300}"
1817
export BUILD_ID="${BUILD_ID:-local_$$}"
1918

@@ -110,12 +109,15 @@ if ! "$SCRIPT_DIR/run-long-doc-qa.sh"; then
110109
fi
111110
echo ""
112111

113-
114-
# Step 7: Query LMCache server status
112+
# Step 7: Run fault tolerance test (destructive — kills LMCache container)
115113
echo "============================================"
116-
echo "=== Step 7: LMCache server status ==="
114+
echo "=== Step 7: Running fault tolerance test ==="
117115
echo "============================================"
118-
curl -s "http://localhost:${LMCACHE_HTTP_PORT}/api/status" | python3 -m json.tool || echo "⚠️ Failed to query LMCache status"
116+
if ! "$SCRIPT_DIR/run-fault-tolerance.sh"; then
117+
echo "❌ fault tolerance test failed"
118+
TEST_RESULT=1
119+
exit 1
120+
fi
119121
echo ""
120122

121123
echo "============================================"

.buildkite/scripts/multiprocessing-test/test-launch.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ docker run -d \
7070
--env PYTHONHASHSEED=0 \
7171
lmcache/vllm-openai:test \
7272
"$MODEL" \
73-
--kv-transfer-config "{\"kv_connector\":\"LMCacheMPConnector\", \"kv_role\":\"kv_both\", \"kv_connector_extra_config\": {\"lmcache.mp.port\": $LMCACHE_PORT}}" \
73+
--kv-transfer-config "{\"kv_connector\":\"LMCacheMPConnector\", \"kv_role\":\"kv_both\", \"kv_load_failure_policy\": \"recompute\", \"kv_connector_extra_config\": {\"lmcache.mp.port\": $LMCACHE_PORT, \"lmcache.mp.mq_timeout\": 10}}" \
7474
--port "$VLLM_PORT" \
7575
--no-async-scheduling \
7676
$GPU_MEMORY_UTIL_ARG

0 commit comments

Comments
 (0)