#!/usr/bin/env bash
# Test LMCache fault tolerance: verify vLLM requests complete after
# the LMCache MP server is killed mid-flight.
#
# Flow:
#   1. Run a warmup bench (measures baseline timing)
#   2. Run bench again, killing LMCache server mid-flight
#   3. Run bench fully without LMCache server
#   4. Verify all prompts completed in every phase
#
# Environment (all optional, defaults below): VLLM_PORT, MODEL, BUILD_ID,
# RESULTS_DIR, LMCACHE_PORT, NUM_PROMPTS, RANDOM_INPUT_LEN,
# RANDOM_OUTPUT_LEN, RANDOM_SEED.
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)"

# NOTE(review): no helper from this file is visibly used below; presumably it
# provides shared CI utilities — confirm before removing.
source "${REPO_ROOT}/.buildkite/k3_tests/common_scripts/helpers.sh"

# Configuration (inherited from run-mp-test.sh)
VLLM_PORT="${VLLM_PORT:-8000}"
MODEL="${MODEL:-Qwen/Qwen3-14B}"
BUILD_ID="${BUILD_ID:-local_$$}"
RESULTS_DIR="${RESULTS_DIR:-/tmp/lmcache_ci_results_${BUILD_ID}}"
LMCACHE_PORT="${LMCACHE_PORT:-6555}"

# Bench parameters
NUM_PROMPTS="${NUM_PROMPTS:-50}"
RANDOM_INPUT_LEN="${RANDOM_INPUT_LEN:-10000}"
RANDOM_OUTPUT_LEN="${RANDOM_OUTPUT_LEN:-1}"
RANDOM_SEED="${RANDOM_SEED:-42}"

# NUM_PROMPTS is compared numerically against the bench's completed count;
# a non-integer value would make that comparison error out instead of fail.
if ! [[ "$NUM_PROMPTS" =~ ^[0-9]+$ ]]; then
  echo "FAIL: NUM_PROMPTS must be a non-negative integer (got '$NUM_PROMPTS')"
  exit 1
fi

# Output directory. Every phase writes its result JSON here, so there is no
# point continuing if it cannot be created.
FT_DIR="$RESULTS_DIR/fault_tolerance"
if ! mkdir -p "$FT_DIR"; then
  echo "FAIL: Could not create results directory: $FT_DIR"
  exit 1
fi

echo "=== Fault Tolerance Test ==="
echo "Model: $MODEL"
echo "vLLM Port: $VLLM_PORT"
echo "LMCache Port: $LMCACHE_PORT"
echo "Bench: $NUM_PROMPTS prompts, input_len=$RANDOM_INPUT_LEN, output_len=$RANDOM_OUTPUT_LEN"
echo "Results dir: $FT_DIR"
echo ""

# ── Helpers ──────────────────────────────────────────────────

#######################################
# Run one `vllm bench serve` pass and verify every prompt completed.
# Globals:   RANDOM_SEED, VLLM_PORT, MODEL, RANDOM_INPUT_LEN,
#            RANDOM_OUTPUT_LEN, NUM_PROMPTS, FT_DIR (all read)
# Arguments: $1 - human-readable description used in log lines
#            $2 - result file name, written under $FT_DIR
# Outputs:   progress and a PASS/FAIL verdict on stdout
# Returns:   0 only when the bench exited cleanly and the result file
#            reports exactly $NUM_PROMPTS completed requests; 1 otherwise
#######################################
run_bench() {
  local description="$1"
  local result_file="$2"
  local result_path="$FT_DIR/$result_file"
  local bench_status=0
  local completed=""

  echo ""
  echo "--- $description ---"

  # Drop any result left by an earlier run so a bench that fails to write
  # its output cannot be judged on stale data.
  rm -f -- "$result_path"

  vllm bench serve \
    --seed "$RANDOM_SEED" \
    --port "$VLLM_PORT" \
    --model "$MODEL" \
    --dataset-name random \
    --random-input-len "$RANDOM_INPUT_LEN" \
    --random-output-len "$RANDOM_OUTPUT_LEN" \
    --num-prompts "$NUM_PROMPTS" \
    --ignore-eos \
    --backend openai-chat \
    --endpoint /v1/chat/completions \
    --result-dir "$FT_DIR" \
    --result-filename "$result_file" \
    --save-result || bench_status=$?

  if [ "$bench_status" -ne 0 ]; then
    echo "FAIL: vllm bench serve exited with status $bench_status"
    return 1
  fi

  if [ ! -s "$result_path" ]; then
    echo "FAIL: Bench result file missing or empty: $result_path"
    return 1
  fi

  # The path goes in via argv rather than being spliced into the Python
  # source, so quotes or backslashes in it cannot break the snippet.
  completed=$(python3 -c '
import json
import sys

with open(sys.argv[1]) as f:
    data = json.load(f)
print(data.get("completed", 0))
' "$result_path") || completed=""

  # An empty or non-numeric count must be a failure: handing it to a numeric
  # test makes the test error out, which an `if` would read as "not unequal".
  if ! [[ "$completed" =~ ^[0-9]+$ ]]; then
    echo "FAIL: Could not read a completed count from $result_path (got '$completed')"
    return 1
  fi

  echo "$description: $completed / $NUM_PROMPTS completed"

  # Phrased as "pass only on a proven match" so that a comparison error
  # (e.g. non-integer NUM_PROMPTS) falls through to FAIL.
  if [ "$completed" -eq "$NUM_PROMPTS" ] 2>/dev/null; then
    echo "PASS: All $NUM_PROMPTS prompts completed"
    return 0
  fi

  echo "FAIL: Expected $NUM_PROMPTS completed, got $completed"
  return 1
}

#######################################
# Print the PID recorded on the first line of the LMCache PID file.
# Globals:   BUILD_ID (read, only to build the default path)
# Arguments: $1 - PID file path (optional; defaults to
#                 /tmp/lmcache_mp_pids_${BUILD_ID})
# Outputs:   the PID on stdout; nothing if the file is missing, its first
#            line is blank, or the first line is not a number
# Returns:   0 normally; 1 (with a warning on stderr) when the first line
#            is present but not a plain decimal number
#######################################
get_lmcache_pid() {
  local pid_file="${1:-/tmp/lmcache_mp_pids_${BUILD_ID}}"
  local pid=""

  [[ -f "$pid_file" ]] || return 0

  # read reports failure at EOF when the line has no trailing newline, but
  # it has still filled in the variable, so the status is ignored.
  IFS= read -r pid < "$pid_file" || true

  # Trim surrounding whitespace (including a stray CR) so the value can be
  # handed straight to kill.
  pid="${pid#"${pid%%[![:space:]]*}"}"
  pid="${pid%"${pid##*[![:space:]]}"}"

  [[ -n "$pid" ]] || return 0

  if ! [[ "$pid" =~ ^[0-9]+$ ]]; then
    echo "WARNING: first line of $pid_file is not a PID: '$pid'" >&2
    return 1
  fi

  printf '%s\n' "$pid"
}

# ── Step 1: Warmup bench ─────────────────────────────────────
# Runs the bench once with LMCache up, then derives KILL_DELAY (seconds into
# the next run at which LMCache is killed) from the measured duration.
echo "============================================"
echo "=== Fault Tolerance Step 1: Warmup bench ==="
echo "============================================"

if ! run_bench "Warmup (with LMCache)" "ft_warmup.json"; then
  echo "FAIL: Warmup bench failed"
  exit 1
fi

# Extract duration to calibrate kill timing. Values are passed via argv
# rather than spliced into the Python source.
WARMUP_DURATION=$(python3 -c '
import json
import sys

with open(sys.argv[1]) as f:
    print(json.load(f)["duration"])
' "$FT_DIR/ft_warmup.json")

# Kill at 40% of the warmup duration, but never earlier than 3 seconds in.
KILL_DELAY=$(python3 -c '
import sys

print(max(3, int(float(sys.argv[1]) * 0.4)))
' "$WARMUP_DURATION")

# An empty delay would make the later `sleep` fail and the kill fire at +0s,
# which silently turns the mid-flight test into something else.
if ! [[ "$KILL_DELAY" =~ ^[0-9]+$ ]]; then
  echo "FAIL: Could not derive kill delay from warmup duration '${WARMUP_DURATION}'"
  exit 1
fi
echo "Warmup took ${WARMUP_DURATION}s. Will kill LMCache after ${KILL_DELAY}s in next run."

# ── Step 2: Bench with mid-flight LMCache kill ───────────────
# Starts the bench in the background, kills the LMCache server KILL_DELAY
# seconds in, confirms the server is really gone, then requires the bench
# to finish with every prompt completed.
echo ""
echo "============================================"
echo "=== Fault Tolerance Step 2: Mid-flight kill ==="
echo "============================================"

# Seconds to wait for LMCache to exit after SIGTERM before sending SIGKILL.
LMCACHE_KILL_GRACE="${LMCACHE_KILL_GRACE:-10}"

# Succeed while PID $1 is a live, non-zombie process.
lmcache_alive() {
  local state
  kill -0 "$1" 2>/dev/null || return 1
  # A zombie still answers `kill -0` until its parent reaps it; for this
  # test it counts as gone. If ps is unavailable, state stays empty and the
  # `kill -0` result stands.
  state=$(ps -o stat= -p "$1" 2>/dev/null || true)
  [[ "$state" != *Z* ]]
}

LMCACHE_PID=$(get_lmcache_pid)
if [ -z "$LMCACHE_PID" ] || ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
  echo "FAIL: LMCache server not running (PID=$LMCACHE_PID)"
  exit 1
fi

echo "LMCache server PID: $LMCACHE_PID"
echo "Will kill after ${KILL_DELAY}s into bench."

# Start bench in background
run_bench "Mid-flight kill" "ft_midflight.json" &
BENCH_PID=$!

# Wait, then kill LMCache
sleep "$KILL_DELAY"

# If the bench is already done, the kill below no longer exercises the
# mid-flight path; say so instead of passing silently.
if ! kill -0 "$BENCH_PID" 2>/dev/null; then
  echo "WARNING: Bench already finished before the kill at +${KILL_DELAY}s; the kill was not mid-flight."
fi

echo "Killing LMCache server (PID: $LMCACHE_PID)..."
if ! kill "$LMCACHE_PID" 2>/dev/null; then
  echo "WARNING: Could not signal LMCache server (PID: $LMCACHE_PID); it may have already exited."
fi

# Reaps the server if it happens to be a child of this shell. For a PID that
# is not our child, wait returns immediately, so it cannot be relied on to
# confirm termination — hence the polling loop below.
wait "$LMCACHE_PID" 2>/dev/null || true

lmcache_wait_secs=0
while lmcache_alive "$LMCACHE_PID" && [ "$lmcache_wait_secs" -lt "$LMCACHE_KILL_GRACE" ]; do
  sleep 1
  lmcache_wait_secs=$((lmcache_wait_secs + 1))
done

if lmcache_alive "$LMCACHE_PID"; then
  echo "WARNING: LMCache server still alive after ${LMCACHE_KILL_GRACE}s; sending SIGKILL."
  kill -9 "$LMCACHE_PID" 2>/dev/null
  sleep 1
fi

if lmcache_alive "$LMCACHE_PID"; then
  echo "FAIL: LMCache server (PID: $LMCACHE_PID) could not be killed."
  # Best effort: do not leave the background bench running after we exit.
  kill "$BENCH_PID" 2>/dev/null
  exit 1
fi
echo "LMCache server killed at +${KILL_DELAY}s."

# Wait for bench to finish
echo "Waiting for bench to complete..."
if ! wait "$BENCH_PID"; then
  echo "FAIL: Bench did not complete after mid-flight LMCache kill."
  echo "--- vLLM log (last 50 lines) ---"
  tail -n 50 "/tmp/build_${BUILD_ID}_vllm.log" 2>/dev/null || true
  exit 1
fi

# ── Step 3: Bench fully without LMCache server ───────────────
# Repeats the bench with LMCache left down; on failure, dumps the tail of the
# vLLM log and aborts.
printf '%s\n' \
  "" \
  "============================================" \
  "=== Fault Tolerance Step 3: Without LMCache ===" \
  "============================================"

run_bench "Without LMCache" "ft_without_lmcache.json" || {
  printf '%s\n' \
    "FAIL: Bench failed without LMCache server." \
    "--- vLLM log (last 50 lines) ---"
  # The log may not exist on every runner; showing it is best-effort.
  tail -n 50 "/tmp/build_${BUILD_ID}_vllm.log" 2>/dev/null || true
  exit 1
}

# ── Summary ──────────────────────────────────────────────────
# Prints the completed count and duration recorded by each of the three
# bench phases.
echo ""
echo "============================================"
echo "=== Fault Tolerance Test PASSED ==="
echo "============================================"

# Print "<completed> <duration>" (duration to one decimal) for the bench
# result file at $1. The path is passed via argv, and each file is parsed
# once instead of once per field.
ft_result_fields() {
  python3 -c '
import json
import sys

with open(sys.argv[1]) as f:
    data = json.load(f)
print(data["completed"], format(data["duration"], ".1f"))
' "$1"
}

# If a file cannot be read, read leaves both variables empty and the summary
# line simply shows blanks; the pass/fail verdict was already decided above.
read -r warmup_completed warmup_duration \
  < <(ft_result_fields "$FT_DIR/ft_warmup.json")
read -r midflight_completed midflight_duration \
  < <(ft_result_fields "$FT_DIR/ft_midflight.json")
read -r without_completed without_duration \
  < <(ft_result_fields "$FT_DIR/ft_without_lmcache.json")

echo " Warmup (with LMCache): $warmup_completed/$NUM_PROMPTS in ${warmup_duration}s"
echo " Mid-flight kill: $midflight_completed/$NUM_PROMPTS in ${midflight_duration}s (killed at +${KILL_DELAY}s)"
echo " Without LMCache: $without_completed/$NUM_PROMPTS in ${without_duration}s"
echo ""