|
#!/usr/bin/env bash
# Self-contained deadlock regression test.
#
# Launches DeepSeek-V2-Lite-Chat with TP=2 (both GPUs) + LMCache server,
# sends 50 requests with ~30K token prefixes, and verifies they all
# complete within 3 minutes. A CUDA-driver/GIL deadlock would cause
# requests to hang indefinitely, failing the timeout.
#
# This test is self-contained: it handles its own server lifecycle
# instead of using the standard launch-processes.sh / wait-for-servers.sh.
#
# NOTE: deliberately NOT 'set -e' — on failure we still want the
# diagnostic paths (log tails, stack dumps) to run rather than abort
# at the first non-zero status.
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)"

# Provides wait_for_server used below.
source "${REPO_ROOT}/.buildkite/k3_tests/common_scripts/helpers.sh"

# ── Configuration ─────────────────────────────────────────
readonly MODEL="deepseek-ai/DeepSeek-V2-Lite-Chat"
LMCACHE_PORT="${LMCACHE_PORT:-15554}"
# NOT readonly: VLLM_PORT is unset and restored around 'vllm serve' below.
VLLM_PORT="${VLLM_PORT:-8000}"
BUILD_ID="${BUILD_ID:-local_$$}"
readonly PID_FILE="/tmp/lmcache_mp_pids_${BUILD_ID}"
readonly TIMEOUT_SECONDS=180 # 3 minutes
| 25 | + |
# ── Install py-spy for deadlock diagnosis ────────────────
# Best-effort: a missing py-spy must not fail the regression test itself;
# it only means stack dumps are unavailable if a deadlock occurs.
echo "=== Installing py-spy ==="
if ! uv pip install py-spy; then
  echo "WARNING: py-spy install failed; stack dumps will be unavailable" >&2
fi
# 'command -v' is the portable replacement for 'which'; empty on failure.
PY_SPY="$(command -v py-spy || true)"
echo "py-spy installed at: ${PY_SPY:-<not found>}"

PYSPY_LOG="/tmp/build_${BUILD_ID}_pyspy.log"
| 34 | +# ── Helper: dump stacks of server processes via py-spy ───── |
| 35 | +dump_stacks() { |
| 36 | + echo "" | tee -a "$PYSPY_LOG" |
| 37 | + echo "=== py-spy stack dump (native + Python) ===" | tee -a "$PYSPY_LOG" |
| 38 | + |
| 39 | + if kill -0 "$LMCACHE_PID" 2>/dev/null; then |
| 40 | + echo "" | tee -a "$PYSPY_LOG" |
| 41 | + echo "--- LMCache server (PID=$LMCACHE_PID) ---" | tee -a "$PYSPY_LOG" |
| 42 | + sudo "$PY_SPY" dump --pid "$LMCACHE_PID" --native 2>&1 | tee -a "$PYSPY_LOG" || true |
| 43 | + fi |
| 44 | + |
| 45 | + # Copy to repo root so cleanup.sh collects it as a Buildkite artifact |
| 46 | + cp "$PYSPY_LOG" "${REPO_ROOT}/build_${BUILD_ID}_pyspy.log" 2>/dev/null || true |
| 47 | +} |
| 48 | + |
# ── 1. Launch LMCache server ─────────────────────────────
echo "=== Launching LMCache server ==="
echo "Port: $LMCACHE_PORT"

# Server flags collected in an array so the command line stays readable.
lmcache_args=(
  --host localhost
  --port "$LMCACHE_PORT"
  --chunk-size 256
  --l1-size-gb 50
  --eviction-policy LRU
  --max-workers 2
)
lmcache server "${lmcache_args[@]}" > "/tmp/build_${BUILD_ID}_lmcache.log" 2>&1 &

LMCACHE_PID=$!
echo "$LMCACHE_PID" >> "$PID_FILE"
echo "LMCache server started (PID=$LMCACHE_PID)"
# Give the server a moment to bind its port before vLLM connects to it.
sleep 10
| 66 | + |
# ── 2. Launch vLLM with DeepSeek TP=2 ───────────────────
echo "=== Launching vLLM (DeepSeek TP=2) ==="
echo "Model: $MODEL"
echo "Port: $VLLM_PORT"

# Save VLLM_PORT before unsetting — vLLM's internal get_open_port()
# would otherwise collide with the serving port for torch.distributed.
# The serving port is still passed explicitly via --port below, and
# VLLM_PORT is restored after launch so later steps can use it.
SAVED_VLLM_PORT="$VLLM_PORT"
unset VLLM_PORT

# Per-command env vars (not exported globally):
#   FLASHINFER_DISABLE_VERSION_CHECK — skip flashinfer version pinning.
#   VLLM_SERVER_DEV_MODE             — presumably enables dev-only server
#                                      endpoints; verify against vLLM docs.
# --load-format dummy: random weights; this test only needs scheduling /
# KV-transfer behavior, not model quality.
# --kv-transfer-config wires the LMCacheMPConnector to the LMCache server
# launched above (same LMCACHE_PORT), with recompute on load failure.
FLASHINFER_DISABLE_VERSION_CHECK=1 \
VLLM_SERVER_DEV_MODE=1 \
vllm serve "$MODEL" \
  --tensor-parallel-size 2 \
  --distributed-executor-backend mp \
  --block-size 64 \
  --trust-remote-code \
  --load-format dummy \
  --enable-prefix-caching \
  --enable-chunked-prefill \
  --gpu-memory-utilization 0.8 \
  --max-model-len 65536 \
  --hf-overrides '{"max_position_embeddings":65536}' \
  --max-num-seqs 32 \
  --max-num-batched-tokens 16000 \
  --scheduling-policy fcfs \
  --port "$SAVED_VLLM_PORT" \
  --enforce-eager \
  --kv-transfer-config "{\"kv_connector\":\"LMCacheMPConnector\", \"kv_role\":\"kv_both\", \"kv_load_failure_policy\": \"recompute\", \"kv_connector_extra_config\": {\"lmcache.mp.port\": $LMCACHE_PORT, \"lmcache.mp.mq_timeout\": 60}}" \
  > "/tmp/build_${BUILD_ID}_vllm.log" 2>&1 &

VLLM_PID=$!
echo "$VLLM_PID" >> "$PID_FILE"
echo "vLLM started (PID=$VLLM_PID)"

# Restore VLLM_PORT for the readiness check and benchmark URL below.
VLLM_PORT="$SAVED_VLLM_PORT"
| 103 | + |
# ── 3. Wait for vLLM to be ready ────────────────────────
# wait_for_server comes from the sourced helpers.sh; 600s startup budget.
echo "=== Waiting for vLLM to be ready ==="
if ! wait_for_server "$VLLM_PORT" 600; then
  echo "vLLM failed to start. Last 100 lines of log:"
  tail -100 "/tmp/build_${BUILD_ID}_vllm.log" 2>/dev/null || true
  # The KV connector talks to the LMCache server during startup, so an
  # LMCache-side failure can surface here — show its log too.
  echo ""
  echo "=== LMCache log (last 50 lines) ==="
  tail -50 "/tmp/build_${BUILD_ID}_lmcache.log" 2>/dev/null || true
  exit 1
fi
| 111 | + |
# ── 4. Run benchmark with timeout ───────────────────────
# 50 requests with ~30K-token prefixes; 'timeout' converts a deadlock
# (requests hanging forever) into a non-zero exit within TIMEOUT_SECONDS.
echo "=== Running lmcache bench engine (random-prefill, 50 reqs, ~30K tokens) ==="
echo "Timeout: ${TIMEOUT_SECONDS}s"

if ! timeout "$TIMEOUT_SECONDS" lmcache bench engine \
  --engine-url "http://localhost:${VLLM_PORT}" \
  --workload random-prefill \
  --tokens-per-gb-kvcache 6000 \
  --rp-request-length 30000 \
  --rp-num-requests 50 \
  --no-interactive \
  --no-csv \
  -q; then
  echo "FAIL: Benchmark failed or timed out (possible deadlock)"
  # BUG FIX: py-spy was installed and dump_stacks defined for exactly this
  # situation, but the failure path never invoked it — capture the stacks
  # while the (possibly deadlocked) processes are still alive, before the
  # log tails.
  dump_stacks
  echo ""
  echo "=== LMCache log (last 50 lines) ==="
  tail -50 "/tmp/build_${BUILD_ID}_lmcache.log" 2>/dev/null || true
  echo ""
  echo "=== vLLM log (last 50 lines) ==="
  tail -50 "/tmp/build_${BUILD_ID}_vllm.log" 2>/dev/null || true
  exit 1
fi

echo ""
echo "=== Benchmark completed within ${TIMEOUT_SECONDS}s ==="
echo "PASS: No deadlock detected"