Skip to content

Commit 0f34708

Browse files
authored
Merge branch 'dev' into l0
2 parents 1e46dd1 + 06981d6 commit 0f34708

42 files changed

Lines changed: 2371 additions & 452 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.buildkite/k3_harness/ci-base.Dockerfile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
# Built automatically by setup-cluster.sh and imported into K3s containerd.
55
# Rebuild when requirements/*.txt changes.
66

7-
FROM nvcr.io/nvidia/cuda-dl-base:25.03-cuda12.8-devel-ubuntu24.04
7+
FROM nvidia/cuda:13.0.2-devel-ubuntu24.04
88

99
ENV DEBIAN_FRONTEND=noninteractive
1010
ENV PATH="/opt/venv/bin:${PATH}"
@@ -15,7 +15,8 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
1515
&& apt-get install -y --no-install-recommends \
1616
ccache software-properties-common git curl sudo jq lsof \
1717
python3 python3-dev python3-venv python3-pip tzdata libxcb1-dev \
18-
&& ldconfig /usr/local/cuda-12.8/compat/ \
18+
libcudart12 \
19+
&& ldconfig \
1920
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
2021
&& mv ~/.local/bin/uv /usr/local/bin/ \
2122
&& mv ~/.local/bin/uvx /usr/local/bin/ \

.buildkite/k3_harness/setup-blend-env.sh

Lines changed: 19 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#!/usr/bin/env bash
2-
# Per-job environment setup: installs vLLM nightly + LMCache from source.
2+
# Per-job environment setup: installs vLLM (nightly cu128 wheels) + LMCache from source.
33
# Called at the start of every CI job.
44
set -euo pipefail
55

@@ -17,7 +17,7 @@ fi
1717
source "${REPO_ROOT}/.buildkite/k3_tests/common_scripts/helpers.sh"
1818
check_gpu_health 80
1919

20-
echo "--- :python: Installing vLLM nightly"
20+
echo "--- :python: Installing vLLM (nightly cu128 wheels)"
2121

2222

2323
DEFAULT_VENV_BIN="/opt/venv/bin"
@@ -51,35 +51,25 @@ else
5151
fi
5252
TEST_VENV_BIN="/workspace/.venv/bin"
5353

54-
# Resolve the latest nightly wheel URL directly from the nightly index.
55-
# PEP 440 ranks stable releases (0.17.0) above pre-release nightlies
56-
# (0.17.0rc1.devN), so pip/uv always picks the stable version when both
57-
# indexes are available. We work around this by parsing the nightly index
58-
# page and installing the wheel by URL.
59-
ARCH=$(uname -m) # x86_64 or aarch64
60-
VLLM_NIGHTLY_INDEX="https://wheels.vllm.ai/nightly/vllm/"
61-
INDEX_HTML=$(curl -sfL "$VLLM_NIGHTLY_INDEX" 2>&1) || true
62-
VLLM_NIGHTLY_URL=$(echo "$INDEX_HTML" \
63-
| grep -oP 'href="\K[^"]+'"${ARCH}"'\.whl' \
64-
| head -1) || true
65-
if [[ -z "$VLLM_NIGHTLY_URL" ]]; then
66-
echo "WARNING: Could not find vLLM nightly wheel for ${ARCH} — falling back to latest stable" >&2
67-
"${UV_BIN}" pip install -p "${TEST_VENV_BIN}/python" "vllm[runai,tensorizer,flashinfer]"
68-
else
69-
# href is relative (../../<commit>/vllm-....whl), resolve to absolute URL
70-
VLLM_WHEEL_URL="https://wheels.vllm.ai/nightly/vllm/${VLLM_NIGHTLY_URL}"
71-
echo "Resolved nightly wheel: $VLLM_WHEEL_URL"
72-
"${UV_BIN}" pip install -p "${TEST_VENV_BIN}/python" --prerelease=allow \
73-
"${VLLM_WHEEL_URL}[runai,tensorizer,flashinfer]" \
74-
--extra-index-url https://pypi.org/simple \
75-
--index-strategy unsafe-best-match
76-
fi
54+
# When flashinfer and flashinfer-cubin resolve to different patch versions, skip strict check.
55+
export FLASHINFER_DISABLE_VERSION_CHECK=1
56+
57+
"${UV_BIN}" pip install -p "${TEST_VENV_BIN}/python" -U vllm "torch==2.10.0+cu128" --pre \
58+
--extra-index-url https://wheels.vllm.ai/nightly/cu128 \
59+
--extra-index-url https://download.pytorch.org/whl/cu128 \
60+
--index-strategy unsafe-best-match
61+
7762

7863
# install LMCache from source twice, as the two venvs may have different torch versions
64+
65+
"${DEFAULT_VENV_BIN}/python" -c 'import vllm; print(f"default venv vllm={vllm.__version__}")'
66+
"${TEST_VENV_BIN}/python" -c 'import vllm; print(f"test venv vllm={vllm.__version__}")'
67+
"${DEFAULT_VENV_BIN}/python" -c 'import torch; print(f"default venv torch={torch.__version__}, torch.version.cuda={torch.version.cuda}")'
68+
"${TEST_VENV_BIN}/python" -c 'import torch; print(f"test venv torch={torch.__version__}, torch.version.cuda={torch.version.cuda}")'
69+
7970
echo "--- :python: Installing LMCache from source"
8071
"${UV_BIN}" pip install -p "${DEFAULT_VENV_BIN}/python" -e . --no-build-isolation
8172
"${UV_BIN}" pip install -p "${TEST_VENV_BIN}/python" -e . --no-build-isolation
82-
8373
# Work around openai_harmony vocab download/load issues for GPT-OSS (vLLM recipes troubleshooting).
8474
# related github issue: https://github.com/openai/harmony/pull/41
8575
TIKTOKEN_ENCODINGS_DIR="${REPO_ROOT}/tiktoken_encodings"
@@ -94,9 +84,12 @@ fi
9484
if [[ ! -s "${TIKTOKEN_ENCODINGS_DIR}/cl100k_base.tiktoken" ]]; then
9585
curl -fsSL "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" -o "${TIKTOKEN_ENCODINGS_DIR}/cl100k_base.tiktoken"
9686
fi
87+
88+
9789
export TIKTOKEN_ENCODINGS_BASE="${TIKTOKEN_ENCODINGS_DIR}"
9890
echo "Using TIKTOKEN_ENCODINGS_BASE=${TIKTOKEN_ENCODINGS_BASE}"
9991

92+
10093
echo "--- :white_check_mark: Environment ready"
10194
"${DEFAULT_VENV_BIN}/python" -c "import vllm; import lmcache; print(f'vLLM={vllm.__version__}, LMCache installed from source with no build isolation in default venv')"
10295
"${TEST_VENV_BIN}/python" -c "import vllm; import lmcache; print(f'vLLM={vllm.__version__}, LMCache installed from source with no build isolation in test venv')"

.buildkite/k3_tests/blend/scripts/run-blend-test.sh

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,16 @@ SERVER_WAIT_TIMEOUT="${SERVER_WAIT_TIMEOUT:-400}"
3030

3131
BUILD_ID="${BUILDKITE_BUILD_ID:-local_$$}"
3232
WORK_LOG="/tmp/build_${BUILD_ID}_blend.log"
33-
# Blend server, vLLM prefiller/decoder, and proxy stdout/stderr (main script uses WORK_LOG via tee).
34-
VLLM_LOG="/tmp/build_${BUILD_ID}_vllm.log"
33+
# Proxy stdout/stderr. Blend server/prefiller/decoder each get their own _blend_server/_prefiller_PORT/_decoder_PORT logs.
34+
VLLM_LOG="/tmp/build_${BUILD_ID}_proxy.log"
35+
BLEND_SERVER_LOG="/tmp/build_${BUILD_ID}_blend_server.log"
3536
ARTIFACT="build_${BUILD_ID}.log"
3637
# Benchmark wall-clock limit (seconds). Exit 124 from `timeout` => failure. Default stays under blend pipeline 90m.
3738
BENCHMARK_TIMEOUT_SEC="${BENCHMARK_TIMEOUT_SEC:-4800}"
3839

3940
: > "${WORK_LOG}"
4041
: > "${VLLM_LOG}"
42+
: > "${BLEND_SERVER_LOG}"
4143

4244
declare -A RESERVED_PORTS=()
4345

@@ -89,7 +91,7 @@ resolve_port_csv() {
8991

9092
collect_artifact() {
9193
echo "[INFO] Collecting logs into ${ARTIFACT}"
92-
cat "${WORK_LOG}" "${VLLM_LOG}" > "${ARTIFACT}" 2>/dev/null || true
94+
cat /tmp/build_"${BUILD_ID}"_*.log > "${ARTIFACT}" 2>/dev/null || true
9395
}
9496

9597
finalize() {
@@ -198,7 +200,7 @@ export LD_LIBRARY_PATH=/opt/nvidia/nsight-compute/2025.1.0/host/linux-desktop-gl
198200
--eviction-policy LRU \
199201
--chunk-size 1024 \
200202
--l1-align-bytes 16777216 \
201-
>>"${VLLM_LOG}" 2>&1 &
203+
>>"${BLEND_SERVER_LOG}" 2>&1 &
202204
TRACKED_PIDS+=($!)
203205

204206
sleep 10
@@ -209,6 +211,8 @@ GPU_IDX=0
209211
for port in "${PREFILLER_PORTS[@]}"; do
210212
GPU_END=$((GPU_IDX + TENSOR_PARALLEL - 1))
211213
CUDA_DEVS=$(seq -s, "$GPU_IDX" "$GPU_END")
214+
PREFILLER_LOG="/tmp/build_${BUILD_ID}_prefiller_${port}.log"
215+
: > "${PREFILLER_LOG}"
212216
echo "Starting prefiller on GPUs ${CUDA_DEVS}, port ${port}"
213217
CUDA_VISIBLE_DEVICES=$CUDA_DEVS \
214218
LMCACHE_REQUEST_TELEMETRY_TYPE=fastapi \
@@ -226,7 +230,7 @@ for port in "${PREFILLER_PORTS[@]}"; do
226230
--gpu-memory-utilization "$GPU_MEM_UTIL" \
227231
--kv-transfer-config \
228232
"{\"kv_connector\":\"LMCacheMPCBConnector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.port\":${LMCACHE_MP_PORT}}}" \
229-
>>"${VLLM_LOG}" 2>&1 &
233+
>>"${PREFILLER_LOG}" 2>&1 &
230234
TRACKED_PIDS+=($!)
231235
GPU_IDX=$((GPU_IDX + TENSOR_PARALLEL))
232236
done
@@ -238,6 +242,8 @@ done
238242
for port in "${DECODER_PORTS[@]}"; do
239243
GPU_END=$((GPU_IDX + TENSOR_PARALLEL - 1))
240244
CUDA_DEVS=$(seq -s, "$GPU_IDX" "$GPU_END")
245+
DECODER_LOG="/tmp/build_${BUILD_ID}_decoder_${port}.log"
246+
: > "${DECODER_LOG}"
241247
echo "Starting decoder on GPUs ${CUDA_DEVS}, port ${port}"
242248
CUDA_VISIBLE_DEVICES=$CUDA_DEVS \
243249
VLLM_USE_FLASHINFER_MOE_FP8=0 \
@@ -252,7 +258,7 @@ for port in "${DECODER_PORTS[@]}"; do
252258
--gpu-memory-utilization "$GPU_MEM_UTIL" \
253259
--kv-transfer-config \
254260
"{\"kv_connector\":\"LMCacheMPConnector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.port\":${LMCACHE_MP_PORT}}}" \
255-
>>"${VLLM_LOG}" 2>&1 &
261+
>>"${DECODER_LOG}" 2>&1 &
256262
TRACKED_PIDS+=($!)
257263
GPU_IDX=$((GPU_IDX + TENSOR_PARALLEL))
258264
done

.buildkite/k3_tests/multiprocess/pipeline.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,3 +50,10 @@ steps:
5050
agents: { queue: "k8s" }
5151
plugins: [{ kubernetes: { podSpec: *pod-2gpu } }]
5252
artifact_paths: ["*.log"]
53+
54+
- label: ":compression: deadlock"
55+
command: .buildkite/k3_tests/multiprocess/run.sh deadlock
56+
timeout_in_minutes: 30
57+
agents: { queue: "k8s" }
58+
plugins: [{ kubernetes: { podSpec: *pod-2gpu } }]
59+
artifact_paths: ["*.log"]

.buildkite/k3_tests/multiprocess/run.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
# No Docker -- all processes run natively in the pod.
77
set -euo pipefail
88

9-
TEST_NAME="${1:?Usage: $0 <test_name> (lm_eval|vllm_bench|long_doc_qa|long_doc_qa_l2|fault_tolerance)}"
9+
TEST_NAME="${1:?Usage: $0 <test_name> (lm_eval|vllm_bench|long_doc_qa|long_doc_qa_l2|fault_tolerance|deadlock)}"
1010
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
1111
REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
1212

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
#!/usr/bin/env bash
# Self-contained deadlock regression test.
#
# Launches DeepSeek-V2-Lite-Chat with TP=2 (both GPUs) + LMCache server,
# sends 50 requests with ~30K token prefixes, and verifies they all
# complete within 3 minutes. A CUDA-driver/GIL deadlock would cause
# requests to hang indefinitely, failing the timeout.
#
# This test is self-contained: it handles its own server lifecycle
# instead of using the standard launch-processes.sh / wait-for-servers.sh.
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)"

source "${REPO_ROOT}/.buildkite/k3_tests/common_scripts/helpers.sh"

# ── Configuration ───────────────────────────────────────────
MODEL="deepseek-ai/DeepSeek-V2-Lite-Chat"
LMCACHE_PORT="${LMCACHE_PORT:-15554}"
VLLM_PORT="${VLLM_PORT:-8000}"
BUILD_ID="${BUILD_ID:-local_$$}"
PID_FILE="/tmp/lmcache_mp_pids_${BUILD_ID}"
TIMEOUT_SECONDS=180 # 3 minutes

# ── Install py-spy for deadlock diagnosis ──────────────────
echo "=== Installing py-spy ==="
uv pip install py-spy
# 'command -v' is the portable/builtin replacement for 'which'.
PY_SPY="$(command -v py-spy)"
echo "py-spy installed at: $PY_SPY"

PYSPY_LOG="/tmp/build_${BUILD_ID}_pyspy.log"

# ── Helper: dump stacks of server processes via py-spy ─────
# Dumps native + Python stacks of both the LMCache server and the vLLM
# API server (when running) so a hang can be attributed to either side.
# Output is teed to $PYSPY_LOG and copied next to REPO_ROOT so
# cleanup.sh collects it as a Buildkite artifact.
dump_stacks() {
  echo "" | tee -a "$PYSPY_LOG"
  echo "=== py-spy stack dump (native + Python) ===" | tee -a "$PYSPY_LOG"

  if [[ -n "${LMCACHE_PID:-}" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then
    echo "" | tee -a "$PYSPY_LOG"
    echo "--- LMCache server (PID=$LMCACHE_PID) ---" | tee -a "$PYSPY_LOG"
    sudo "$PY_SPY" dump --pid "$LMCACHE_PID" --native 2>&1 | tee -a "$PYSPY_LOG" || true
  fi

  # Also dump the vLLM server: the CUDA-driver/GIL deadlock this test
  # guards against lives on the vLLM side.
  if [[ -n "${VLLM_PID:-}" ]] && kill -0 "$VLLM_PID" 2>/dev/null; then
    echo "" | tee -a "$PYSPY_LOG"
    echo "--- vLLM server (PID=$VLLM_PID) ---" | tee -a "$PYSPY_LOG"
    sudo "$PY_SPY" dump --pid "$VLLM_PID" --native 2>&1 | tee -a "$PYSPY_LOG" || true
  fi

  # Copy to repo root so cleanup.sh collects it as a Buildkite artifact
  cp "$PYSPY_LOG" "${REPO_ROOT}/build_${BUILD_ID}_pyspy.log" 2>/dev/null || true
}

# ── 1. Launch LMCache server ───────────────────────────────
echo "=== Launching LMCache server ==="
echo "Port: $LMCACHE_PORT"

lmcache server \
  --host localhost \
  --port "$LMCACHE_PORT" \
  --chunk-size 256 \
  --l1-size-gb 50 \
  --eviction-policy LRU \
  --max-workers 2 \
  > "/tmp/build_${BUILD_ID}_lmcache.log" 2>&1 &

LMCACHE_PID=$!
echo "$LMCACHE_PID" >> "$PID_FILE"
echo "LMCache server started (PID=$LMCACHE_PID)"
sleep 10

# ── 2. Launch vLLM with DeepSeek TP=2 ─────────────────────
echo "=== Launching vLLM (DeepSeek TP=2) ==="
echo "Model: $MODEL"
echo "Port: $VLLM_PORT"

# Save VLLM_PORT before unsetting — vLLM's internal get_open_port()
# would otherwise collide with the serving port for torch.distributed.
SAVED_VLLM_PORT="$VLLM_PORT"
unset VLLM_PORT

FLASHINFER_DISABLE_VERSION_CHECK=1 \
VLLM_SERVER_DEV_MODE=1 \
vllm serve "$MODEL" \
  --tensor-parallel-size 2 \
  --distributed-executor-backend mp \
  --block-size 64 \
  --trust-remote-code \
  --load-format dummy \
  --enable-prefix-caching \
  --enable-chunked-prefill \
  --gpu-memory-utilization 0.8 \
  --max-model-len 65536 \
  --hf-overrides '{"max_position_embeddings":65536}' \
  --max-num-seqs 32 \
  --max-num-batched-tokens 16000 \
  --scheduling-policy fcfs \
  --port "$SAVED_VLLM_PORT" \
  --enforce-eager \
  --kv-transfer-config "{\"kv_connector\":\"LMCacheMPConnector\", \"kv_role\":\"kv_both\", \"kv_load_failure_policy\": \"recompute\", \"kv_connector_extra_config\": {\"lmcache.mp.port\": $LMCACHE_PORT, \"lmcache.mp.mq_timeout\": 60}}" \
  > "/tmp/build_${BUILD_ID}_vllm.log" 2>&1 &

VLLM_PID=$!
echo "$VLLM_PID" >> "$PID_FILE"
echo "vLLM started (PID=$VLLM_PID)"

VLLM_PORT="$SAVED_VLLM_PORT"

# ── 3. Wait for vLLM to be ready ──────────────────────────
echo "=== Waiting for vLLM to be ready ==="
if ! wait_for_server "$VLLM_PORT" 600; then
  echo "vLLM failed to start. Last 100 lines of log:"
  tail -100 "/tmp/build_${BUILD_ID}_vllm.log" 2>/dev/null || true
  exit 1
fi

# ── 4. Run benchmark with timeout ─────────────────────────
echo "=== Running lmcache bench engine (random-prefill, 50 reqs, ~30K tokens) ==="
echo "Timeout: ${TIMEOUT_SECONDS}s"

if ! timeout "$TIMEOUT_SECONDS" lmcache bench engine \
  --engine-url "http://localhost:${VLLM_PORT}" \
  --workload random-prefill \
  --tokens-per-gb-kvcache 6000 \
  --rp-request-length 30000 \
  --rp-num-requests 50 \
  --no-interactive \
  --no-csv \
  -q; then
  echo "FAIL: Benchmark failed or timed out (possible deadlock)"
  # Capture stacks while the (possibly deadlocked) processes are still
  # alive — this was the whole point of installing py-spy above.
  dump_stacks
  echo ""
  echo "=== LMCache log (last 50 lines) ==="
  tail -50 "/tmp/build_${BUILD_ID}_lmcache.log" 2>/dev/null || true
  echo ""
  echo "=== vLLM log (last 50 lines) ==="
  tail -50 "/tmp/build_${BUILD_ID}_vllm.log" 2>/dev/null || true
  exit 1
fi

echo ""
echo "=== Benchmark completed within ${TIMEOUT_SECONDS}s ==="
echo "PASS: No deadlock detected"

.buildkite/k3_tests/multiprocess/scripts/run-single-test.sh

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -42,25 +42,30 @@ echo "vLLM baseline port: $VLLM_BASELINE_PORT"
4242
echo "Results dir: $RESULTS_DIR"
4343
echo ""
4444

45-
# ── Step 1: Launch native processes ──────────────────────────
46-
echo "============================================"
47-
echo "=== Launching native processes ==="
48-
echo "============================================"
49-
if ! "${SCRIPT_DIR}/launch-processes.sh"; then
50-
echo "Failed to launch processes"
51-
exit 1
52-
fi
53-
echo ""
45+
# Tests that handle their own server lifecycle (different GPU/model config)
46+
SELF_CONTAINED_TESTS=" deadlock "
5447

55-
# ── Step 2: Wait for vLLM to be ready ───────────────────────
56-
echo "============================================"
57-
echo "=== Waiting for vLLM to be ready ==="
58-
echo "============================================"
59-
if ! "${SCRIPT_DIR}/wait-for-servers.sh"; then
60-
echo "vLLM failed to become ready"
61-
exit 1
48+
if [[ "$SELF_CONTAINED_TESTS" != *" $TEST_NAME "* ]]; then
49+
# ── Step 1: Launch native processes ──────────────────────────
50+
echo "============================================"
51+
echo "=== Launching native processes ==="
52+
echo "============================================"
53+
if ! "${SCRIPT_DIR}/launch-processes.sh"; then
54+
echo "Failed to launch processes"
55+
exit 1
56+
fi
57+
echo ""
58+
59+
# ── Step 2: Wait for vLLM to be ready ───────────────────────
60+
echo "============================================"
61+
echo "=== Waiting for vLLM to be ready ==="
62+
echo "============================================"
63+
if ! "${SCRIPT_DIR}/wait-for-servers.sh"; then
64+
echo "vLLM failed to become ready"
65+
exit 1
66+
fi
67+
echo ""
6268
fi
63-
echo ""
6469

6570
# ── Step 3: Run the requested test ──────────────────────────
6671
echo "============================================"
@@ -83,9 +88,12 @@ case "$TEST_NAME" in
8388
fault_tolerance)
8489
exec_script="${SCRIPT_DIR}/run-fault-tolerance.sh"
8590
;;
91+
deadlock)
92+
exec_script="${SCRIPT_DIR}/run-deadlock.sh"
93+
;;
8694
*)
8795
echo "Unknown test: $TEST_NAME"
88-
echo "Valid tests: lm_eval, vllm_bench, long_doc_qa, long_doc_qa_l2, fault_tolerance"
96+
echo "Valid tests: lm_eval, vllm_bench, long_doc_qa, long_doc_qa_l2, fault_tolerance, deadlock"
8997
exit 1
9098
;;
9199
esac

.buildkite/pipeline.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ steps:
5050
--ignore=tests/v1/test_nixl_storage.py \
5151
--ignore=tests/skipped \
5252
--ignore=tests/v1/multiprocess \
53+
--ignore=tests/v1/mp_observability/test_event_recorder.py \
5354
--ignore=tests/v1/storage_backend/test_eic.py
5455
fi
5556

0 commit comments

Comments
 (0)