|
24 | 24 | from tests.utils.deployment_graph import ( |
25 | 25 | DeploymentGraph, |
26 | 26 | Payload, |
27 | | - completions_response_handler, |
28 | 27 | chat_completions_response_handler, |
| 28 | + completions_response_handler, |
29 | 29 | ) |
30 | 30 | from tests.utils.managed_process import ManagedProcess |
31 | 31 |
|
|
88 | 88 | config="configs/agg.yaml", |
89 | 89 | directory="/workspace/examples/llm", |
90 | 90 | endpoints=["v1/chat/completions", "v1/completions"], |
91 | | - response_handlers=[chat_completions_response_handler, completions_response_handler], |
| 91 | + response_handlers=[ |
| 92 | + chat_completions_response_handler, |
| 93 | + completions_response_handler, |
| 94 | + ], |
92 | 95 | marks=[pytest.mark.gpu_1, pytest.mark.vllm], |
93 | 96 | ), |
94 | 97 | text_payload, |
|
99 | 102 | config="configs/agg.yaml", |
100 | 103 | directory="/workspace/examples/sglang", |
101 | 104 | endpoints=["v1/chat/completions", "v1/completions"], |
102 | | - response_handlers=[chat_completions_response_handler, completions_response_handler], |
| 105 | + response_handlers=[ |
| 106 | + chat_completions_response_handler, |
| 107 | + completions_response_handler, |
| 108 | + ], |
103 | 109 | marks=[pytest.mark.gpu_1, pytest.mark.sglang], |
104 | 110 | ), |
105 | 111 | text_payload, |
|
110 | 116 | config="configs/disagg.yaml", |
111 | 117 | directory="/workspace/examples/llm", |
112 | 118 | endpoints=["v1/chat/completions", "v1/completions"], |
113 | | - response_handlers=[chat_completions_response_handler, completions_response_handler], |
| 119 | + response_handlers=[ |
| 120 | + chat_completions_response_handler, |
| 121 | + completions_response_handler, |
| 122 | + ], |
114 | 123 | marks=[pytest.mark.gpu_2, pytest.mark.vllm], |
115 | 124 | ), |
116 | 125 | text_payload, |
|
121 | 130 | config="configs/agg_router.yaml", |
122 | 131 | directory="/workspace/examples/llm", |
123 | 132 | endpoints=["v1/chat/completions", "v1/completions"], |
124 | | - response_handlers=[chat_completions_response_handler, completions_response_handler], |
| 133 | + response_handlers=[ |
| 134 | + chat_completions_response_handler, |
| 135 | + completions_response_handler, |
| 136 | + ], |
125 | 137 | marks=[pytest.mark.gpu_1, pytest.mark.vllm], |
126 | 138 | ), |
127 | 139 | text_payload, |
|
132 | 144 | config="configs/disagg_router.yaml", |
133 | 145 | directory="/workspace/examples/llm", |
134 | 146 | endpoints=["v1/chat/completions", "v1/completions"], |
135 | | - response_handlers=[chat_completions_response_handler, completions_response_handler], |
| 147 | + response_handlers=[ |
| 148 | + chat_completions_response_handler, |
| 149 | + completions_response_handler, |
| 150 | + ], |
136 | 151 | marks=[pytest.mark.gpu_2, pytest.mark.vllm], |
137 | 152 | ), |
138 | 153 | text_payload, |
|
143 | 158 | config="configs/agg.yaml", |
144 | 159 | directory="/workspace/examples/multimodal", |
145 | 160 | endpoints=["v1/chat/completions", "v1/completions"], |
146 | | - response_handlers=[chat_completions_response_handler, completions_response_handler], |
| 161 | + response_handlers=[ |
| 162 | + chat_completions_response_handler, |
| 163 | + completions_response_handler, |
| 164 | + ], |
147 | 165 | marks=[pytest.mark.gpu_2, pytest.mark.vllm], |
148 | 166 | ), |
149 | 167 | multimodal_payload, |
|
154 | 172 | config="configs/agg.yaml", |
155 | 173 | directory="/workspace/examples/vllm_v1", |
156 | 174 | endpoints=["v1/chat/completions", "v1/completions"], |
157 | | - response_handlers=[chat_completions_response_handler, completions_response_handler], |
| 175 | + response_handlers=[ |
| 176 | + chat_completions_response_handler, |
| 177 | + completions_response_handler, |
| 178 | + ], |
158 | 179 | marks=[pytest.mark.gpu_1, pytest.mark.vllm], |
159 | 180 | ), |
160 | 181 | text_payload, |
|
165 | 186 | config="configs/agg.yaml", |
166 | 187 | directory="/workspace/examples/tensorrt_llm", |
167 | 188 | endpoints=["v1/chat/completions", "v1/completions"], |
168 | | - response_handlers=[chat_completions_response_handler, completions_response_handler], |
| 189 | + response_handlers=[ |
| 190 | + chat_completions_response_handler, |
| 191 | + completions_response_handler, |
| 192 | + ], |
169 | 193 | marks=[pytest.mark.gpu_1, pytest.mark.tensorrtllm], |
170 | 194 | ), |
171 | 195 | text_payload, |
|
176 | 200 | config="configs/agg_router.yaml", |
177 | 201 | directory="/workspace/examples/tensorrt_llm", |
178 | 202 | endpoints=["v1/chat/completions", "v1/completions"], |
179 | | - response_handlers=[chat_completions_response_handler, completions_response_handler], |
| 203 | + response_handlers=[ |
| 204 | + chat_completions_response_handler, |
| 205 | + completions_response_handler, |
| 206 | + ], |
180 | 207 | marks=[pytest.mark.gpu_1, pytest.mark.tensorrtllm], |
181 | 208 | # FIXME: This is a hack to allow deployments to start before sending any requests. |
182 | 209 | # When using KV-router, if all the endpoints are not registered, the service |
|
191 | 218 | config="configs/disagg.yaml", |
192 | 219 | directory="/workspace/examples/tensorrt_llm", |
193 | 220 | endpoints=["v1/chat/completions", "v1/completions"], |
194 | | - response_handlers=[chat_completions_response_handler, completions_response_handler], |
| 221 | + response_handlers=[ |
| 222 | + chat_completions_response_handler, |
| 223 | + completions_response_handler, |
| 224 | + ], |
195 | 225 | marks=[pytest.mark.gpu_2, pytest.mark.tensorrtllm], |
196 | 226 | ), |
197 | 227 | text_payload, |
|
202 | 232 | config="configs/disagg_router.yaml", |
203 | 233 | directory="/workspace/examples/tensorrt_llm", |
204 | 234 | endpoints=["v1/chat/completions", "v1/completions"], |
205 | | - response_handlers=[chat_completions_response_handler, completions_response_handler], |
| 235 | + response_handlers=[ |
| 236 | + chat_completions_response_handler, |
| 237 | + completions_response_handler, |
| 238 | + ], |
206 | 239 | marks=[pytest.mark.gpu_2, pytest.mark.tensorrtllm], |
207 | 240 | # FIXME: This is a hack to allow deployments to start before sending any requests. |
208 | 241 | # When using KV-router, if all the endpoints are not registered, the service |
@@ -301,17 +334,27 @@ def check_response(response, response_handler): |
301 | 334 | assert content, "Empty response content" |
302 | 335 | for expected in payload.expected_response: |
303 | 336 | assert expected in content, "Expected '%s' not found in response" % expected |
| 337 | + |
304 | 338 | with DynamoServeProcess(deployment_graph, request) as server_process: |
305 | 339 | first_success_pending = True |
306 | | - for endpoint, response_handler in zip(deployment_graph.endpoints, deployment_graph.response_handlers): |
| 340 | + for endpoint, response_handler in zip( |
| 341 | + deployment_graph.endpoints, deployment_graph.response_handlers |
| 342 | + ): |
307 | 343 | url = f"http://localhost:{server_process.port}/{endpoint}" |
308 | 344 | start_time = time.time() |
309 | 345 | retry_delay = 5 |
310 | 346 | elapsed = 0.0 |
311 | | - request_body = payload.payload_chat if endpoint == "v1/chat/completions" else payload.payload_completions |
| 347 | + request_body = ( |
| 348 | + payload.payload_chat |
| 349 | + if endpoint == "v1/chat/completions" |
| 350 | + else payload.payload_completions |
| 351 | + ) |
312 | 352 |
|
313 | | - # We can skip this |
314 | | - while time.time() - start_time < deployment_graph.timeout and first_success_pending: |
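| 353 | + # The retry loop below runs only while first_success_pending is True; once it is False the loop is skipped for the remaining endpoints. |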
| 354 | + while ( |
| 355 | + time.time() - start_time < deployment_graph.timeout |
| 356 | + and first_success_pending |
| 357 | + ): |
315 | 358 | elapsed = time.time() - start_time |
316 | 359 | try: |
317 | 360 | response = requests.post( |
|
0 commit comments