@@ -292,13 +292,26 @@ def GetStoreMetadata(
292292 # Store the blocks that have block hashes
293293 # NOTE: the invariant here is that `num_stored_blocks` should
294294 # always be a multiple of `blocks_in_chunk`
295- # TODO: This should be checked everytime we update the num_stored_blocks
295+ # TODO: This should be checked every time we update
296+ # the num_stored_blocks
297+ #
298+ # Why computed_blocks includes num_lmcache_hit_blocks:
299+ #
300+ # Include lmcache-hit blocks so that the upper bound
301+ # matches num_stored_blocks (which already covers
302+ # them). Hit blocks are NOT re-stored.
303+ computed_blocks = (
304+ tracker .num_scheduled_tokens // vllm_block_size
305+ + tracker .num_lmcache_hit_blocks
306+ )
296307 min_available_blocks = min (
297308 len (tracker .block_hashes ),
298309 len (tracker .allocated_block_ids ),
299- tracker .num_scheduled_tokens // vllm_block_size ,
310+ computed_blocks ,
311+ )
312+ num_staging_blocks = (
313+ min_available_blocks - tracker .num_stored_blocks
300314 )
301- num_staging_blocks = min_available_blocks - tracker .num_stored_blocks
302315 num_chunks = num_staging_blocks // blocks_in_chunk
303316
304317 if num_chunks >= 1 :
@@ -996,8 +1009,11 @@ def _process_cached_requests(
9961009 if request_id not in cached_reqs .resumed_req_ids :
9971010 request_tracker .append_block_ids (new_block_ids )
9981011
999- # Update new scheduled tokens
1000- num_new_tokens = cached_reqs .num_computed_tokens [idx ]
1012+ # Use the incremental num_scheduled_tokens to
1013+ # stay consistent with _process_new_requests.
1014+ num_new_tokens = (
1015+ scheduler_output .num_scheduled_tokens [request_id ]
1016+ )
10011017 request_tracker .increase_num_scheduled_tokens (num_new_tokens )
10021018
10031019 r_meta = LMCacheMPRequestMetadata .GetStoreMetadata (
0 commit comments