Commit eac5ab3

fix: Prevent crash on full prompt cache hit (100% match)
When a repeated prompt matched 100% of the cached tokens, the remaining token slice was empty (0 tokens). Passing this empty slice to the model caused a '[reshape] Cannot infer the shape of an empty array' fatal error. Fix: replay the last cached token (trimming the KV cache back by 1) so the model always receives at least one token and can produce next-token logits.
1 parent: 32dd183

1 file changed

Sources/SwiftLM/Server.swift (9 additions, 1 deletion)
@@ -961,7 +961,15 @@ func handleChatCompletion(
     if let cachedCount = await promptCache.restore(newTokens: promptTokens, into: cache) {
         // Cache hit: KV state is pre-populated up to cachedCount tokens.
         // Only compute the remaining (new) tokens.
-        let remainingTokens = lmInput.text.tokens[cachedCount...]
+        var startIndex = cachedCount
+        if startIndex >= lmInput.text.tokens.count {
+            // Full match: all tokens are cached. We still need to feed at least
+            // the last token so the model can produce next-token logits.
+            startIndex = lmInput.text.tokens.count - 1
+            // Trim the KV cache back by 1 to avoid double-counting the replayed token.
+            for layer in cache { layer.trim(1) }
+        }
+        let remainingTokens = lmInput.text.tokens[startIndex...]
         let trimmedInput = LMInput(tokens: remainingTokens)
         return try MLXLMCommon.generate(
             input: trimmedInput, cache: cache, parameters: params, context: context
