ENTERPILOT
diff --git a/internal/responsecache/handle_request_test.go b/internal/responsecache/handle_request_test.go
Lines changed: 168 additions & 18 deletions
diff --git a/internal/responsecache/middleware_test.go b/internal/responsecache/middleware_test.go
Lines changed: 81 additions & 29 deletions
diff --git a/internal/responsecache/responsecache.go b/internal/responsecache/responsecache.go
Lines changed: 2 additions & 2 deletions
@@ -331,7 +331,7 @@ func TestHandleRequest_CacheControlNoCacheBypassesAllLayers(t *testing.T) {
 	}
 }
 
-func TestHandleRequest_StreamingMissPopulatesExactCacheAcrossModes(t *testing.T) {
+func TestHandleRequest_StreamingMissPopulatesExactStreamingCacheOnly(t *testing.T) {
 	store := cache.NewMapStore()
 	defer store.Close()
 
@@ -341,6 +341,11 @@ func TestHandleRequest_StreamingMissPopulatesExactCacheAcrossModes(t *testing.T)
 
 	streamBody := []byte(`{"model":"gpt-4","stream":true,"messages":[{"role":"user","content":"cache-streaming-cross-mode"}]}`)
 	jsonBody := []byte(`{"model":"gpt-4","messages":[{"role":"user","content":"cache-streaming-cross-mode"}]}`)
+	rawStream := []byte(
+		"data: {\"id\":\"chatcmpl-stream-cache\",\"object\":\"chat.completion.chunk\",\"created\":1234567890,\"model\":\"gpt-4\",\"provider\":\"openai\",\"choices\":[{\"index\":0,\"delta\":{\"role\":\"assistant\",\"content\":\"Hello\"},\"finish_reason\":null}]}\n\n" +
+			"data: {\"id\":\"chatcmpl-stream-cache\",\"object\":\"chat.completion.chunk\",\"created\":1234567890,\"model\":\"gpt-4\",\"provider\":\"openai\",\"choices\":[{\"index\":0,\"delta\":{\"content\":\" world\"},\"finish_reason\":\"stop\"}],\"usage\":{\"prompt_tokens\":11,\"completion_tokens\":2,\"total_tokens\":13}}\n\n" +
+			"data: [DONE]\n\n",
+	)
 	e := echo.New()
 	handlerCalls := 0
 
@@ -360,12 +365,13 @@ func TestHandleRequest_StreamingMissPopulatesExactCacheAcrossModes(t *testing.T)
 		c.SetRequest(req.WithContext(core.WithExecutionPlan(req.Context(), plan)))
 		if err := m.HandleRequest(c, body, func() error {
 			handlerCalls++
-			c.Response().Header().Set("Content-Type", "text/event-stream")
-			c.Response().WriteHeader(http.StatusOK)
-			_, _ = c.Response().Write([]byte("data: {\"id\":\"chatcmpl-stream-cache\",\"object\":\"chat.completion.chunk\",\"created\":1234567890,\"model\":\"gpt-4\",\"provider\":\"openai\",\"choices\":[{\"index\":0,\"delta\":{\"role\":\"assistant\",\"content\":\"Hello\"},\"finish_reason\":null}]}\n\n"))
-			_, _ = c.Response().Write([]byte("data: {\"id\":\"chatcmpl-stream-cache\",\"object\":\"chat.completion.chunk\",\"created\":1234567890,\"model\":\"gpt-4\",\"provider\":\"openai\",\"choices\":[{\"index\":0,\"delta\":{\"content\":\" world\"},\"finish_reason\":\"stop\"}],\"usage\":{\"prompt_tokens\":11,\"completion_tokens\":2,\"total_tokens\":13}}\n\n"))
-			_, _ = c.Response().Write([]byte("data: [DONE]\n\n"))
-			return nil
+			if isStreamingRequest(c.Request().URL.Path, body) {
+				c.Response().Header().Set("Content-Type", "text/event-stream")
+				c.Response().WriteHeader(http.StatusOK)
+				_, _ = c.Response().Write(rawStream)
+				return nil
+			}
+			return c.JSON(http.StatusOK, map[string]string{"mode": "json"})
 		}); err != nil {
 			t.Fatalf("HandleRequest: %v", err)
 		}
@@ -382,36 +388,180 @@ func TestHandleRequest_StreamingMissPopulatesExactCacheAcrossModes(t *testing.T)
 	if handlerCalls != 1 {
 		t.Fatalf("expected 1 handler invocation after streaming miss, got %d", handlerCalls)
 	}
+	if !bytes.Equal(rec1.Body.Bytes(), rawStream) {
+		t.Fatalf("streaming miss body = %q, want original SSE payload", rec1.Body.String())
+	}
 
 	m.simple.wg.Wait()
 
 	rec2 := run(jsonBody)
-	if got := rec2.Header().Get("X-Cache"); got != "HIT (exact)" {
-		t.Fatalf("non-streaming follow-up should hit exact cache, got X-Cache=%q", got)
+	if got := rec2.Header().Get("X-Cache"); got != "" {
+		t.Fatalf("non-streaming follow-up should miss exact cache because stream mode is keyed separately, got X-Cache=%q", got)
 	}
 	if got := rec2.Header().Get("Content-Type"); got != "application/json" {
-		t.Fatalf("non-streaming hit Content-Type = %q, want application/json", got)
+		t.Fatalf("non-streaming miss Content-Type = %q, want application/json", got)
 	}
-	if !bytes.Contains(rec2.Body.Bytes(), []byte(`"content":"Hello world"`)) {
-		t.Fatalf("non-streaming cache hit body = %q, want reconstructed JSON response", rec2.Body.String())
+	if !bytes.Contains(rec2.Body.Bytes(), []byte(`"mode":"json"`)) {
+		t.Fatalf("non-streaming miss body = %q, want JSON response", rec2.Body.String())
 	}
-	if handlerCalls != 1 {
-		t.Fatalf("non-streaming exact hit should not call handler again, got %d calls", handlerCalls)
+	if handlerCalls != 2 {
+		t.Fatalf("non-streaming miss should call handler again, got %d calls", handlerCalls)
 	}
 
+	m.simple.wg.Wait()
+
 	rec3 := run(streamBody)
 	if got := rec3.Header().Get("X-Cache"); got != "HIT (exact)" {
-		t.Fatalf("streaming follow-up should hit exact cache, got X-Cache=%q", got)
+		t.Fatalf("streaming follow-up should hit its own exact cache entry, got X-Cache=%q", got)
 	}
 	if got := rec3.Header().Get("Content-Type"); got != "text/event-stream" {
 		t.Fatalf("streaming hit Content-Type = %q, want text/event-stream", got)
 	}
-	if !bytes.Contains(rec3.Body.Bytes(), []byte("Hello world")) || !bytes.Contains(rec3.Body.Bytes(), []byte("[DONE]")) {
-		t.Fatalf("streaming cache hit body = %q, want synthesized SSE with content and [DONE]", rec3.Body.String())
+	if !bytes.Equal(rec3.Body.Bytes(), rawStream) {
+		t.Fatalf("streaming cache hit body = %q, want verbatim SSE replay", rec3.Body.String())
 	}
-	if handlerCalls != 1 {
+	if handlerCalls != 2 {
 		t.Fatalf("streaming exact hit should not call handler again, got %d calls", handlerCalls)
 	}
+
+	rec4 := run(jsonBody)
+	if got := rec4.Header().Get("X-Cache"); got != "HIT (exact)" {
+		t.Fatalf("non-streaming follow-up should hit its own exact cache entry, got X-Cache=%q", got)
+	}
+	if got := rec4.Header().Get("Content-Type"); got != "application/json" {
+		t.Fatalf("non-streaming hit Content-Type = %q, want application/json", got)
+	}
+	if !bytes.Contains(rec4.Body.Bytes(), []byte(`"mode":"json"`)) {
+		t.Fatalf("non-streaming cache hit body = %q, want cached JSON response", rec4.Body.String())
+	}
+	if handlerCalls != 2 {
+		t.Fatalf("non-streaming exact hit should not call handler again, got %d calls", handlerCalls)
+	}
+}
+
+func TestHandleRequest_StreamingExactHitWritesSyntheticUsageEntry(t *testing.T) { // Runs the same streaming request twice and checks the usage entry recorded for the exact-cache hit.
+	store := cache.NewMapStore()
+	defer store.Close()
+
+	logger := &recordingUsageLogger{} // its entries slice is inspected after the second request
+	m := &ResponseCacheMiddleware{
+		simple: newSimpleCacheMiddleware(store, time.Hour, newUsageHitRecorder(logger, nil)),
+	}
+
+	body := []byte(`{"model":"gpt-4","stream":true,"messages":[{"role":"user","content":"cache-stream-usage-hit"}]}`)
+	rawStream := []byte( // SSE payload the handler writes; the second chunk carries the usage block asserted below
+		"data: {\"id\":\"chatcmpl-cache-hit\",\"object\":\"chat.completion.chunk\",\"model\":\"gpt-4\",\"choices\":[{\"index\":0,\"delta\":{\"content\":\"Hello\"},\"finish_reason\":null}]}\n\n" +
+			"data: {\"id\":\"chatcmpl-cache-hit\",\"object\":\"chat.completion.chunk\",\"model\":\"gpt-4\",\"choices\":[{\"index\":0,\"delta\":{},\"finish_reason\":\"stop\"}],\"usage\":{\"prompt_tokens\":11,\"completion_tokens\":5,\"total_tokens\":16}}\n\n" +
+			"data: [DONE]\n\n",
+	)
+	e := echo.New()
+
+	run := func() *httptest.ResponseRecorder { // issues one POST through HandleRequest and returns the recorder
+		t.Helper()
+		req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", bytes.NewReader(body))
+		req.Header.Set("Content-Type", "application/json")
+		rec := httptest.NewRecorder()
+		c := e.NewContext(req, rec)
+		plan := &core.ExecutionPlan{
+			Mode:         core.ExecutionModeTranslated,
+			ProviderType: "openai",
+			Resolution: &core.RequestModelResolution{
+				ResolvedSelector: core.ModelSelector{Provider: "openai", Model: "gpt-4"},
+			},
+		}
+		c.SetRequest(req.WithContext(core.WithExecutionPlan(req.Context(), plan))) // attach the execution plan to the request context
+		if err := m.HandleRequest(c, body, func() error { // stand-in for the upstream call: always writes rawStream as text/event-stream
+			c.Response().Header().Set("Content-Type", "text/event-stream")
+			c.Response().WriteHeader(http.StatusOK)
+			_, _ = c.Response().Write(rawStream)
+			return nil
+		}); err != nil {
+			t.Fatalf("HandleRequest: %v", err)
+		}
+		return rec
+	}
+
+	rec1 := run()
+	if got := rec1.Header().Get("X-Cache"); got != "" { // an empty X-Cache header is treated as a miss
+		t.Fatalf("first request should miss exact cache, got X-Cache=%q", got)
+	}
+
+	m.simple.wg.Wait() // presumably lets a background cache write finish before the follow-up request; TODO confirm
+
+	rec2 := run()
+	if got := rec2.Header().Get("X-Cache"); got != "HIT (exact)" {
+		t.Fatalf("second request should be exact hit, got X-Cache=%q", got)
+	}
+	if len(logger.entries) != 1 { // exactly one entry is expected after one miss and one hit
+		t.Fatalf("expected 1 synthetic usage entry, got %d", len(logger.entries))
+	}
+	entry := logger.entries[0]
+	if entry.CacheType != usage.CacheTypeExact {
+		t.Fatalf("CacheType = %q, want %q", entry.CacheType, usage.CacheTypeExact)
+	}
+	if entry.InputTokens != 11 || entry.OutputTokens != 5 || entry.TotalTokens != 16 { // same numbers as the usage object in rawStream
+		t.Fatalf("unexpected tokens: %+v", entry)
+	}
+	if entry.ProviderID != "chatcmpl-cache-hit" { // same value as the chunk id in rawStream
+		t.Fatalf("ProviderID = %q, want chatcmpl-cache-hit", entry.ProviderID)
+	}
+}
+
+func TestHandleRequest_InvalidStreamingBodySkipsExactCacheWrite(t *testing.T) { // Sends the same streaming request twice with a truncated SSE response and expects both to reach the handler.
+	store := cache.NewMapStore()
+	defer store.Close()
+
+	m := &ResponseCacheMiddleware{
+		simple: newSimpleCacheMiddleware(store, time.Hour, nil),
+	}
+
+	body := []byte(`{"model":"gpt-4","stream":true,"messages":[{"role":"user","content":"invalid-stream-cache"}]}`)
+	invalidStream := []byte( // a single chunk with finish_reason null and no [DONE] line
+		"data: {\"id\":\"chatcmpl-invalid\",\"object\":\"chat.completion.chunk\",\"model\":\"gpt-4\",\"choices\":[{\"index\":0,\"delta\":{\"content\":\"partial\"},\"finish_reason\":null}]}\n\n",
+	)
+	e := echo.New()
+	handlerCalls := 0 // incremented each time the handler closure below runs
+
+	run := func() *httptest.ResponseRecorder { // issues one POST through HandleRequest and returns the recorder
+		t.Helper()
+		req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", bytes.NewReader(body))
+		req.Header.Set("Content-Type", "application/json")
+		rec := httptest.NewRecorder()
+		c := e.NewContext(req, rec)
+		plan := &core.ExecutionPlan{
+			Mode:         core.ExecutionModeTranslated,
+			ProviderType: "openai",
+			Resolution: &core.RequestModelResolution{
+				ResolvedSelector: core.ModelSelector{Provider: "openai", Model: "gpt-4"},
+			},
+		}
+		c.SetRequest(req.WithContext(core.WithExecutionPlan(req.Context(), plan))) // attach the execution plan to the request context
+		if err := m.HandleRequest(c, body, func() error { // stand-in for the upstream call: always writes invalidStream as text/event-stream
+			handlerCalls++
+			c.Response().Header().Set("Content-Type", "text/event-stream")
+			c.Response().WriteHeader(http.StatusOK)
+			_, _ = c.Response().Write(invalidStream)
+			return nil
+		}); err != nil {
+			t.Fatalf("HandleRequest: %v", err)
+		}
+		return rec
+	}
+
+	rec1 := run()
+	if got := rec1.Header().Get("X-Cache"); got != "" { // an empty X-Cache header is treated as a miss
+		t.Fatalf("first request should miss cache, got X-Cache=%q", got)
+	}
+
+	m.simple.wg.Wait() // presumably lets any background cache write finish before the follow-up request; TODO confirm
+
+	rec2 := run()
+	if got := rec2.Header().Get("X-Cache"); got != "" { // the follow-up must also be a miss
+		t.Fatalf("invalid streaming body should not be cached, got X-Cache=%q", got)
+	}
+	if handlerCalls != 2 { // both requests must have reached the handler
+		t.Fatalf("expected invalid stream to bypass cache on follow-up, got %d calls", handlerCalls)
+	}
 }
 
 func TestReconstructStreamingResponse_PreservesChatReasoningContent(t *testing.T) {
 
@@ -219,38 +219,54 @@ func TestHashRequest_StreamIncludeUsageChangesKey(t *testing.T) {
 	}
 }
 
-func TestSimpleCacheMiddleware_SharesCacheAcrossStreamingAndNonStreaming(t *testing.T) {
+func TestHashRequest_StreamModeChangesKey(t *testing.T) { // Checks that adding stream:true to an otherwise identical body changes the hashRequest result.
+	base := []byte(`{"model":"gpt-4","messages":[{"role":"user","content":"hi"}]}`)
+	streaming := []byte(`{"model":"gpt-4","stream":true,"messages":[{"role":"user","content":"hi"}]}`)
+	plan := &core.ExecutionPlan{ // the same plan is passed to both calls, so only the body differs
+		Mode:         core.ExecutionModeTranslated,
+		ProviderType: "openai",
+		Resolution: &core.RequestModelResolution{
+			ResolvedSelector: core.ModelSelector{Provider: "openai", Model: "gpt-4"},
+		},
+	}
+
+	first := hashRequest("/v1/chat/completions", base, plan)
+	second := hashRequest("/v1/chat/completions", streaming, plan)
+
+	if first == second { // equal results would mean the stream flag is not reflected in the key
+		t.Fatal("stream mode should affect the exact cache key")
+	}
+}
+
+func TestSimpleCacheMiddleware_SeparatesStreamingAndNonStreamingEntries(t *testing.T) {
 	store := cache.NewMapStore()
 	defer store.Close()
 	mw := NewResponseCacheMiddlewareWithStore(store, time.Hour)
 	e := echo.New()
 	installResolvedExecutionPlan(e, "openai", "gpt-4")
 	e.Use(mw.Middleware())
 	callCount := 0
+	rawStream := []byte(
+		"data: {\"id\":\"chatcmpl-stream\",\"object\":\"chat.completion.chunk\",\"created\":1234567890,\"model\":\"gpt-4\",\"choices\":[{\"index\":0,\"delta\":{\"role\":\"assistant\",\"content\":\"streamed\"},\"finish_reason\":null}]}\n\n" +
+			"data: {\"id\":\"chatcmpl-stream\",\"object\":\"chat.completion.chunk\",\"created\":1234567890,\"model\":\"gpt-4\",\"choices\":[{\"index\":0,\"delta\":{},\"finish_reason\":\"stop\"}],\"usage\":{\"prompt_tokens\":9,\"completion_tokens\":1,\"total_tokens\":10}}\n\n" +
+			"data: [DONE]\n\n",
+	)
 	e.POST("/v1/chat/completions", func(c *echo.Context) error {
 		callCount++
-		return c.JSON(http.StatusOK, &core.ChatResponse{
-			ID:       "chatcmpl-shared-cache",
-			Object:   "chat.completion",
-			Model:    "gpt-4",
-			Provider: "openai",
-			Created:  1234567890,
-			Choices: []core.Choice{
-				{
-					Index: 0,
-					Message: core.ResponseMessage{
-						Role:    "assistant",
-						Content: "shared cached response",
-					},
-					FinishReason: "stop",
-				},
-			},
-			Usage: core.Usage{
-				PromptTokens:     9,
-				CompletionTokens: 3,
-				TotalTokens:      12,
-			},
-		})
+		body, cacheable, err := requestBodyForCache(c.Request())
+		if err != nil {
+			t.Fatalf("requestBodyForCache: %v", err)
+		}
+		if !cacheable {
+			t.Fatal("expected request to be cacheable")
+		}
+		if isStreamingRequest(c.Request().URL.Path, body) {
+			c.Response().Header().Set("Content-Type", "text/event-stream")
+			c.Response().WriteHeader(http.StatusOK)
+			_, _ = c.Response().Write(rawStream)
+			return nil
+		}
+		return c.JSON(http.StatusOK, map[string]string{"result": "json cached response"})
 	})
 
 	nonStreamingBody := []byte(`{"model":"gpt-4","messages":[{"role":"user","content":"hi"}]}`)
@@ -270,17 +286,53 @@ func TestSimpleCacheMiddleware_SharesCacheAcrossStreamingAndNonStreaming(t *test
 	req2.Header.Set("Content-Type", "application/json")
 	rec2 := httptest.NewRecorder()
 	e.ServeHTTP(rec2, req2)
-	if got := rec2.Header().Get("X-Cache"); got != "HIT (exact)" {
-		t.Fatalf("streaming request should reuse cached full response, got X-Cache=%q", got)
+	if got := rec2.Header().Get("X-Cache"); got != "" {
+		t.Fatalf("streaming request should miss exact cache because stream mode is keyed separately, got X-Cache=%q", got)
 	}
 	if got := rec2.Header().Get("Content-Type"); got != "text/event-stream" {
+		t.Fatalf("streaming miss Content-Type = %q, want text/event-stream", got)
+	}
+	if !bytes.Equal(rec2.Body.Bytes(), rawStream) {
+		t.Fatalf("streaming miss body = %q, want original SSE payload", rec2.Body.String())
+	}
+	if callCount != 2 {
+		t.Fatalf("expected separate stream miss to call handler again, got %d calls", callCount)
+	}
+
+	mw.simple.wg.Wait()
+
+	req3 := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", bytes.NewReader(streamingBody))
+	req3.Header.Set("Content-Type", "application/json")
+	rec3 := httptest.NewRecorder()
+	e.ServeHTTP(rec3, req3)
+	if got := rec3.Header().Get("X-Cache"); got != "HIT (exact)" {
+		t.Fatalf("streaming follow-up should hit its own exact cache entry, got X-Cache=%q", got)
+	}
+	if got := rec3.Header().Get("Content-Type"); got != "text/event-stream" {
 		t.Fatalf("streaming cache hit Content-Type = %q, want text/event-stream", got)
 	}
-	if !bytes.Contains(rec2.Body.Bytes(), []byte("shared cached response")) || !bytes.Contains(rec2.Body.Bytes(), []byte("[DONE]")) {
-		t.Fatalf("streaming cache hit body = %q, want synthesized SSE", rec2.Body.String())
+	if !bytes.Equal(rec3.Body.Bytes(), rawStream) {
+		t.Fatalf("streaming cache hit body = %q, want verbatim SSE replay", rec3.Body.String())
 	}
-	if callCount != 1 {
-		t.Fatalf("expected streaming replay to avoid second handler call, got %d calls", callCount)
+	if callCount != 2 {
+		t.Fatalf("expected streaming replay to avoid a third handler call, got %d calls", callCount)
+	}
+
+	req4 := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", bytes.NewReader(nonStreamingBody))
+	req4.Header.Set("Content-Type", "application/json")
+	rec4 := httptest.NewRecorder()
+	e.ServeHTTP(rec4, req4)
+	if got := rec4.Header().Get("X-Cache"); got != "HIT (exact)" {
+		t.Fatalf("non-streaming follow-up should hit its own exact cache entry, got X-Cache=%q", got)
+	}
+	if got := rec4.Header().Get("Content-Type"); got != "application/json" {
+		t.Fatalf("non-streaming cache hit Content-Type = %q, want application/json", got)
+	}
+	if !bytes.Contains(rec4.Body.Bytes(), []byte("json cached response")) {
+		t.Fatalf("non-streaming cache hit body = %q, want cached JSON response", rec4.Body.String())
+	}
+	if callCount != 2 {
+		t.Fatalf("non-streaming exact hit should not call handler again, got %d calls", callCount)
 	}
 }
 
 
@@ -94,8 +94,8 @@ func (m *ResponseCacheMiddleware) Middleware() echo.MiddlewareFunc {
 // HandleRequest runs the full dual-layer cache check (exact then semantic) for a
 // translated inference request that has already been guardrail-patched.
 // body is the final patched request bytes; next is the real LLM call.
-// Streaming misses are reconstructed into full JSON before storage; streaming
-// hits replay that stored JSON as synthetic SSE.
+// Streaming and non-streaming requests are cached independently. Streaming
+// misses persist raw SSE bytes and replay them verbatim on cache hits.
 func (m *ResponseCacheMiddleware) HandleRequest(c *echo.Context, body []byte, next func() error) error {
 	if m == nil {
 		return next()