Commit 4c5e54b
feat: log full JSON response body matching llama-server log_server_r format

Both streaming and non-streaming handlers now emit:

    srv log_server_r: response: {complete JSON on one line}

Non-streaming: logs the exact encoded response JSON.
Streaming: logs a reconstructed response JSON with choices/message/content/usage/timings.

Removes the previous truncated 120-char preview log.
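For illustration only (the values below are made up, and JSONSerialization does not guarantee key order), a streaming completion would produce a single log line of roughly this shape:

    srv log_server_r: response: {"choices":[{"index":0,"message":{"role":"assistant","content":"Hello!"},"finish_reason":"stop"}],"usage":{"prompt_tokens":12,"completion_tokens":3,"total_tokens":15},"timings":{"predicted_per_second":42.51}}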
1 parent d3da36e commit 4c5e54b

1 file changed: Sources/mlx-server/Server.swift (23 additions, 8 deletions)
@@ -990,12 +990,26 @@ func handleChatStreaming(
             }
             cont.yield("data: [DONE]\n\n")
             cont.finish()
-            // llama-server style generation log
+            // llama-server style: log full response JSON on one line
             let dur = Date().timeIntervalSince(genStart)
             let tokPerSec = dur > 0 ? Double(completionTokenCount) / dur : 0
-            let preview = String(fullText.prefix(120)).replacingOccurrences(of: "\n", with: " ")
-            let suffix = fullText.count > 120 ? "..." : ""
-            print("[mlx-server] prompt=\(promptTokenCount)t, gen=\(completionTokenCount)t, speed=\(String(format: "%.2f", tokPerSec))t/s [stream] | \(preview)\(suffix)")
+            let logResp: [String: Any] = [
+                "choices": [[
+                    "index": 0,
+                    "message": ["role": "assistant", "content": fullText],
+                    "finish_reason": reason
+                ]],
+                "usage": [
+                    "prompt_tokens": promptTokenCount,
+                    "completion_tokens": completionTokenCount,
+                    "total_tokens": promptTokenCount + completionTokenCount
+                ],
+                "timings": ["predicted_per_second": tokPerSec]
+            ]
+            if let logData = try? JSONSerialization.data(withJSONObject: logResp),
+               let logStr = String(data: logData, encoding: .utf8) {
+                print("srv log_server_r: response: \(logStr)")
+            }
         }
     }
 }
@@ -1053,10 +1067,7 @@ func handleChatNonStreaming(
     await stats.requestFinished(tokens: completionTokenCount, duration: duration)
     await semaphore.signal()

-    // ── llama-server style generation log ──
-    let tokPerSec = duration > 0 ? Double(completionTokenCount) / duration : 0
-    let outputPreview = fullText.prefix(120).replacingOccurrences(of: "\n", with: " ")
-    print("[mlx-server] prompt=\(promptTokenCount)t, gen=\(completionTokenCount)t, speed=\(String(format: "%.2f", tokPerSec))t/s | \(outputPreview)\(fullText.count > 120 ? "..." : "")")
+

     // ── Apply stop sequences to final text ──
     var finishReason: String
@@ -1104,6 +1115,10 @@ func handleChatNonStreaming(
         usage: TokenUsage(promptTokens: promptTokenCount, completionTokens: completionTokenCount, totalTokens: totalTokens)
     )
     let encoded = try JSONEncoder().encode(resp)
+    // llama-server style: log full response JSON on one line
+    if let responseStr = String(data: encoded, encoding: .utf8) {
+        print("srv log_server_r: response: \(responseStr)")
+    }
     return Response(
         status: .ok,
         headers: jsonHeaders(),
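A quick way to sanity-check the new format is to strip the log prefix and round-trip the payload through JSONSerialization. The helper below is not part of this commit; its name and usage are assumptions, a minimal sketch only:

    import Foundation

    // Hypothetical helper (not in this commit): recover the JSON object from a
    // "srv log_server_r: response: {...}" log line, e.g. in a log-scraping test.
    func parseServerResponseLog(_ line: String) -> [String: Any]? {
        let prefix = "srv log_server_r: response: "
        guard line.hasPrefix(prefix) else { return nil }
        let payload = String(line.dropFirst(prefix.count))
        guard let data = payload.data(using: .utf8) else { return nil }
        return (try? JSONSerialization.jsonObject(with: data)) as? [String: Any]
    }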
