Skip to content

Commit d454c0c

Browse files
committed
feat(swiftlmchat): proactive iOS lifecycle — unload on background, reload on foreground
InferenceEngine:
- willResignActiveNotification → stopGeneration() + unload() + save backgroundedModelId
- didBecomeActiveNotification → reload backgroundedModelId (or lastLoadedModelId)
- autoOffloadOnBackground: Bool (default true on iOS, false on macOS)
- Observers consolidated into [NSObjectProtocol] for clean deinit
- Reactive memory warning still kept as safety fallback
- Thermal observer migrated to same consolidated array
- Background unload sets .idle (not .error) — clean UX on return
1 parent dc4069f commit d454c0c

2 files changed

Lines changed: 127 additions & 40 deletions

File tree

Sources/MLXInferenceCore/InferenceEngine.swift

Lines changed: 89 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -55,54 +55,117 @@ public final class InferenceEngine: ObservableObject {
5555
@Published public private(set) var state: ModelState = .idle
5656
@Published public private(set) var thermalLevel: ThermalLevel = .nominal
5757

58+
/// Whether to automatically unload the model when the app backgrounds
59+
/// and reload it when returning to foreground.
60+
/// Defaults to true on iOS (prevents jetsam), false on macOS.
61+
public var autoOffloadOnBackground: Bool = {
62+
#if os(iOS)
63+
return true
64+
#else
65+
return false
66+
#endif
67+
}()
68+
5869
/// Shared download + storage manager.
5970
public let downloadManager = ModelDownloadManager()
6071

6172
private var container: ModelContainer?
6273
private var currentModelId: String?
6374
private var generationTask: Task<Void, Never>?
64-
private var pressureObserver: NSObjectProtocol?
65-
private var thermalObserver: NSObjectProtocol?
75+
76+
// All NotificationCenter observers collected for clean deregistration
77+
private var observers: [NSObjectProtocol] = []
78+
79+
// Track the model ID that was active before we backgrounded,
80+
// so we can restore it when returning to foreground.
81+
private var backgroundedModelId: String?
6682

6783
public init() {
6884
setupPressureHandlers()
6985
}
7086

7187
deinit {
72-
if let o = pressureObserver { NotificationCenter.default.removeObserver(o) }
73-
if let o = thermalObserver { NotificationCenter.default.removeObserver(o) }
88+
observers.forEach { NotificationCenter.default.removeObserver($0) }
7489
}
7590

7691
// MARK: — Pressure Handlers
7792

7893
private func setupPressureHandlers() {
79-
// iOS memory pressure → unload model weights immediately
8094
#if canImport(UIKit)
81-
pressureObserver = NotificationCenter.default.addObserver(
82-
forName: UIApplication.didReceiveMemoryWarningNotification,
83-
object: nil,
84-
queue: .main
85-
) { [weak self] _ in
86-
Task { @MainActor [weak self] in
87-
guard let self else { return }
88-
// Only unload if not actively generating
89-
if case .generating = self.state { return }
90-
self.unload()
91-
self.state = .error("Unloaded due to memory pressure. Tap to reload.")
95+
// ── REACTIVE: Memory warning (last resort) ────────────────────────────
96+
// OS sends this *after* pressure builds. We still handle it as a fallback
97+
// in case the proactive unload wasn't triggered (e.g. app was already
98+
// under pressure from another process).
99+
observers.append(
100+
NotificationCenter.default.addObserver(
101+
forName: UIApplication.didReceiveMemoryWarningNotification,
102+
object: nil, queue: .main
103+
) { [weak self] _ in
104+
Task { @MainActor [weak self] in
105+
guard let self else { return }
106+
if case .generating = self.state { return } // don't interrupt mid-stream
107+
self.unload()
108+
self.state = .error("Unloaded due to memory pressure. Tap to reload.")
109+
}
92110
}
93-
}
111+
)
112+
113+
// ── PROACTIVE: App will background ────────────────────────────────────
114+
// Fire BEFORE iOS hands control back to springboard.
115+
// At this moment the process is still fully foregrounded — Metal context
116+
// is valid, memory limit hasn't changed. We unload now so iOS never
117+
// accumulates memory pressure against us in the background.
118+
observers.append(
119+
NotificationCenter.default.addObserver(
120+
forName: UIApplication.willResignActiveNotification,
121+
object: nil, queue: .main
122+
) { [weak self] _ in
123+
Task { @MainActor [weak self] in
124+
guard let self, self.autoOffloadOnBackground else { return }
125+
// Remember what was loaded so we can restore it
126+
self.backgroundedModelId = self.currentModelId
127+
// Stop any in-flight generation cleanly
128+
self.stopGeneration()
129+
self.unload()
130+
self.state = .idle // clean slate — no error banner on return
131+
}
132+
}
133+
)
134+
135+
// ── PROACTIVE: App returned to foreground ─────────────────────────────
136+
// Silently reload the model the user was using before they left.
137+
// We show .loading state so the chat UI doesn't appear broken.
138+
observers.append(
139+
NotificationCenter.default.addObserver(
140+
forName: UIApplication.didBecomeActiveNotification,
141+
object: nil, queue: .main
142+
) { [weak self] _ in
143+
Task { @MainActor [weak self] in
144+
guard let self, self.autoOffloadOnBackground else { return }
145+
// Prefer the model that was active when we backgrounded;
146+
// fall back to the last persisted model the user chose.
147+
let modelToReload = self.backgroundedModelId
148+
?? self.downloadManager.lastLoadedModelId
149+
self.backgroundedModelId = nil
150+
if let modelId = modelToReload {
151+
await self.load(modelId: modelId)
152+
}
153+
}
154+
}
155+
)
94156
#endif
95157

96-
// Thermal state monitoring (all platforms)
97-
thermalObserver = NotificationCenter.default.addObserver(
98-
forName: ProcessInfo.thermalStateDidChangeNotification,
99-
object: nil,
100-
queue: .main
101-
) { [weak self] _ in
102-
Task { @MainActor [weak self] in
103-
self?.updateThermalLevel()
158+
// ── Thermal state monitoring (all platforms) ──────────────────────────
159+
observers.append(
160+
NotificationCenter.default.addObserver(
161+
forName: ProcessInfo.thermalStateDidChangeNotification,
162+
object: nil, queue: .main
163+
) { [weak self] _ in
164+
Task { @MainActor [weak self] in
165+
self?.updateThermalLevel()
166+
}
104167
}
105-
}
168+
)
106169
updateThermalLevel()
107170
}
108171

Sources/SwiftLM/Server.swift

Lines changed: 38 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,10 +1065,14 @@ struct ThinkingStateTracker {
10651065

10661066
// ── Chat Streaming ───────────────────────────────────────────────────────────
10671067

1068-
/// A lightweight actor-based boolean flag used to coordinate the prefill heartbeat task.
1069-
private actor BoolFlag {
1070-
private(set) var value: Bool = false
1071-
func set() { value = true }
1068+
/// Tracks prefill progress: whether it is done, and how many tokens have been processed.
1069+
/// n_past is updated by activePrefillProgressHook (called from LLMModel.prepare after each chunk)
1070+
/// and read by the SSE heartbeat task every 2 s.
1071+
private actor PrefillState {
1072+
private(set) var done: Bool = false
1073+
private(set) var nPast: Int = 0
1074+
func finish() { done = true }
1075+
func update(nPast: Int) { self.nPast = nPast }
10721076
}
10731077

10741078
func handleChatStreaming(
@@ -1086,16 +1090,25 @@ func handleChatStreaming(
10861090
) -> Response {
10871091
let (sseStream, cont) = AsyncStream<String>.makeStream()
10881092

1089-
// ── Prefill heartbeat: emit progress events while prompt is being processed ──
1090-
// This prevents clients from seeing a silent/dead connection during long prefills.
1091-
let prefillDone = BoolFlag()
1093+
// ── Prefill heartbeat: emit llama-server-style slot_update progress every 2 s ──
1094+
// n_past is updated by activePrefillProgressHook in LLMModel.prepare() after each
1095+
// 512-token chunk; single-chunk prompts only show elapsed_seconds.
1096+
let prefillState = PrefillState()
1097+
activePrefillProgressHook = { nPast, _ in
1098+
Task { await prefillState.update(nPast: nPast) }
1099+
}
10921100
Task {
10931101
var elapsed = 0
1094-
while await !prefillDone.value {
1102+
while await !prefillState.done {
10951103
try? await Task.sleep(for: .seconds(2))
1096-
if await !prefillDone.value {
1104+
if await !prefillState.done {
10971105
elapsed += 2
1098-
_ = cont.yield(ssePrefillChunk(modelId: modelId, promptTokens: promptTokenCount, elapsedSeconds: elapsed))
1106+
let nPast = await prefillState.nPast
1107+
_ = cont.yield(ssePrefillChunk(
1108+
modelId: modelId,
1109+
nPast: nPast,
1110+
promptTokens: promptTokenCount,
1111+
elapsedSeconds: elapsed))
10991112
}
11001113
}
11011114
}
@@ -1121,7 +1134,9 @@ func handleChatStreaming(
11211134
}
11221135
// Signal first token — stops the prefill heartbeat task
11231136
if firstToken {
1124-
await prefillDone.set()
1137+
// First decode token: stop heartbeat and clear the prefill progress hook
1138+
activePrefillProgressHook = nil
1139+
await prefillState.finish()
11251140
let prefillDur = Date().timeIntervalSince(prefillStart)
11261141
let prefillTokPerSec = prefillDur > 0 ? Double(promptTokenCount) / prefillDur : 0
11271142
print("srv slot update: id 0 | prefill done | n_tokens=\(promptTokenCount), t=\(String(format: "%.2f", prefillDur))s, \(String(format: "%.1f", prefillTokPerSec))t/s")
@@ -1175,7 +1190,8 @@ func handleChatStreaming(
11751190
toolCallIndex += 1
11761191

11771192
case .info(let info):
1178-
await prefillDone.set()
1193+
activePrefillProgressHook = nil
1194+
await prefillState.finish()
11791195
if !stopped {
11801196
var reason: String
11811197
switch info.stopReason {
@@ -1711,15 +1727,23 @@ func sseChunk(modelId: String, reasoningContent: String?, content: String?, fini
17111727

17121728
/// Prefill-progress heartbeat chunk — emitted every 2 s while the server is processing the prompt.
17131729
/// Uses object type "prefill_progress" so clients can filter it without confusing it with real tokens.
1714-
func ssePrefillChunk(modelId: String, promptTokens: Int, elapsedSeconds: Int) -> String {
1730+
/// Format mirrors llama-server's slot_update event:
1731+
/// n_past : tokens evaluated so far (real value from chunked prefill, or 0 for single-chunk)
1732+
/// n_prompt_tokens : total prompt token count
1733+
/// fraction : n_past / n_prompt_tokens (0.0–1.0), useful for progress bars
1734+
/// elapsed_seconds : wall-clock time since the request started
1735+
func ssePrefillChunk(modelId: String, nPast: Int = 0, promptTokens: Int, elapsedSeconds: Int) -> String {
1736+
let fraction = promptTokens > 0 ? Double(nPast) / Double(promptTokens) : 0.0
17151737
let chunk: [String: Any] = [
17161738
"id": "prefill-\(UUID().uuidString)",
17171739
"object": "prefill_progress",
17181740
"created": Int(Date().timeIntervalSince1970),
17191741
"model": modelId,
17201742
"prefill": [
17211743
"status": "processing",
1722-
"prompt_tokens": promptTokens,
1744+
"n_past": nPast,
1745+
"n_prompt_tokens": promptTokens,
1746+
"fraction": fraction,
17231747
"elapsed_seconds": elapsedSeconds
17241748
]
17251749
]

0 commit comments

Comments (0)