feat(metrics): expose SSD Flash-Stream stats to /metrics endpoint

solderzzc · solderzzc · commit 26d23194a713 · 2026-03-30T14:33:33.000-07:00
Stack: C++ (moe_stream_op.cpp): Add 4 atomics: 3 lifetime totals (bytes, ns, chunks), accumulated per expert-chunk load and never reset, plus window_throughput_mbs, which is overwritten with the window's throughput after each 10 s log window. Implement extern C mlx_ssd_metrics_snapshot() with full struct definition in the .cpp TU. C ABI (fast.h + include/mlx/c/fast.h): Declare MlxSSDMetricsSnapshot typedef + mlx_ssd_metrics_snapshot() in both the mlx-c copy and the Swift-visible umbrella header. moe_stream_op.h keeps only a forward declaration + extern C bridge (avoids a struct redefinition error at compile time). Swift (MLXFast.swift): New MLXFast.SSDMetricsSnapshot struct + MLXFast.ssdMetricsSnapshot() calling through to the C function. Server.swift: /metrics emits 4 new Prometheus gauges/counters when the server is started with --stream-experts: swiftlm_ssd_throughput_mbps (gauge, 10 s rolling average) swiftlm_ssd_bytes_read_total (counter, lifetime) swiftlm_ssd_chunks_total (counter, lifetime) swiftlm_ssd_chunk_latency_ms (gauge, lifetime average) Example when SSD streaming is active: $ curl http://127.0.0.1:8080/metrics | grep ssd swiftlm_ssd_throughput_mbps 3456.0 swiftlm_ssd_bytes_read_total 1234567890 swiftlm_ssd_chunks_total 82340 swiftlm_ssd_chunk_latency_ms 0.0028
diff --git a/LocalPackages/mlx-swift/Source/Cmlx/include/mlx/c/fast.h b/LocalPackages/mlx-swift/Source/Cmlx/include/mlx/c/fast.h
@@ -226,6 +226,19 @@ int mlx_fast_pread_into(
 
 /**@}*/
 
+// ── SSD Flash-Stream metrics snapshot ────────────────────────────────────────
+// Cumulative NVMe throughput stats since process start.
+// Call mlx_ssd_metrics_snapshot() from any thread to read without resetting counters.
+
+typedef struct MlxSSDMetricsSnapshot {
+    double   throughput_mb_per_s;  /* Last closed 10-s window, MB/s (MB = 1024*1024 bytes); 0 before first window */
+    uint64_t total_bytes_read;     /* Lifetime bytes read from SSD; never reset */
+    uint64_t total_chunks;         /* Lifetime expert chunks loaded; never reset */
+    double   avg_chunk_latency_ms; /* Lifetime average per-chunk latency (ms); 0 until a chunk has been timed */
+} MlxSSDMetricsSnapshot;
+
+void mlx_ssd_metrics_snapshot(MlxSSDMetricsSnapshot* out);  /* Fills *out; does nothing when out is NULL */
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/LocalPackages/mlx-swift/Source/Cmlx/mlx-c/mlx/c/fast.h b/LocalPackages/mlx-swift/Source/Cmlx/mlx-c/mlx/c/fast.h
@@ -231,6 +231,19 @@ int mlx_fast_pread_into(
 
 /**@}*/
 
+// ── SSD Flash-Stream metrics snapshot ────────────────────────────────────────
+// Cumulative NVMe throughput stats since process start.
+// Call mlx_ssd_metrics_snapshot() from any thread to read without resetting counters.
+
+typedef struct MlxSSDMetricsSnapshot {
+    double   throughput_mb_per_s;  /* Last closed 10-s window, MB/s (MB = 1024*1024 bytes); 0 before first window */
+    uint64_t total_bytes_read;     /* Lifetime bytes read from SSD; never reset */
+    uint64_t total_chunks;         /* Lifetime expert chunks loaded; never reset */
+    double   avg_chunk_latency_ms; /* Lifetime average per-chunk latency (ms); 0 until a chunk has been timed */
+} MlxSSDMetricsSnapshot;
+
+void mlx_ssd_metrics_snapshot(MlxSSDMetricsSnapshot* out);  /* Fills *out; does nothing when out is NULL */
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/LocalPackages/mlx-swift/Source/Cmlx/mlx/mlx/core/moe_stream_op.cpp b/LocalPackages/mlx-swift/Source/Cmlx/mlx/mlx/core/moe_stream_op.cpp
@@ -12,11 +12,18 @@
 #include "mlx/backend/metal/utils.h"
 
 // Static SSD metric trackers for aggregate logging
-static std::atomic<size_t> g_total_bytes_read{0};
+static std::atomic<size_t>   g_total_bytes_read{0};
 static std::atomic<uint64_t> g_total_read_ns{0};
-static std::atomic<size_t> g_read_count{0};
+static std::atomic<size_t>   g_read_count{0};
 static std::atomic<uint64_t> g_last_log_ns{0};
 
+// Lifetime totals: never reset; mlx_ssd_metrics_snapshot() reads them for the /metrics endpoint
+static std::atomic<uint64_t> g_lifetime_bytes{0};   // sum of matrix_bytes over every expert-chunk load
+static std::atomic<uint64_t> g_lifetime_ns{0};      // sum of (end_read - start_read) per load, in ns
+static std::atomic<uint64_t> g_lifetime_chunks{0};  // incremented by 1 per expert-chunk load
+// Last-window throughput (MB/s, MB = 1024*1024 bytes) — written after each 10s window closes
+static std::atomic<double>   g_window_throughput_mbs{0.0};
+
 namespace mlx::core {
 
 class LoadSSDExpert : public Primitive {
@@ -57,6 +64,11 @@ class LoadSSDExpert : public Primitive {
         streamer_->load_sync(block_offset, matrix_bytes, o.data<void>());
         auto end_read = std::chrono::high_resolution_clock::now();
 
+        // Accumulate into lifetime totals (never reset)
+        g_lifetime_bytes  += matrix_bytes;
+        g_lifetime_ns     += static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::nanoseconds>(end_read - start_read).count());
+        g_lifetime_chunks += 1;
+
         // ─────────────────────────────────────────────────────────────────────
         // AGGREGATE LOGGING — 10-second metric intervals, printed to stderr so
         // the metric lines never interleave with the stdout token stream.
@@ -80,6 +92,8 @@ class LoadSSDExpert : public Primitive {
                     double elapsed_s  = ns_t / 1e9;
                     double throughput_mbs = (bytes / (1024.0 * 1024.0)) / elapsed_s;
                     double avg_ms_per_chunk = (ns_t / 1'000'000.0) / count;
+                    // Persist for /metrics endpoint (non-resetting)
+                    g_window_throughput_mbs.store(throughput_mbs, std::memory_order_relaxed);
                     // Print to stderr — never touches the stdout token stream
                     std::cerr << "[⚡️ SSD Stream] "
                               << std::fixed << std::setprecision(0);
@@ -141,3 +155,28 @@ MLX_API array streamed_gather_mm(
 }
 
 } // namespace mlx::core
+
+// ── C-ABI metrics accessor (no name mangling — callable from Swift) ───────────
+// This TU carries its own definition of the struct; mlx/c/fast.h has the matching
+// typedef. Keep field order and types identical in both so the layouts stay compatible.
+struct MlxSSDMetricsSnapshot {
+    double   throughput_mb_per_s;   // throughput of the last closed 10 s window (MB/s)
+    uint64_t total_bytes_read;      // lifetime bytes read
+    uint64_t total_chunks;          // lifetime expert-chunk loads
+    double   avg_chunk_latency_ms;  // lifetime read time / chunks, in ms
+};
+
+extern "C" void mlx_ssd_metrics_snapshot(struct MlxSSDMetricsSnapshot* out) {
+    if (out == nullptr) { return; }  // no destination to fill
+    // Load each counter exactly once; the four loads are independent of one another.
+    const uint64_t life_bytes  = g_lifetime_bytes.load(std::memory_order_acquire);
+    const uint64_t life_ns     = g_lifetime_ns.load(std::memory_order_acquire);
+    const uint64_t life_chunks = g_lifetime_chunks.load(std::memory_order_acquire);
+    const double   window_mbs  = g_window_throughput_mbs.load(std::memory_order_acquire);
+    const bool has_samples = life_ns > 0 && life_chunks > 0;  // avoids dividing by zero
+    out->avg_chunk_latency_ms =
+        has_samples ? (life_ns / 1'000'000.0) / static_cast<double>(life_chunks) : 0.0;
+    out->throughput_mb_per_s  = window_mbs;
+    out->total_chunks         = life_chunks;
+    out->total_bytes_read     = life_bytes;
+}
diff --git a/LocalPackages/mlx-swift/Source/Cmlx/mlx/mlx/core/moe_stream_op.h b/LocalPackages/mlx-swift/Source/Cmlx/mlx/mlx/core/moe_stream_op.h
@@ -11,6 +11,8 @@
 #include "mlx/utils.h"
 #include <memory>
 #include <vector>
+#include <cstdint>
+#include <cstddef>
 
 namespace mlx::core {
 
@@ -36,3 +38,18 @@ MLX_API array streamed_gather_mm(
 );
 
 } // namespace mlx::core
+
+// ── Metrics snapshot forward declaration ─────────────────────────────────────
+// The struct layout lives in mlx/c/fast.h (Swift-visible Cmlx umbrella) and is
+// mirrored in moe_stream_op.cpp; a forward declaration is all this header needs.
+// extern "C" gives the declaration C linkage, matching the .cpp definition.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct MlxSSDMetricsSnapshot;
+void mlx_ssd_metrics_snapshot(struct MlxSSDMetricsSnapshot* out);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/LocalPackages/mlx-swift/Source/MLX/MLXFast.swift b/LocalPackages/mlx-swift/Source/MLX/MLXFast.swift
@@ -278,6 +278,34 @@ public enum MLXFast {
         return (kTuple, vTuple)
     }
 
+    // ── SSD Flash-Stream Metrics ──────────────────────────────────────────────
+
+    /// Immutable copy of the cumulative SSD streaming stats at the moment of the call.
+    /// Produced by `ssdMetricsSnapshot()`; holds plain values only.
+    public struct SSDMetricsSnapshot: Sendable {
+        /// Throughput of the most recently closed 10-second window (MB/s).
+        /// Zero until the first 10 s window has elapsed.
+        public let throughputMBperS: Double
+        /// Lifetime bytes loaded from SSD since process start.
+        public let totalBytesRead:   UInt64
+        /// Lifetime expert chunks loaded from SSD since process start.
+        public let totalChunks:      UInt64
+        /// Lifetime average latency per expert chunk (ms); zero until a chunk has been timed.
+        public let avgChunkLatencyMS: Double
+    }
+
+    /// Fetches the SSD Flash-Stream counters from the C layer; no counter is reset.
+    public static func ssdMetricsSnapshot() -> SSDMetricsSnapshot {
+        var cSnapshot = MlxSSDMetricsSnapshot()
+        mlx_ssd_metrics_snapshot(&cSnapshot)
+        return .init(
+            throughputMBperS: cSnapshot.throughput_mb_per_s,
+            totalBytesRead: cSnapshot.total_bytes_read,
+            totalChunks: cSnapshot.total_chunks,
+            avgChunkLatencyMS: cSnapshot.avg_chunk_latency_ms
+        )
+    }
+
     public static func streamedGatherMM(
         x: MLXArray, wShape: MLXArray, activeExpert: UInt32, safetensorsPath: String, tensorName: String, stream: StreamOrDevice = .default
     ) -> MLXArray {
diff --git a/Sources/SwiftLM/Server.swift b/Sources/SwiftLM/Server.swift
@@ -572,6 +572,24 @@ struct MLXServer: AsyncParsableCommand {
             lines.append("# HELP swiftlm_uptime_seconds Server uptime")
             lines.append("# TYPE swiftlm_uptime_seconds gauge")
             lines.append("swiftlm_uptime_seconds \(String(format: "%.0f", uptime))")
+
+            // ── SSD Flash-Stream metrics (only emitted when --stream-experts is active) ──
+            if isSSDStream {
+                let ssd = MLXFast.ssdMetricsSnapshot()
+                lines.append("# HELP swiftlm_ssd_throughput_mbps NVMe read throughput (10 s rolling average, MB/s)")
+                lines.append("# TYPE swiftlm_ssd_throughput_mbps gauge")
+                lines.append("swiftlm_ssd_throughput_mbps \(String(format: "%.1f", ssd.throughputMBperS))")
+                lines.append("# HELP swiftlm_ssd_bytes_read_total Lifetime bytes read from SSD for expert weights")
+                lines.append("# TYPE swiftlm_ssd_bytes_read_total counter")
+                lines.append("swiftlm_ssd_bytes_read_total \(ssd.totalBytesRead)")
+                lines.append("# HELP swiftlm_ssd_chunks_total Lifetime expert chunks loaded from SSD")
+                lines.append("# TYPE swiftlm_ssd_chunks_total counter")
+                lines.append("swiftlm_ssd_chunks_total \(ssd.totalChunks)")
+                lines.append("# HELP swiftlm_ssd_chunk_latency_ms Average per-chunk SSD read latency (ms, lifetime)")
+                lines.append("# TYPE swiftlm_ssd_chunk_latency_ms gauge")
+                lines.append("swiftlm_ssd_chunk_latency_ms \(String(format: "%.4f", ssd.avgChunkLatencyMS))")
+            }
+
             lines.append("")
             let metrics = lines.joined(separator: "\n")
             return Response(