Skip to content

Commit 26d2319

Browse files
committed
feat(metrics): expose SSD Flash-Stream stats to /metrics endpoint
Stack: C++ (moe_stream_op.cpp): Add 4 atomics: three lifetime counters (bytes, ns, chunks) plus window_throughput_mbs. The lifetime counters are accumulated per expert-chunk load and never reset. After each 10 s log window closes, that window's throughput is stored in window_throughput_mbs. Implement extern "C" mlx_ssd_metrics_snapshot() with the full struct definition in the .cpp TU. C ABI (fast.h + include/mlx/c/fast.h): Declare the MlxSSDMetricsSnapshot typedef + mlx_ssd_metrics_snapshot() in both the mlx-c copy and the Swift-visible umbrella header. moe_stream_op.h keeps only a forward declaration + extern "C" bridge (avoids a second definition of the struct in that header). Swift (MLXFast.swift): New MLXFast.SSDMetricsSnapshot struct + MLXFast.ssdMetricsSnapshot() calling through to the C function. Server.swift: /metrics emits 4 new Prometheus gauges/counters when the server is started with --stream-experts: swiftlm_ssd_throughput_mbps (gauge, average of the last completed 10 s window, MB/s); swiftlm_ssd_bytes_read_total (counter, lifetime); swiftlm_ssd_chunks_total (counter, lifetime); swiftlm_ssd_chunk_latency_ms (gauge, lifetime average). Example when SSD streaming is active: $ curl http://127.0.0.1:8080/metrics | grep ssd swiftlm_ssd_throughput_mbps 3456.0 swiftlm_ssd_bytes_read_total 1234567890 swiftlm_ssd_chunks_total 82340 swiftlm_ssd_chunk_latency_ms 0.0028
1 parent 60d538b commit 26d2319

6 files changed

Lines changed: 130 additions & 2 deletions

File tree

LocalPackages/mlx-swift/Source/Cmlx/include/mlx/c/fast.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,19 @@ int mlx_fast_pread_into(
226226

227227
/**@}*/
228228

229+
// ── SSD Flash-Stream metrics snapshot ────────────────────────────────────────
230+
// Read-only view of the SSD expert-streaming counters.
// The lifetime fields accumulate since process start and are never reset.
231+
// mlx_ssd_metrics_snapshot() may be called from any thread and never resets
// any counter. Each counter is read individually, so the fields of one
// snapshot are not guaranteed to describe exactly the same instant.
// A NULL `out` is ignored.
232+
233+
typedef struct MlxSSDMetricsSnapshot {
234+
double throughput_mb_per_s; /* Average throughput of the most recently completed 10 s log window, in MiB/s (0 before the first window closes) */
235+
uint64_t total_bytes_read; /* Lifetime bytes read from SSD */
236+
uint64_t total_chunks; /* Lifetime expert chunks loaded */
237+
double avg_chunk_latency_ms; /* Lifetime read time divided by lifetime chunks, in ms (0 while no chunk has been loaded) */
238+
} MlxSSDMetricsSnapshot;
239+
240+
/* Fills *out with the current metric values; does nothing when out is NULL. */
void mlx_ssd_metrics_snapshot(MlxSSDMetricsSnapshot* out);
241+
229242
#ifdef __cplusplus
230243
}
231244
#endif

LocalPackages/mlx-swift/Source/Cmlx/mlx-c/mlx/c/fast.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,19 @@ int mlx_fast_pread_into(
231231

232232
/**@}*/
233233

234+
// ── SSD Flash-Stream metrics snapshot ────────────────────────────────────────
235+
// Read-only view of the SSD expert-streaming counters.
// The lifetime fields accumulate since process start and are never reset.
236+
// mlx_ssd_metrics_snapshot() may be called from any thread and never resets
// any counter. Each counter is read individually, so the fields of one
// snapshot are not guaranteed to describe exactly the same instant.
// A NULL `out` is ignored.
237+
238+
typedef struct MlxSSDMetricsSnapshot {
239+
double throughput_mb_per_s; /* Average throughput of the most recently completed 10 s log window, in MiB/s (0 before the first window closes) */
240+
uint64_t total_bytes_read; /* Lifetime bytes read from SSD */
241+
uint64_t total_chunks; /* Lifetime expert chunks loaded */
242+
double avg_chunk_latency_ms; /* Lifetime read time divided by lifetime chunks, in ms (0 while no chunk has been loaded) */
243+
} MlxSSDMetricsSnapshot;
244+
245+
/* Fills *out with the current metric values; does nothing when out is NULL. */
void mlx_ssd_metrics_snapshot(MlxSSDMetricsSnapshot* out);
246+
234247
#ifdef __cplusplus
235248
}
236249
#endif

LocalPackages/mlx-swift/Source/Cmlx/mlx/mlx/core/moe_stream_op.cpp

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,18 @@
1212
#include "mlx/backend/metal/utils.h"
1313

1414
// Static SSD metric trackers for aggregate logging
15-
static std::atomic<size_t> g_total_bytes_read{0};
15+
static std::atomic<size_t> g_total_bytes_read{0};
1616
static std::atomic<uint64_t> g_total_read_ns{0};
17-
static std::atomic<size_t> g_read_count{0};
17+
static std::atomic<size_t> g_read_count{0};
1818
static std::atomic<uint64_t> g_last_log_ns{0};
1919

20+
// Lifetime totals: incremented on every expert-chunk load and never reset.
// Read by mlx_ssd_metrics_snapshot(), which backs the /metrics endpoint.
21+
static std::atomic<uint64_t> g_lifetime_bytes{0};  // sum of matrix_bytes over all loads
22+
static std::atomic<uint64_t> g_lifetime_ns{0};  // sum of (end_read - start_read) per load, in nanoseconds
23+
static std::atomic<uint64_t> g_lifetime_chunks{0};  // number of loads performed
24+
// Throughput (MiB/s) of the last completed 10 s log window. Stored when a
// window closes; stays 0.0 until the first window has closed.
25+
static std::atomic<double> g_window_throughput_mbs{0.0};
26+
2027
namespace mlx::core {
2128

2229
class LoadSSDExpert : public Primitive {
@@ -57,6 +64,11 @@ class LoadSSDExpert : public Primitive {
5764
streamer_->load_sync(block_offset, matrix_bytes, o.data<void>());
5865
auto end_read = std::chrono::high_resolution_clock::now();
5966

67+
// Accumulate into lifetime totals (never reset)
68+
g_lifetime_bytes += matrix_bytes;
69+
g_lifetime_ns += static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::nanoseconds>(end_read - start_read).count());
70+
g_lifetime_chunks += 1;
71+
6072
// ─────────────────────────────────────────────────────────────────────
6173
// AGGREGATE LOGGING — 10-second metric intervals, printed to stderr so
6274
// the metric lines never interleave with the stdout token stream.
@@ -80,6 +92,8 @@ class LoadSSDExpert : public Primitive {
8092
double elapsed_s = ns_t / 1e9;
8193
double throughput_mbs = (bytes / (1024.0 * 1024.0)) / elapsed_s;
8294
double avg_ms_per_chunk = (ns_t / 1'000'000.0) / count;
95+
// Persist for /metrics endpoint (non-resetting)
96+
g_window_throughput_mbs.store(throughput_mbs, std::memory_order_relaxed);
8397
// Print to stderr — never touches the stdout token stream
8498
std::cerr << "[⚡️ SSD Stream] "
8599
<< std::fixed << std::setprecision(0);
@@ -141,3 +155,28 @@ MLX_API array streamed_gather_mm(
141155
}
142156

143157
} // namespace mlx::core
158+
159+
// ── C-ABI metrics accessor (no name mangling — callable from Swift) ───────────
160+
// The struct is defined here, in the translation unit that fills it; fast.h
161+
// carries the matching typedef so Swift and external C consumers see a
// layout-compatible definition.
// NOTE(review): the field list is repeated by hand here and in both copies of
// fast.h. Field order and types must stay identical in all three places;
// nothing in this file checks that.
162+
struct MlxSSDMetricsSnapshot {
163+
double throughput_mb_per_s;  // last completed 10 s window, MiB/s
164+
uint64_t total_bytes_read;  // lifetime bytes read
165+
uint64_t total_chunks;  // lifetime expert chunks loaded
166+
double avg_chunk_latency_ms;  // lifetime read time / lifetime chunks, ms
167+
};
168+
169+
// Copies the current SSD streaming metrics into *out.
//
// out: destination struct; when null the call is a no-op.
//
// Only atomic loads are performed, so no counter is modified. The four values
// are loaded one after another, so a load that completes in between can make
// the fields of one snapshot differ slightly from each other.
extern "C" void mlx_ssd_metrics_snapshot(struct MlxSSDMetricsSnapshot* out) {
170+
if (!out) return;
171+
uint64_t bytes = g_lifetime_bytes.load(std::memory_order_acquire);
172+
uint64_t ns = g_lifetime_ns.load(std::memory_order_acquire);
173+
uint64_t chunks = g_lifetime_chunks.load(std::memory_order_acquire);
174+
double tput = g_window_throughput_mbs.load(std::memory_order_acquire);
175+
176+
out->total_bytes_read = bytes;
177+
out->total_chunks = chunks;
178+
out->throughput_mb_per_s = tput;
179+
// Lifetime average: total nanoseconds -> milliseconds, divided by chunk count.
// Reports 0.0 while either total is still zero, which also avoids dividing by zero.
out->avg_chunk_latency_ms = (ns > 0 && chunks > 0)
180+
? (ns / 1'000'000.0) / static_cast<double>(chunks)
181+
: 0.0;
182+
}

LocalPackages/mlx-swift/Source/Cmlx/mlx/mlx/core/moe_stream_op.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
#include "mlx/utils.h"
1212
#include <memory>
1313
#include <vector>
14+
#include <cstdint>
15+
#include <cstddef>
1416

1517
namespace mlx::core {
1618

@@ -36,3 +38,18 @@ MLX_API array streamed_gather_mm(
3638
);
3739

3840
} // namespace mlx::core
41+
42+
// ── Metrics snapshot forward declaration ─────────────────────────────────────
43+
// The struct's fields are declared in mlx/c/fast.h (the Swift-visible Cmlx
// umbrella); this header only forward-declares the type so it does not add
// another definition of it.
44+
// The extern "C" block gives mlx_ssd_metrics_snapshot() C linkage, matching
45+
// the definition in moe_stream_op.cpp (no C++ name mangling).
// NOTE(review): this header already uses `namespace mlx::core`, so it can only
// be included from C++; the __cplusplus guards look redundant here — confirm.
46+
#ifdef __cplusplus
47+
extern "C" {
48+
#endif
49+
50+
struct MlxSSDMetricsSnapshot;
51+
// Fills *out with the current SSD streaming metrics.
void mlx_ssd_metrics_snapshot(struct MlxSSDMetricsSnapshot* out);
52+
53+
#ifdef __cplusplus
54+
}
55+
#endif

LocalPackages/mlx-swift/Source/MLX/MLXFast.swift

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,34 @@ public enum MLXFast {
278278
return (kTuple, vTuple)
279279
}
280280

281+
// ── SSD Flash-Stream Metrics ──────────────────────────────────────────────
282+
283+
/// Snapshot of the SSD expert-streaming metrics at the moment it was taken.
284+
/// Immutable value type; obtain one with `MLXFast.ssdMetricsSnapshot()`.
285+
public struct SSDMetricsSnapshot: Sendable {
286+
/// Average throughput of the most recently completed 10-second window (MB/s).
287+
/// Zero until the first 10 s window has closed.
288+
public let throughputMBperS: Double
289+
/// Lifetime bytes loaded from SSD since process start.
290+
public let totalBytesRead: UInt64
291+
/// Lifetime expert chunks loaded from SSD since process start.
292+
public let totalChunks: UInt64
293+
/// Lifetime average latency per expert chunk (ms); zero before any chunk has been loaded.
294+
public let avgChunkLatencyMS: Double
295+
}
296+
297+
/// Read the current SSD Flash-Stream metrics without resetting any counters.
///
/// Calls the C function `mlx_ssd_metrics_snapshot` and copies its four
/// fields into a Swift `SSDMetricsSnapshot`.
///
/// - Returns: The metric values reported by the C layer at the time of the call.
298+
public static func ssdMetricsSnapshot() -> SSDMetricsSnapshot {
299+
// Zero-initialized C struct that the C function fills in place.
var raw = MlxSSDMetricsSnapshot()
300+
mlx_ssd_metrics_snapshot(&raw)
301+
return SSDMetricsSnapshot(
302+
throughputMBperS: raw.throughput_mb_per_s,
303+
totalBytesRead: raw.total_bytes_read,
304+
totalChunks: raw.total_chunks,
305+
avgChunkLatencyMS: raw.avg_chunk_latency_ms
306+
)
307+
}
308+
281309
public static func streamedGatherMM(
282310
x: MLXArray, wShape: MLXArray, activeExpert: UInt32, safetensorsPath: String, tensorName: String, stream: StreamOrDevice = .default
283311
) -> MLXArray {

Sources/SwiftLM/Server.swift

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -572,6 +572,24 @@ struct MLXServer: AsyncParsableCommand {
572572
lines.append("# HELP swiftlm_uptime_seconds Server uptime")
573573
lines.append("# TYPE swiftlm_uptime_seconds gauge")
574574
lines.append("swiftlm_uptime_seconds \(String(format: "%.0f", uptime))")
575+
576+
// ── SSD Flash-Stream metrics (only emitted when --stream-experts is active) ──
577+
if isSSDStream {
578+
let ssd = MLXFast.ssdMetricsSnapshot()
579+
lines.append("# HELP swiftlm_ssd_throughput_mbps NVMe read throughput (10 s rolling average, MB/s)")
580+
lines.append("# TYPE swiftlm_ssd_throughput_mbps gauge")
581+
lines.append("swiftlm_ssd_throughput_mbps \(String(format: "%.1f", ssd.throughputMBperS))")
582+
lines.append("# HELP swiftlm_ssd_bytes_read_total Lifetime bytes read from SSD for expert weights")
583+
lines.append("# TYPE swiftlm_ssd_bytes_read_total counter")
584+
lines.append("swiftlm_ssd_bytes_read_total \(ssd.totalBytesRead)")
585+
lines.append("# HELP swiftlm_ssd_chunks_total Lifetime expert chunks loaded from SSD")
586+
lines.append("# TYPE swiftlm_ssd_chunks_total counter")
587+
lines.append("swiftlm_ssd_chunks_total \(ssd.totalChunks)")
588+
lines.append("# HELP swiftlm_ssd_chunk_latency_ms Average per-chunk SSD read latency (ms, lifetime)")
589+
lines.append("# TYPE swiftlm_ssd_chunk_latency_ms gauge")
590+
lines.append("swiftlm_ssd_chunk_latency_ms \(String(format: "%.4f", ssd.avgChunkLatencyMS))")
591+
}
592+
575593
lines.append("")
576594
let metrics = lines.joined(separator: "\n")
577595
return Response(

0 commit comments

Comments
 (0)