|
| 1 | +import { monitorEventLoopDelay, performance } from "node:perf_hooks"; |
| 2 | + |
/** Sampling resolution, in milliseconds, passed to `monitorEventLoopDelay`. */
const EVENT_LOOP_MONITOR_RESOLUTION_MS = 20;
/** A snapshot is flagged `event_loop_delay` once its p99 or max delay reaches this many milliseconds. */
const EVENT_LOOP_DELAY_WARN_MS = 1_000;
/** A snapshot is flagged `event_loop_utilization` once its utilization reaches this fraction. */
const EVENT_LOOP_UTILIZATION_WARN = 0.95;
/** A snapshot is flagged `cpu` once its CPU-time-to-interval ratio reaches this value. */
const CPU_CORE_RATIO_WARN = 0.9;
| 7 | + |
/** Object returned by `monitorEventLoopDelay`. */
type EventLoopDelayMonitor = ReturnType<typeof monitorEventLoopDelay>;
/** Object returned by `performance.eventLoopUtilization`. */
type EventLoopUtilization = ReturnType<typeof performance.eventLoopUtilization>;
/** Object returned by `process.cpuUsage`. */
type CpuUsage = ReturnType<typeof process.cpuUsage>;
| 11 | + |
/** Names the threshold check that caused a snapshot to be reported as degraded. */
export type GatewayEventLoopHealthReason = "event_loop_delay" | "event_loop_utilization" | "cpu";
| 13 | + |
/** Metrics for one sampling interval, as produced by a monitor's `snapshot()`. */
export type GatewayEventLoopHealth = {
  /** `true` when `reasons` contains at least one entry. */
  degraded: boolean;
  /** Every threshold that was reached during the interval; empty when healthy. */
  reasons: GatewayEventLoopHealthReason[];
  /** Length of the sampling interval in milliseconds (never less than 1). */
  intervalMs: number;
  /** 99th-percentile event-loop delay over the interval, in milliseconds. */
  delayP99Ms: number;
  /** Maximum event-loop delay over the interval, in milliseconds. */
  delayMaxMs: number;
  /** Event-loop utilization over the interval, rounded to three decimals. */
  utilization: number;
  /** User + system CPU time consumed during the interval divided by `intervalMs`. */
  cpuCoreRatio: number;
};
| 23 | + |
/** Handle returned by `createGatewayEventLoopHealthMonitor`. */
export type GatewayEventLoopHealthMonitor = {
  /**
   * Returns metrics for the interval since the previous snapshot, or
   * `undefined` when the monitor is unavailable or has been stopped.
   */
  snapshot: () => GatewayEventLoopHealth | undefined;
  /** Disables the monitor; later `snapshot()` calls return `undefined`. */
  stop: () => void;
};
| 28 | + |
/**
 * Rounds a metric to a fixed number of decimal places.
 *
 * @param value - Number to round.
 * @param digits - Decimal places to keep (defaults to 3).
 * @returns The rounded value, or `0` when `value` is `NaN` or infinite.
 */
function roundMetric(value: number, digits = 3): number {
  const scale = Math.pow(10, digits);
  return Number.isFinite(value) ? Math.round(value * scale) / scale : 0;
}
| 36 | + |
/**
 * Converts a duration from nanoseconds to milliseconds, rounded to one
 * decimal place via `roundMetric`.
 *
 * @param value - Duration in nanoseconds.
 * @returns The duration in milliseconds.
 */
function nanosecondsToMilliseconds(value: number): number {
  const nanosecondsPerMillisecond = 1_000_000;
  const milliseconds = value / nanosecondsPerMillisecond;
  return roundMetric(milliseconds, 1);
}
| 40 | + |
/**
 * Creates a monitor that summarizes event-loop delay, event-loop utilization
 * and process CPU usage over the interval between consecutive `snapshot()`
 * calls.
 *
 * The first interval starts when this function is called. Each successful
 * `snapshot()` resets the delay histogram and re-bases the clock, CPU and
 * utilization baselines, so successive snapshots cover back-to-back intervals.
 *
 * @returns An object with:
 *   - `snapshot()`: metrics for the interval since the previous snapshot (or
 *     since creation), or `undefined` if the delay monitor could not be
 *     started or `stop()` has been called.
 *   - `stop()`: disables the delay monitor and drops all baselines; calling
 *     it again is a no-op.
 */
export function createGatewayEventLoopHealthMonitor(): GatewayEventLoopHealthMonitor {
  let monitor: EventLoopDelayMonitor | null = null;
  // Use the monotonic clock: wall-clock time (Date.now) can step forwards or
  // backwards, which would distort intervalMs and therefore cpuCoreRatio.
  let lastSampleAt = performance.now();
  let lastCpuUsage: CpuUsage | null = process.cpuUsage();
  let lastEventLoopUtilization: EventLoopUtilization | null = performance.eventLoopUtilization();

  try {
    monitor = monitorEventLoopDelay({ resolution: EVENT_LOOP_MONITOR_RESOLUTION_MS });
    monitor.enable();
    monitor.reset();
  } catch {
    // Best effort: without a delay monitor, snapshot() reports `undefined`.
    monitor = null;
  }

  return {
    snapshot: () => {
      if (!monitor || !lastCpuUsage || !lastEventLoopUtilization) {
        return undefined;
      }

      // Take every sample once, up front. The same readings serve as the end
      // of this interval and the start of the next, so no CPU time falls into
      // a gap between two separate process.cpuUsage() calls.
      const now = performance.now();
      const currentCpuUsage = process.cpuUsage();
      const currentEventLoopUtilization = performance.eventLoopUtilization();

      // Clamp to 1 ms so the CPU ratio never divides by zero when snapshots
      // are taken back to back.
      const intervalMs = Math.max(1, Math.round(now - lastSampleAt));
      const utilization = roundMetric(
        performance.eventLoopUtilization(currentEventLoopUtilization, lastEventLoopUtilization)
          .utilization,
      );
      const delayP99Ms = nanosecondsToMilliseconds(monitor.percentile(99));
      const delayMaxMs = nanosecondsToMilliseconds(monitor.max);
      // process.cpuUsage() reports microseconds; convert the delta to ms.
      const cpuTotalMicros =
        currentCpuUsage.user -
        lastCpuUsage.user +
        (currentCpuUsage.system - lastCpuUsage.system);
      const cpuTotalMs = roundMetric(cpuTotalMicros / 1_000, 1);
      const cpuCoreRatio = roundMetric(cpuTotalMs / intervalMs);
      const reasons: GatewayEventLoopHealthReason[] = [];

      if (delayP99Ms >= EVENT_LOOP_DELAY_WARN_MS || delayMaxMs >= EVENT_LOOP_DELAY_WARN_MS) {
        reasons.push("event_loop_delay");
      }
      if (utilization >= EVENT_LOOP_UTILIZATION_WARN) {
        reasons.push("event_loop_utilization");
      }
      if (cpuCoreRatio >= CPU_CORE_RATIO_WARN) {
        reasons.push("cpu");
      }

      // Start the next interval from the samples taken above.
      monitor.reset();
      lastSampleAt = now;
      lastCpuUsage = currentCpuUsage;
      lastEventLoopUtilization = currentEventLoopUtilization;

      return {
        degraded: reasons.length > 0,
        reasons,
        intervalMs,
        delayP99Ms,
        delayMaxMs,
        utilization,
        cpuCoreRatio,
      };
    },
    stop: () => {
      monitor?.disable();
      // A null monitor makes every later snapshot() return `undefined`.
      monitor = null;
      lastCpuUsage = null;
      lastEventLoopUtilization = null;
    },
  };
}
0 commit comments