Skip to content

Commit e34f409

Browse files
committed
feat(API): add status fields to diagnostics and log at startup
Add ok/warning/critical status with messages to diagnostics sections: disk, memory, cpu, chiaTools, datalayer, fullNode, wallet, network. Remove redundant services section. Log full diagnostics JSON at startup (fire-and-forget) so READ_ONLY nodes also have a baseline snapshot.
1 parent 479b8e1 commit e34f409

4 files changed

Lines changed: 373 additions & 35 deletions

File tree

src/routes/diagnostics.js

Lines changed: 186 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,32 @@ import { probeChiaTools } from '../utils/chia-tools-probe.js';
2222
import { logger } from '../config/logger.js';
2323
import packageJson from '../../package.json' with { type: 'json' };
2424

25+
/**
 * Accumulates a worst-case status across multiple checks.
 *
 * Severity only ever escalates (ok → warning → critical); reporting a lower
 * severity after a higher one never downgrades the result. Every non-empty
 * message is kept, in call order, and joined with two-space separation into
 * a single string.
 */
class StatusAccumulator {
  // Ordered from least to most severe; the array index is the numeric level.
  static #names = ['ok', 'warning', 'critical'];

  // A Map (rather than an object literal) so that inherited keys such as
  // 'constructor' or 'toString' are rejected as unknown severities instead
  // of resolving to Object.prototype members.
  static #levels = new Map(
    StatusAccumulator.#names.map((name, level) => [name, level]),
  );

  // Two spaces keep individual messages visually distinct once concatenated.
  static #separator = '  ';

  #level = 0;
  #messages = [];

  /**
   * Record the outcome of one check.
   *
   * @param {'ok'|'warning'|'critical'} severity - Severity of this check.
   * @param {string} [message] - Optional human-readable detail; empty or
   *   missing messages are not recorded.
   * @throws {Error} If `severity` is not one of the known severity names.
   */
  escalate(severity, message) {
    const target = StatusAccumulator.#levels.get(severity);
    if (target === undefined) throw new Error(`unknown severity: ${severity}`);
    if (target > this.#level) this.#level = target;
    if (message) this.#messages.push(message);
  }

  /**
   * Build the accumulated status.
   *
   * @returns {{status: string, message?: string}} The worst severity seen so
   *   far; `message` is present only when at least one message was recorded.
   */
  result() {
    const status = StatusAccumulator.#names[this.#level];
    if (this.#messages.length === 0) return { status };
    return { status, message: this.#messages.join(StatusAccumulator.#separator) };
  }
}
50+
2551
// Timeouts deliberately err on the generous side: /diagnostics is allowed to
2652
// be slow if the goal is a comprehensive snapshot. A busy-but-healthy wallet
2753
// (e.g. long-syncing while operators are debugging) regularly takes several
@@ -430,15 +456,7 @@ export const getDiagnosticsResponse = async () => {
430456
};
431457
})();
432458

433-
// ---- Chia: services / chia-tools / processes ---------------------------
434-
const servicesSection = {
435-
walletReachable,
436-
fullNodeReachable: fullNodeRunningLocally && fullNodeStateRes.ok
437-
? !!fullNodeStateRes.value?.reachable
438-
: false,
439-
datalayerReachable: datalayerAvailableRes.ok ? datalayerAvailableRes.value === true : false,
440-
};
441-
459+
// ---- Chia: chia-tools / processes ---------------------------------------
442460
const chiaToolsSection = chiaToolsRes.ok
443461
? chiaToolsRes.value
444462
: { installed: false, version: null, error: chiaToolsRes.error, note: 'probe failed' };
@@ -448,33 +466,183 @@ export const getDiagnosticsResponse = async () => {
448466
? systemInfoRes.value
449467
: { error: systemInfoRes.error };
450468

469+
// ---- Status computation -------------------------------------------------
470+
471+
// Precompute process-running booleans for status checks below.
472+
const chiaRunningLocally = processesValue.matches.some(
473+
(m) => /chia_/i.test(m.command),
474+
);
475+
const fullNodeProcessRunning = processesValue.matches.some(
476+
(m) => /chia_full_node/i.test(m.command),
477+
);
478+
const dlProcessRunning = processesValue.matches.some(
479+
(m) => /chia_data_layer/i.test(m.command),
480+
);
481+
const walletProcessRunning = processesValue.matches.some(
482+
(m) => /chia_wallet/i.test(m.command),
483+
);
484+
485+
// system.disk
486+
if (systemSection.disk) {
487+
const diskStatus = new StatusAccumulator();
488+
const pct = systemSection.disk.percentUsed;
489+
if (pct != null) {
490+
if (pct > 96) diskStatus.escalate('critical', 'Disk usage above 96%');
491+
else if (pct > 90) diskStatus.escalate('warning', 'Disk usage above 90%');
492+
}
493+
Object.assign(systemSection.disk, diskStatus.result());
494+
}
495+
496+
// system.memory
497+
if (systemSection.memory) {
498+
const memStatus = new StatusAccumulator();
499+
const pct = systemSection.memory.percentUsed;
500+
if (pct != null) {
501+
if (pct > 99) memStatus.escalate('critical', 'Memory usage above 99%');
502+
else if (pct > 90) memStatus.escalate('warning', 'Memory usage above 90%');
503+
}
504+
Object.assign(systemSection.memory, memStatus.result());
505+
}
506+
507+
// system.cpu
508+
if (systemSection.cpu) {
509+
const cpuStatus = new StatusAccumulator();
510+
const cores = systemSection.cpu.cores;
511+
if (cores != null && chiaRunningLocally) {
512+
const msg =
513+
'Both CADT and Chia are heavy on CPU and a 4 core or greater system is recommended when running them together';
514+
if (cores === 1) cpuStatus.escalate('critical', msg);
515+
else if (cores < 4) cpuStatus.escalate('warning', msg);
516+
} else if (cores != null) {
517+
if (cores === 1) {
518+
cpuStatus.escalate(
519+
'warning',
520+
'CADT can often use 100% of a single CPU core, so a 2 core or greater system is recommended when running CADT by itself',
521+
);
522+
}
523+
}
524+
Object.assign(systemSection.cpu, cpuStatus.result());
525+
}
526+
527+
// chiaTools
528+
{
529+
const ctStatus = new StatusAccumulator();
530+
if (!chiaToolsSection.installed) {
531+
ctStatus.escalate('warning', 'chia-tools is recommended to help manage Chia');
532+
}
533+
Object.assign(chiaToolsSection, ctStatus.result());
534+
}
535+
536+
// datalayer
537+
{
538+
const dlStatus = new StatusAccumulator();
539+
if (dlProcessRunning && !datalayerSection.reachable) {
540+
dlStatus.escalate(
541+
'critical',
542+
'Chia DataLayer service unreachable - this usually indicates a crashed or stuck process that needs to be killed',
543+
);
544+
}
545+
if (datalayerSection.subscriptions?.some((s) => s.synced === false)) {
546+
dlStatus.escalate('warning', 'One or more DataLayer subscriptions are not synced');
547+
}
548+
Object.assign(datalayerSection, dlStatus.result());
549+
}
550+
551+
// fullNode — use fullNodeProcessRunning (from matches) rather than
552+
// fullNodeRunningLocally (which defaults true on unreliable scans) to
553+
// avoid false-positive criticals on Windows / minimal Docker images.
554+
{
555+
const fnStatus = new StatusAccumulator();
556+
if (fullNodeProcessRunning && !fullNodeSection.reachable) {
557+
fnStatus.escalate(
558+
'critical',
559+
'Chia full node service unreachable - this usually indicates a crashed or stuck process that needs to be killed',
560+
);
561+
}
562+
Object.assign(fullNodeSection, fnStatus.result());
563+
}
564+
565+
// wallet
566+
{
567+
const walletStatus = new StatusAccumulator();
568+
569+
if (walletProcessRunning && !walletReachable) {
570+
walletStatus.escalate(
571+
'critical',
572+
'Chia wallet service unreachable - this usually indicates a crashed or stuck process that needs to be killed',
573+
);
574+
}
575+
576+
if (walletReachable && !walletSection.synced) {
577+
walletStatus.escalate('warning', 'Wallet syncing');
578+
}
579+
580+
const coinAmount = appConfig.DEFAULT_COIN_AMOUNT ?? 300;
581+
const fee = appConfig.DEFAULT_FEE ?? 3000;
582+
const minMirrorXch = (coinAmount + fee) / 1_000_000_000_000;
583+
if (walletSection.balanceXch != null && walletSection.balanceXch < minMirrorXch) {
584+
walletStatus.escalate('critical', 'Wallet balance too low to create mirrors');
585+
}
586+
587+
const pendingTx = walletSection.pendingTransactions;
588+
if (pendingTx && !pendingTx.error) {
589+
const hasStuck =
590+
pendingTx.standardWallet?.stuck?.length > 0 ||
591+
pendingTx.dataLayerWallet?.stuck?.length > 0;
592+
if (hasStuck) {
593+
walletStatus.escalate('critical', 'Stuck transactions detected that need manual intervention');
594+
}
595+
const hasRejected =
596+
pendingTx.standardWallet?.rejected?.length > 0 ||
597+
pendingTx.dataLayerWallet?.rejected?.length > 0;
598+
if (hasRejected) {
599+
walletStatus.escalate('critical', 'Rejected transactions detected that need attention');
600+
}
601+
}
602+
603+
if (walletReachable && walletSection.trustedFullNodePeers?.hasTrustedConnection === false) {
604+
walletStatus.escalate(
605+
'warning',
606+
'Performance is severely degraded when the Chia wallet is not connected to a trusted full node peer',
607+
);
608+
}
609+
610+
Object.assign(walletSection, walletStatus.result());
611+
}
612+
613+
// network
614+
const networkStatus = new StatusAccumulator();
615+
if (networkMatches === false) {
616+
networkStatus.escalate('warning', 'CADT configured network does not match Chia network');
617+
}
618+
const networkSection = {
619+
chia: actualNetwork,
620+
cadt: configuredNetwork,
621+
matches: networkMatches,
622+
...networkStatus.result(),
623+
};
624+
451625
// ---- Full response ------------------------------------------------------
452-
const fullResponse = {
626+
return {
453627
timestamp,
454628
cadt: cadtSection,
455-
network: {
456-
chia: actualNetwork,
457-
cadt: configuredNetwork,
458-
matches: networkMatches,
459-
},
629+
network: networkSection,
460630
chia: {
461631
version: chiaVersionRes.ok ? chiaVersionRes.value : null,
462632
wallet: walletSection,
463633
fullNode: fullNodeSection,
464634
datalayer: datalayerSection,
465-
services: servicesSection,
466635
chiaTools: chiaToolsSection,
467636
runningProcesses: processesValue,
468637
},
469638
system: systemSection,
470639
};
471-
472-
return fullResponse;
473640
};
474641

475642
// Bundle of module-internal helpers re-exported under a single name.
// NOTE(review): the `__test` name suggests this exists only so unit tests can
// reach otherwise-private helpers — confirm before importing it elsewhere.
export const __test = {
476643
settle,
477644
collectSubscriptions,
478645
buildTrustedPeerView,
479646
normalizeNodeId,
647+
StatusAccumulator,
480648
};

src/routes/index.js

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import { pullPickListValues } from '../utils/data-loaders';
1010
import { pullPickListValuesV2 } from '../utils/v2-data-loaders';
1111
import { getConfig } from '../utils/config-loader';
1212
import { getConfigV2 } from '../utils/config-loader';
13+
import { getDiagnosticsResponse } from './diagnostics.js';
1314

1415
import app from '../middleware';
1516

@@ -73,6 +74,13 @@ export const initializeDatabases = async () => {
7374
migrationsReady = true;
7475
logger.info('All database migrations completed');
7576

77+
// Snapshot diagnostics into the log so operators always have a baseline,
78+
// even on READ_ONLY nodes that can't serve the /diagnostics endpoint.
79+
// Fire-and-forget: don't block scheduler start on RPC timeouts.
80+
getDiagnosticsResponse()
81+
.then((d) => logger.info(`Startup diagnostics: ${JSON.stringify(d)}`))
82+
.catch((e) => logger.warn(`Startup diagnostics collection failed: ${e.message}`));
83+
7684
// Start scheduler after migrations complete
7785
// Note: scheduler.start is async - it runs coin management first before starting other tasks
7886
setTimeout(async () => {

0 commit comments

Comments
 (0)