Skip to content

Commit 367ff89

Browse files
committed
feat(API): add /diagnostics endpoint for system-wide debug snapshot
Adds GET /diagnostics, a single JSON endpoint that summarizes the running CADT, Chia, DataLayer, and host machine state. The endpoint is intended for sysadmins debugging a CADT install: it reports CADT version, configured / actual chia network, wallet/full-node/datalayer reachability + sync status, wallet balance, trusted-peer cross-reference, DataLayer subscriptions with per-store sync status, governance body IDs, V1/V2 home org IDs, datalayer URLs, CADT config + database paths, CPU/RAM/disk numbers, a chia process scan, and a chia-tools probe. The endpoint is mounted on the root app (not /v1 or /v2) and lives in HEALTH_ENDPOINTS so it bypasses the rate limiter, startup gates, and the chia/datalayer assertions -- the whole point of a diagnostics endpoint is to be useful when those subsystems are broken. Every external call goes through a settle() wrapper with a per-call timeout and Promise.all fan-out, so one slow or wedged RPC can't block the rest of the response. Worst-case wall-clock is ~30s (subscription enumeration budget); healthy responses come back in well under a second. Authentication is enforced by the existing global API-key middleware (no duplicate check needed). When V1 or V2 READ_ONLY is set, the response is reduced to non-sensitive public data and short-circuits before the wallet / datalayer RPC fan-out, matching the precedent in wallet-health.js. Also adds isHealthEndpoint() skips to the wallet-synced, home-org-synced, and all-data-synced header middlewares so /diagnostics (and /health*) don't hang on the wallet RPC's 300s socket timeout or on waitForMigrations when the database layer is the broken subsystem.
1 parent 26362d2 commit 367ff89

10 files changed

Lines changed: 1819 additions & 0 deletions

File tree

src/datalayer/fullNodeRpc.js

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
/**
2+
* Minimal RPC client for the locally-running chia full-node, used only by
3+
* the /diagnostics endpoint. CADT does not depend on the full node for any
4+
* other functionality; this client exists so we can answer "is the full
5+
* node running locally, and if so is it synced?".
6+
*
7+
* Like the wallet client, we use mTLS with the standard chia SSL files
8+
* under `${chiaRoot}/config/ssl/full_node/`. If those files are missing
9+
* (e.g. on a CADT-only host), the helpers return `{ reachable: false }`
10+
* instead of throwing.
11+
*/
12+
13+
import fs from 'fs';
14+
import path from 'path';
15+
import superagent from 'superagent';
16+
17+
import _ from 'lodash';
18+
19+
import { getChiaConfig } from './fullNode.js';
20+
import { getChiaRoot } from '../utils/chia-root.js';
21+
import { getActiveConfig } from '../utils/config-loader.js';
22+
import { logger } from '../config/logger.js';
23+
24+
// Fallback RPC port, used when the chia config cannot be read or carries no
// `full_node.rpc_port` entry.
const DEFAULT_FULL_NODE_RPC_PORT = 8555;

// Per-request budget in milliseconds. Kept equal to the outer settle() default
// in routes/diagnostics.js: a full node that is catching up under heavy load
// may need several seconds to answer get_blockchain_state, and reporting it as
// unreachable by mistake is worse than waiting a little longer.
const DEFAULT_TIMEOUT_MS = 10_000;
30+
31+
/**
 * Resolve the folder that holds the chia SSL material.
 *
 * A truthy `APP.CERTIFICATE_FOLDER_PATH` in the active config takes
 * precedence; otherwise the folder is `config/ssl` under the chia root.
 *
 * @returns {string} Path of the certificate folder.
 */
const getCertificateFolderPath = () => {
  const root = getChiaRoot();
  const configuredFolder = getActiveConfig()?.APP?.CERTIFICATE_FOLDER_PATH;
  if (configuredFolder) {
    return configuredFolder;
  }
  return `${root}/config/ssl`;
};
36+
37+
/**
 * Look up the full-node RPC port in the chia config.
 *
 * Any failure while reading the config is logged at debug level and the
 * default port is returned instead, so callers never see an exception.
 *
 * @returns {number} Value of `full_node.rpc_port`, or the default port.
 */
const getRpcPort = () => {
  const fallbackPort = DEFAULT_FULL_NODE_RPC_PORT;
  try {
    return _.get(getChiaConfig(), 'full_node.rpc_port', fallbackPort);
  } catch (configError) {
    logger.debug(`[diagnostics]: could not read chia config for full-node rpc_port: ${configError.message}`);
    return fallbackPort;
  }
};
46+
47+
/**
 * Build the base URL of the local full-node RPC server.
 *
 * @returns {string} `https://localhost:<port>` using the resolved RPC port.
 */
const buildRpcUrl = () => {
  const port = getRpcPort();
  return `https://localhost:${port}`;
};
48+
49+
/**
 * Read the full node's private certificate and key from the certificate
 * folder (`<folder>/full_node/private_full_node.{crt,key}`).
 *
 * The reads are synchronous; a missing or unreadable file makes
 * `fs.readFileSync` throw, and that error propagates to the caller.
 *
 * @returns {{cert: Buffer, key: Buffer}} Raw certificate and key contents.
 */
const loadFullNodeCerts = () => {
  const sslFolder = getCertificateFolderPath();
  const certPath = path.resolve(`${sslFolder}/full_node/private_full_node.crt`);
  const keyPath = path.resolve(`${sslFolder}/full_node/private_full_node.key`);
  const cert = fs.readFileSync(certPath);
  const key = fs.readFileSync(keyPath);
  return { cert, key };
};
58+
59+
/**
 * POST a JSON payload to a full-node RPC endpoint, presenting the full node's
 * private certificate and key as the client credentials.
 *
 * Errors from reading the certificate files or from the HTTP request are not
 * caught here; they propagate to the caller.
 *
 * @param {string} endpoint - RPC path appended to the base URL, e.g. `/get_connections`.
 * @param {Object} [payload={}] - Request body.
 * @param {Object} [options]
 * @param {number} [options.timeout=DEFAULT_TIMEOUT_MS] - Request timeout in milliseconds.
 * @returns {Promise<Object>} The response body, or the parsed response text when the body is falsy.
 */
const callRpc = async (endpoint, payload = {}, { timeout = DEFAULT_TIMEOUT_MS } = {}) => {
  const target = `${buildRpcUrl()}${endpoint}`;
  const credentials = loadFullNodeCerts();

  const request = superagent
    .post(target)
    .key(credentials.key)
    .cert(credentials.cert)
    .timeout(timeout);
  const response = await request.send(payload);

  if (response.body) {
    return response.body;
  }
  return JSON.parse(response.text);
};
70+
71+
/**
 * Fetch the full node's blockchain state via `/get_blockchain_state`.
 *
 * Never throws for RPC problems: any error raised while calling the RPC
 * (missing certificate files, connection failure, timeout, ...) is logged at
 * debug level and reported as `{ reachable: false, error }`.
 *
 * @param {Object} [options] - Forwarded to the RPC helper (e.g. `{ timeout }`).
 * @returns {Promise<Object>} One of:
 *   - `{ rpcUrl, reachable: false, error }` when the call itself failed;
 *   - `{ rpcUrl, reachable: true, error }` when the node answered without `success`;
 *   - `{ rpcUrl, reachable: true, synced, syncing, peakHeight, syncMode,
 *     genesisChallengeInitialized }` otherwise. `syncMode` is `'syncing'`,
 *     `'synced'` or `'not_synced'`; `peakHeight` and
 *     `genesisChallengeInitialized` are `null` when the node did not report them.
 */
export const getBlockchainState = async (options = {}) => {
  const rpcUrl = buildRpcUrl();
  try {
    const data = await callRpc('/get_blockchain_state', {}, options);
    if (!data?.success) {
      return { rpcUrl, reachable: true, error: data?.error || 'unknown error' };
    }

    const state = data.blockchain_state || {};
    // Normalise both flags to strict booleans once and derive syncMode from
    // them, so `synced` and `syncMode` can never contradict each other.
    const synced = state.sync?.synced === true;
    const syncing = state.sync?.sync_mode === true;

    let syncMode = 'not_synced';
    if (syncing) {
      syncMode = 'syncing';
    } else if (synced) {
      syncMode = 'synced';
    }

    return {
      rpcUrl,
      reachable: true,
      synced,
      syncing,
      peakHeight: state.peak?.height ?? null,
      syncMode,
      genesisChallengeInitialized: state.genesis_challenge_initialized ?? null,
    };
  } catch (error) {
    logger.debug(`[diagnostics]: full-node get_blockchain_state failed: ${error.message}`);
    return { rpcUrl, reachable: false, error: error.message };
  }
};
97+
98+
/**
 * Fetch the full node's peer connections via `/get_connections`.
 *
 * Each connection is reduced to `{ peerHost, peerPort, type }`; every other
 * field of the RPC response is dropped. Errors raised while calling the RPC
 * are logged at debug level and reported as `{ reachable: false, error }`
 * rather than thrown.
 *
 * @param {Object} [options] - Forwarded to the RPC helper (e.g. `{ timeout }`).
 * @returns {Promise<{rpcUrl: string, reachable: boolean, connections?: Array<{peerHost: *, peerPort: *, type: *}>, error?: string}>}
 */
export const getFullNodeConnections = async (options = {}) => {
  const rpcUrl = buildRpcUrl();

  let data;
  try {
    data = await callRpc('/get_connections', {}, options);
    if (data?.success) {
      const rawConnections = data.connections || [];
      const connections = rawConnections.map((peer) => {
        const { peer_host: peerHost, peer_port: peerPort, type } = peer;
        return { peerHost, peerPort, type };
      });
      return { rpcUrl, reachable: true, connections };
    }
    // The node answered, but did not report success.
    return { rpcUrl, reachable: true, error: data?.error || 'unknown error' };
  } catch (error) {
    logger.debug(`[diagnostics]: full-node get_connections failed: ${error.message}`);
    return { rpcUrl, reachable: false, error: error.message };
  }
};
122+
123+
// Default export bundling the two diagnostics helpers, for callers that
// prefer a single namespace-style import.
export default { getBlockchainState, getFullNodeConnections };

src/datalayer/wallet.js

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,49 @@ const getActiveNetwork = async () => {
329329
}
330330
};
331331

332+
/**
 * List the wallet's peer connections via the wallet RPC `/get_connections`.
 *
 * /diagnostics uses the result to cross-reference connected full-node peers
 * against the trusted_peers map in the chia config. Failures are reported as
 * `{ success: false, error }` instead of being thrown, so one broken wallet
 * RPC cannot take down the rest of the diagnostics response. In simulator
 * mode no RPC is made and an empty connection list is returned.
 *
 * @returns {Promise<{success: boolean, connections?: Array<{peerHost: *, peerPort: *, type: *, nodeId: *}>, error?: string}>}
 */
const getWalletConnections = async () => {
  if (USE_SIMULATOR) {
    return { success: true, connections: [] };
  }

  const endpoint = `${rpcUrl}/get_connections`;
  try {
    const { cert, key, timeout } = getBaseOptions();
    const request = superagent.post(endpoint).key(key).cert(cert).timeout(timeout);
    const response = await request.send({});

    const data = response.body || JSON.parse(response.text);
    if (data?.success) {
      const rawConnections = data.connections || [];
      const connections = rawConnections.map((peer) => {
        const {
          peer_host: peerHost,
          peer_port: peerPort,
          type,
          node_id: nodeId,
        } = peer;
        return { peerHost, peerPort, type, nodeId };
      });
      return { success: true, connections };
    }

    // The wallet answered, but did not report success.
    return { success: false, error: data?.error || 'unknown error' };
  } catch (error) {
    logger.debug(`[diagnostics]: wallet get_connections failed: ${error.message}`);
    return { success: false, error: error.message };
  }
};
374+
332375
/**
333376
* Get coin records from the wallet (includes coin IDs)
334377
* @returns {Promise<{success: boolean, coin_records: Array}>} Object with coin_records array and success flag
@@ -884,6 +927,7 @@ export default {
884927
waitForAllTransactionsToConfirm,
885928
waitForSpendableCoins,
886929
getActiveNetwork,
930+
getWalletConnections,
887931
getLastWalletSyncError,
888932
getWalletBlockchainSyncStatus,
889933
getCoinRecords,

src/middleware.js

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ const HEALTH_ENDPOINTS = new Set([
4141
'/v2/health',
4242
'/v1/health/wallet',
4343
'/v2/health/wallet',
44+
'/diagnostics',
4445
]);
4546

4647
/**
 * Tell whether a request path addresses one of the health/diagnostics
 * endpoints listed in HEALTH_ENDPOINTS.
 *
 * A single trailing slash is tolerated (`/diagnostics/`). With Express's
 * default non-strict routing such a request reaches the same handler as
 * `/diagnostics`, so it should take the same middleware skips; without this
 * it would go through the very gates the health endpoints are meant to
 * bypass.
 *
 * @param {string} path - Request path, typically `req.path`.
 * @returns {boolean} True when the path is a health endpoint.
 */
const isHealthEndpoint = (path) => {
  if (typeof path !== 'string') {
    // Preserve the original lookup for unexpected inputs.
    return HEALTH_ENDPOINTS.has(path);
  }
  // Strip at most one trailing slash, and never reduce "/" to "".
  const normalized = path.length > 1 && path.endsWith('/') ? path.slice(0, -1) : path;
  return HEALTH_ENDPOINTS.has(normalized);
};
@@ -314,6 +315,13 @@ app.use(function (req, res, next) {
314315
});
315316

316317
app.use(async function (req, res, next) {
318+
// Skip the home-organization-synced header probe on health endpoints so
319+
// /diagnostics (and /health*) can respond even when migrations or the
320+
// organizations table are slow to come up.
321+
if (isHealthEndpoint(req.path)) {
322+
return next();
323+
}
324+
317325
if (process.env.NODE_ENV !== 'test') {
318326
// Wait for migrations to complete before accessing organizations table
319327
const { waitForMigrations } = await import('./routes/index.js');
@@ -396,6 +404,13 @@ app.use(async function (req, res, next) {
396404
});
397405

398406
app.use(async function (req, res, next) {
407+
// Skip the all-data-synced header probe on health endpoints so /diagnostics
408+
// (and /health*) can respond even when migrations or the organizations
409+
// table are slow to come up.
410+
if (isHealthEndpoint(req.path)) {
411+
return next();
412+
}
413+
399414
// Wait for migrations to complete before accessing organizations table
400415
const { waitForMigrations } = await import('./routes/index.js');
401416
await waitForMigrations();
@@ -474,6 +489,14 @@ app.use(async function (req, res, next) {
474489
});
475490

476491
app.use(async function (req, res, next) {
492+
// Skip the wallet-synced header probe for health endpoints. walletIsSynced
493+
// can hang for up to 300s when the wallet RPC is unreachable, which would
494+
// defeat the purpose of /diagnostics (and slow down /health) precisely in
495+
// the scenarios where those endpoints are most useful.
496+
if (isHealthEndpoint(req.path)) {
497+
return next();
498+
}
499+
477500
if (USE_SIMULATOR) {
478501
res.setHeader(headerKeys.WALLET_SYNCED, true);
479502
} else {
@@ -490,6 +513,38 @@ app.get('/health', (req, res) => {
490513
});
491514
});
492515

516+
// GET /diagnostics -- system-wide debug snapshot.
//
// Registered on the root app rather than under /v1 or /v2 so that it can
// report CADT, Chia, and machine status regardless of the data-model version.
// The path is listed in HEALTH_ENDPOINTS above, which lets it bypass the rate
// limiter, the startup gates, and the Chia/datalayer assertions: the endpoint
// has to stay useful exactly when those subsystems are broken.
//
// Auth: enforced by the global API-key middleware further up, which runs for
// EVERY route including HEALTH_ENDPOINTS. When CADT_API_KEY is configured the
// caller must present x-api-key before this handler is reached, so the
// constant-time check is not duplicated here.
//
// Read-only: when V1 or V2 READ_ONLY is set, sensitive fields (balances,
// transaction details, peer details, subscription IDs, home org IDs) are
// stripped, the same way wallet-health.js does for /v{1,2}/health/wallet.
//
// The handler always answers with HTTP 200; a failure while building the
// snapshot is logged and reported in the JSON body's `error` field.
app.get('/diagnostics', async (req, res) => {
  try {
    const v1Config = getConfig();
    const v2Config = getConfigV2();
    const readOnly = v2Config.READ_ONLY === true || v1Config.READ_ONLY === true;

    // Loaded lazily, on each request, rather than at module load.
    const { getDiagnosticsResponse } = await import('./routes/diagnostics.js');
    const snapshot = await getDiagnosticsResponse({ readOnly });

    return res.status(200).json(snapshot);
  } catch (error) {
    logger.error(`[diagnostics]: unexpected error building response: ${error.message}`);

    const failure = {
      timestamp: new Date().toISOString(),
      error: `Failed to build diagnostics response: ${error.message}`,
    };
    return res.status(200).json(failure);
  }
});
547+
493548
// Conditionally mount V1 and V2 routes based on config
494549
// Each version's enable flag is in its own config file
495550
const configV1 = getConfig();

0 commit comments

Comments
 (0)