Skip to content

Commit f50954f

Browse files
Clevin Canales and claude
authored and committed
feat(orphans): add gbrain orphans command for finding under-connected pages
Surfaces pages with zero inbound wikilinks. Essential for content enrichment cycles in KBs with 1000+ pages. By default filters out auto-generated pages, raw sources, and pseudo-pages where no inbound links is expected; --include-pseudo to disable. Supports text (grouped by domain), --json, --count outputs. Also exposed as find_orphans MCP operation. Tests cover basic detection, filtering, all output modes. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 0992b72 commit f50954f

5 files changed

Lines changed: 458 additions & 14 deletions

File tree

src/cli.ts

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@ for (const op of operations) {
1818
}
1919

2020
// CLI-only commands that bypass the operation layer
21-
const CLI_ONLY = new Set(['init', 'upgrade', 'post-upgrade', 'check-update', 'integrations', 'publish', 'check-backlinks', 'lint', 'report', 'import', 'export', 'files', 'embed', 'serve', 'call', 'config', 'doctor', 'migrate', 'eval', 'sync', 'extract', 'features', 'autopilot', 'graph-query', 'jobs', 'apply-migrations', 'skillpack-check']);
21+
const CLI_ONLY = new Set(['init', 'upgrade', 'post-upgrade', 'check-update', 'integrations', 'publish', 'check-backlinks', 'lint', 'report', 'import', 'export', 'files', 'embed', 'serve', 'call', 'config', 'doctor', 'migrate', 'eval', 'sync', 'extract', 'features', 'autopilot', 'graph-query', 'jobs', 'apply-migrations', 'skillpack-check', 'orphans']);
2222

2323
async function main() {
2424
const args = process.argv.slice(2);
@@ -412,6 +412,11 @@ async function handleCliOnly(command: string, args: string[]) {
412412
await runGraphQuery(engine, args);
413413
break;
414414
}
415+
case 'orphans': {
416+
const { runOrphans } = await import('./commands/orphans.ts');
417+
await runOrphans(engine, args);
418+
break;
419+
}
415420
}
416421
} finally {
417422
if (command !== 'serve') await engine.disconnect();
@@ -520,6 +525,7 @@ TOOLS
520525
publish <page.md> [--password] Shareable HTML (strips private data, optional AES-256)
521526
check-backlinks <check|fix> [dir] Find/fix missing back-links across brain
522527
lint <dir|file> [--fix] Catch LLM artifacts, placeholder dates, bad frontmatter
528+
orphans [--json] [--count] Find pages with no inbound wikilinks
523529
report --type <name> --content ... Save timestamped report to brain/reports/
524530
525531
JOBS (Minions)

src/commands/extract.ts

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -156,19 +156,7 @@ export function resolveSlug(fileDir: string, relTarget: string, allSlugs: Set<st
156156
return null;
157157
}
158158

159-
/**
 * Infer a link type from the top-level directories of the two pages.
 *
 * Only the first `/`-separated segment of each directory is considered.
 *
 * @param fromDir - Directory of the page that contains the link.
 * @param toDir - Directory of the page being linked to.
 * @param frontmatter - Optional frontmatter of the source page; an array-valued
 *   `founded` field turns a people → companies link into `'founded'`.
 * @returns One of `'founded'`, `'works_at'`, `'involved_in'`, `'deal_for'`,
 *   `'attendee'`, or the fallback `'mention'`.
 */
function inferLinkType(fromDir: string, toDir: string, frontmatter?: Record<string, unknown>): string {
  const topLevel = (dir: string) => dir.split('/')[0];
  const source = topLevel(fromDir);
  const target = topLevel(toDir);

  switch (source) {
    case 'people':
      if (target === 'companies') {
        return Array.isArray(frontmatter?.founded) ? 'founded' : 'works_at';
      }
      if (target === 'deals') return 'involved_in';
      break;
    case 'deals':
      if (target === 'companies') return 'deal_for';
      break;
    case 'meetings':
      if (target === 'people') return 'attendee';
      break;
  }

  // Any other directory pairing is treated as a plain mention.
  return 'mention';
}
159+
// inferLinkType is now imported from ../core/link-extraction.ts (v0.12.0 canonical extractor)
172160

173161
/** Extract links from frontmatter fields */
174162
function extractFrontmatterLinks(slug: string, fm: Record<string, unknown>): ExtractedLink[] {

src/commands/orphans.ts

Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
/**
2+
* gbrain orphans — Surface pages with no inbound wikilinks.
3+
*
4+
* Deterministic: zero LLM calls. Queries the links table for pages with
5+
* no entries where to_page_id = pages.id. By default filters out
6+
* auto-generated pages and pseudo-pages where no inbound links is expected.
7+
*
8+
* Usage:
9+
* gbrain orphans # list orphans grouped by domain
10+
* gbrain orphans --json # JSON output for agent consumption
11+
* gbrain orphans --count # just the number
12+
* gbrain orphans --include-pseudo # include auto-generated/pseudo pages
13+
*/
14+
15+
import type { BrainEngine } from '../core/engine.ts';
16+
import * as db from '../core/db.ts';
17+
18+
// --- Types ---
19+
20+
/** One page that has no inbound links, as reported to the caller. */
export interface OrphanPage {
  /** Page slug. */
  slug: string;
  /** Page title. */
  title: string;
  /** Grouping key used by the text output. */
  domain: string;
}
25+
26+
/** Orphan report: the orphan list plus the counters shown in the summary line. */
export interface OrphanResult {
  /** Orphan pages that survived filtering. */
  orphans: Array<OrphanPage>;
  /** Number of entries in `orphans`. */
  total_orphans: number;
  /** Pages counted as linkable (the "out of M" figure in the summary line). */
  total_linkable: number;
  /** Total number of pages. */
  total_pages: number;
  /** Number of pages dropped from the report by the default filter. */
  excluded: number;
}
33+
34+
// --- Filter constants ---
35+
36+
/** Slug endings that identify auto-generated files. */
const AUTO_SUFFIX_PATTERNS = ['/_index', '/log'];

/** Exact slugs that are pseudo-pages by convention. */
const PSEUDO_SLUGS = new Set(['_atlas', '_index', '_stats', '_orphans', '_scratch', 'claude']);

/**
 * Path fragment that marks raw sources. Because of the leading slash it only
 * matches a `raw` segment that has a parent segment; a slug that starts with
 * `raw/` is not matched by this check.
 * NOTE(review): confirm that top-level `raw/...` slugs are meant to be reported.
 */
const RAW_SEGMENT = '/raw/';

/** Slug prefixes under which having no inbound links is expected. */
const DENY_PREFIXES = [
  'output/',
  'dashboards/',
  'scripts/',
  'templates/',
  'openclaw/config/',
];

/** First slug segments under which having no inbound links is expected. */
const FIRST_SEGMENT_EXCLUSIONS = new Set(['scratch', 'thoughts', 'catalog', 'entities']);

// --- Filter logic ---

/**
 * Decide whether a slug is left out of orphan reporting by default.
 *
 * A slug is excluded when any of the following holds:
 * - it is exactly one of `PSEUDO_SLUGS`;
 * - it ends with one of `AUTO_SUFFIX_PATTERNS`;
 * - it contains `RAW_SEGMENT`;
 * - it starts with one of `DENY_PREFIXES`;
 * - its first `/`-separated segment is in `FIRST_SEGMENT_EXCLUSIONS`.
 *
 * @param slug - Page slug to classify.
 * @returns `true` when the page should be hidden from the default report.
 */
export function shouldExclude(slug: string): boolean {
  // Everything before the first slash, or the whole slug when there is none.
  const slashAt = slug.indexOf('/');
  const head = slashAt === -1 ? slug : slug.slice(0, slashAt);

  return (
    PSEUDO_SLUGS.has(slug) ||
    AUTO_SUFFIX_PATTERNS.some(suffix => slug.endsWith(suffix)) ||
    slug.includes(RAW_SEGMENT) ||
    DENY_PREFIXES.some(prefix => slug.startsWith(prefix)) ||
    FIRST_SEGMENT_EXCLUSIONS.has(head)
  );
}
86+
87+
/**
 * Pick the domain a page is grouped under.
 *
 * @param frontmatterDomain - Domain value from the page's frontmatter, if any.
 * @param slug - Page slug, used as the fallback source.
 * @returns The trimmed frontmatter domain when it is a non-blank string;
 *   otherwise the first `/`-separated segment of `slug`, or `'root'` when that
 *   segment is empty.
 */
export function deriveDomain(frontmatterDomain: string | null | undefined, slug: string): string {
  const explicit = typeof frontmatterDomain === 'string' ? frontmatterDomain.trim() : '';
  if (explicit !== '') {
    return explicit;
  }

  const [head] = slug.split('/');
  return head || 'root';
}
96+
97+
// --- Core query ---
98+
99+
/**
 * Fetch every page that is not the target of any row in `links`.
 *
 * No slug filtering happens here: all unlinked pages are returned, ordered by
 * slug. `title` falls back to the slug when the stored title is NULL, and
 * `domain` is read from the `domain` key of the page's frontmatter (NULL when
 * absent).
 *
 * @returns The matching rows as returned by the database.
 */
export async function queryOrphanPages(): Promise<{ slug: string; title: string; domain: string | null }[]> {
  type UnlinkedRow = { slug: string; title: string; domain: string | null };

  const sql = db.getConnection();
  const result = await sql`
    SELECT
      p.slug,
      COALESCE(p.title, p.slug) AS title,
      p.frontmatter->>'domain' AS domain
    FROM pages p
    WHERE NOT EXISTS (
      SELECT 1 FROM links l WHERE l.to_page_id = p.id
    )
    ORDER BY p.slug
  `;

  return result as UnlinkedRow[];
}
118+
119+
/**
 * Build the orphan report: pages with no inbound links, plus summary counters.
 *
 * Counter semantics:
 * - `total_pages` is the row count of `pages`.
 * - `excluded` is the number of unlinked pages removed by `shouldExclude`. It
 *   is always 0 when `includePseudo` is true. Pages that do have inbound links
 *   are never run through the filter, so they are never counted here.
 * - `total_linkable` is `total_pages - excluded`.
 * - `total_orphans` is the length of the returned `orphans` list.
 *
 * @param includePseudo - When true, skip the default slug filter and report
 *   every unlinked page.
 * @returns The structured report consumed by the CLI formatters and the
 *   `find_orphans` operation.
 */
export async function findOrphans(includePseudo: boolean = false): Promise<OrphanResult> {
  const unlinked = await queryOrphanPages();

  // Total page count for the summary line.
  const sql = db.getConnection();
  const [{ count }] = await sql`SELECT count(*)::int AS count FROM pages`;
  const totalPages = Number(count);

  const reported = includePseudo
    ? unlinked
    : unlinked.filter(row => !shouldExclude(row.slug));
  const excluded = unlinked.length - reported.length;

  const orphans: OrphanPage[] = reported.map(({ slug, title, domain }) => ({
    slug,
    title,
    domain: deriveDomain(domain, slug),
  }));

  return {
    orphans,
    total_orphans: orphans.length,
    // Same value as reported.length + (totalPages - unlinked.length).
    total_linkable: totalPages - excluded,
    total_pages: totalPages,
    excluded,
  };
}
152+
153+
// --- Output formatters ---
154+
155+
/**
 * Render an orphan report as human-readable text.
 *
 * The first line is the summary, followed by a blank line. When there are no
 * orphans a single "No orphan pages found." line follows. Otherwise pages are
 * grouped under `[domain]` headings: domains are sorted with the default
 * string sort, pages within a domain by `localeCompare` on slug, and groups
 * are separated by a blank line. Trailing whitespace is trimmed from the
 * grouped output.
 *
 * @param result - Report produced by `findOrphans`.
 * @returns The formatted text, without a trailing newline.
 */
export function formatOrphansText(result: OrphanResult): string {
  const summary = `${result.total_orphans} orphans out of ${result.total_linkable} linkable pages (${result.total_pages} total; ${result.excluded} excluded)\n`;

  if (result.orphans.length === 0) {
    return [summary, 'No orphan pages found.'].join('\n');
  }

  // Bucket pages by domain.
  const groups = new Map<string, OrphanPage[]>();
  for (const orphan of result.orphans) {
    const bucket = groups.get(orphan.domain);
    if (bucket) {
      bucket.push(orphan);
    } else {
      groups.set(orphan.domain, [orphan]);
    }
  }

  const out: string[] = [summary];
  for (const domain of Array.from(groups.keys()).sort()) {
    const members = groups.get(domain) || [];
    members.sort((a, b) => a.slug.localeCompare(b.slug));

    out.push(`[${domain}]`);
    for (const { slug, title } of members) {
      out.push(` ${slug} ${title}`);
    }
    out.push('');
  }

  return out.join('\n').trimEnd();
}
189+
190+
// --- CLI entry point ---
191+
192+
/**
 * CLI entry point for `gbrain orphans`.
 *
 * Recognised flags:
 * - `--help` / `-h`: print usage and return without querying anything.
 * - `--include-pseudo`: disable the default slug filter.
 * - `--count`: print only the orphan count (takes precedence over `--json`).
 * - `--json`: print the full report as pretty-printed JSON.
 * With neither output flag, the grouped text report is printed.
 *
 * @param _engine - Engine handle supplied by the CLI dispatcher; this function
 *   does not use it.
 * @param args - Raw command-line arguments following the command name.
 */
export async function runOrphans(_engine: BrainEngine, args: string[]) {
  // Help is answered first so that it never triggers a database query.
  if (args.includes('--help') || args.includes('-h')) {
    console.log(`Usage: gbrain orphans [options]

Find pages with no inbound wikilinks.

Options:
  --json            Output as JSON (for agent consumption)
  --count           Output just the number of orphans
  --include-pseudo  Include auto-generated and pseudo pages in results
  --help, -h        Show this help

Output (default): grouped by domain, sorted alphabetically within each group
Summary line: N orphans out of M linkable pages (K total; K-M excluded)
`);
    return;
  }

  const result = await findOrphans(args.includes('--include-pseudo'));

  if (args.includes('--count')) {
    console.log(String(result.total_orphans));
    return;
  }

  if (args.includes('--json')) {
    console.log(JSON.stringify(result, null, 2));
    return;
  }

  console.log(formatOrphansText(result));
}

src/core/operations.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1082,6 +1082,24 @@ const send_job_message: Operation = {
10821082
},
10831083
};
10841084

1085+
// --- Orphans ---
1086+
1087+
/**
 * `find_orphans` operation: reports pages that have no inbound wikilinks.
 *
 * The handler imports the orphans command module dynamically and returns the
 * result of its `findOrphans` function. `include_pseudo` is optional; any
 * missing or falsy value keeps the default filtering.
 */
const find_orphans: Operation = {
  name: 'find_orphans',
  description: 'Find pages with no inbound wikilinks. Essential for content enrichment cycles.',
  params: {
    include_pseudo: {
      type: 'boolean',
      description: 'Include auto-generated and pseudo pages (default: false)',
    },
  },
  handler: async (_ctx, p) => {
    const orphansModule = await import('../commands/orphans.ts');
    // Coerce by truthiness, so an absent parameter becomes false.
    return orphansModule.findOrphans(Boolean(p.include_pseudo));
  },
  cliHints: { name: 'orphans', hidden: true },
};
1102+
10851103
// --- Exports ---
10861104

10871105
export const operations: Operation[] = [
@@ -1110,6 +1128,8 @@ export const operations: Operation[] = [
11101128
// Jobs (Minions)
11111129
submit_job, get_job, list_jobs, cancel_job, retry_job, get_job_progress,
11121130
pause_job, resume_job, replay_job, send_job_message,
1131+
// Orphans
1132+
find_orphans,
11131133
];
11141134

11151135
export const operationsByName = Object.fromEntries(

0 commit comments

Comments
 (0)