Skip to content

Commit 3a89ee4

Browse files
committed
fix(upgrade): pin base image via pull-then-inspect and rebuild stale sandboxes (#1904)
Re-implements the digest pinning and upgrade-sandboxes features reverted in #1938.

The original PR (#1937) broke all e2e tests because it read a digest from blueprint.yaml (belongs to ghcr.io/nvidia/openshell-community) and applied it to ghcr.io/nvidia/nemoclaw/sandbox-base — a different registry. Docker digests are repo-specific, so every pull returned "manifest unknown".

This fix never reads blueprint.yaml for the base image digest. Instead:
- pullAndResolveBaseImageDigest() pulls sandbox-base:latest from GHCR
- docker inspect extracts the actual repo digest
- patchStagedDockerfile() pins ARG BASE_IMAGE to the inspected digest

The digest is self-consistent by construction — it always comes from the same registry we pin to. Falls back to unpinned :latest when GHCR is unreachable (offline/firewall users).

Also re-adds the upgrade-sandboxes command with fixes from code review:
- --auto flag for non-interactive installer contexts
- sandbox list failure handling before classifying sandboxes
- throwOnError option for sandboxRebuild to prevent process.exit in loops

Closes #1904

Signed-off-by: Test User <test@example.com>
Signed-off-by: Aaron Erickson <aerickson@nvidia.com>
1 parent bb5fdb4 commit 3a89ee4

5 files changed

Lines changed: 514 additions & 6 deletions

File tree

scripts/install.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1265,6 +1265,12 @@ except Exception:
12651265
if run_installer_host_preflight; then
12661266
run_onboard
12671267
ONBOARD_RAN=true
1268+
# After onboard, check for stale sandboxes that need rebuilding (#1904).
1269+
# Uses --auto so it runs non-interactively in piped/CI contexts.
1270+
if [ "${_has_sandboxes:-0}" -gt 0 ] 2>/dev/null && command_exists nemoclaw; then
1271+
info "Checking for sandboxes that need upgrading…"
1272+
nemoclaw upgrade-sandboxes --auto 2>&1 || warn "Sandbox upgrade check failed (non-fatal)."
1273+
fi
12681274
else
12691275
warn "Skipping onboarding until the host prerequisites above are fixed."
12701276
fi

src/lib/onboard.ts

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -454,6 +454,48 @@ function getBlueprintMaxOpenshellVersion(rootDir = ROOT) {
454454
return getBlueprintVersionField("max_openshell_version", rootDir);
455455
}
456456

457+
// ── Base image digest resolution ────────────────────────────────
458+
// Pulls the sandbox-base image from GHCR and inspects it to get the
459+
// actual repo digest. This avoids the registry mismatch that broke
460+
// e2e tests in #1937 — the digest always comes from the same registry
461+
// we're pinning to. See #1904.
462+
463+
const SANDBOX_BASE_IMAGE = "ghcr.io/nvidia/nemoclaw/sandbox-base";
464+
const SANDBOX_BASE_TAG = "latest";
465+
466+
/**
 * Pull sandbox-base:latest from GHCR and resolve its repo digest.
 *
 * The digest is read from `docker inspect` of the image that was just
 * pulled, and only a RepoDigests entry whose repository is exactly
 * SANDBOX_BASE_IMAGE is accepted. A local image can carry several repo
 * digests (for example when the same image was also pulled through a
 * mirror); taking "whichever comes first" could pair a foreign digest with
 * our repository and reproduce the "manifest unknown" failure from #1937.
 *
 * @returns {{ digest: string, ref: string } | null} `digest` is
 *   "sha256:<64 hex>", `ref` is "<SANDBOX_BASE_IMAGE>@<digest>". Returns
 *   null when the pull fails, the inspect fails, or no digest for
 *   SANDBOX_BASE_IMAGE is present (offline, GHCR outage, local-only build).
 */
function pullAndResolveBaseImageDigest() {
  const imageWithTag = `${SANDBOX_BASE_IMAGE}:${SANDBOX_BASE_TAG}`;
  try {
    run(["docker", "pull", imageWithTag], { suppressOutput: true });
  } catch {
    // Pull failed — caller should fall back to unpinned :latest
    return null;
  }

  let inspectOutput;
  try {
    // Ask for the whole RepoDigests list as JSON instead of element 0 so the
    // entry belonging to our repository can be selected explicitly.
    inspectOutput = runCapture(
      ["docker", "inspect", "--format", "{{json .RepoDigests}}", imageWithTag],
      { ignoreError: false },
    );
  } catch {
    return null;
  }

  let repoDigests;
  try {
    repoDigests = JSON.parse(String(inspectOutput || "").trim());
  } catch {
    // Empty or non-JSON output: treat the same as "no digest available".
    return null;
  }
  if (!Array.isArray(repoDigests)) return null;

  // Entries look like "ghcr.io/nvidia/nemoclaw/sandbox-base@sha256:abc123..."
  const repoPrefix = `${SANDBOX_BASE_IMAGE}@`;
  const digestPattern = /^sha256:[0-9a-f]{64}$/;
  for (const entry of repoDigests) {
    if (typeof entry !== "string" || !entry.startsWith(repoPrefix)) continue;
    const digest = entry.slice(repoPrefix.length);
    if (digestPattern.test(digest)) {
      return { digest, ref: `${SANDBOX_BASE_IMAGE}@${digest}` };
    }
  }
  return null;
}
498+
457499
function getStableGatewayImageRef(versionOutput = null) {
458500
const version = getInstalledOpenshellVersion(versionOutput);
459501
if (!version) return null;
@@ -991,10 +1033,17 @@ function patchStagedDockerfile(
9911033
messagingChannels = [],
9921034
messagingAllowedIds = {},
9931035
discordGuilds = {},
1036+
baseImageRef = null,
9941037
) {
9951038
const { providerKey, primaryModelRef, inferenceBaseUrl, inferenceApi, inferenceCompat } =
9961039
getSandboxInferenceConfig(model, provider, preferredInferenceApi);
9971040
let dockerfile = fs.readFileSync(dockerfilePath, "utf8");
1041+
// Pin the base image to a specific digest when available (#1904).
1042+
// The ref must come from pullAndResolveBaseImageDigest() — never from
1043+
// blueprint.yaml, whose digest belongs to a different registry.
1044+
if (baseImageRef) {
1045+
dockerfile = dockerfile.replace(/^ARG BASE_IMAGE=.*$/m, `ARG BASE_IMAGE=${baseImageRef}`);
1046+
}
9981047
dockerfile = dockerfile.replace(/^ARG NEMOCLAW_MODEL=.*$/m, `ARG NEMOCLAW_MODEL=${model}`);
9991048
dockerfile = dockerfile.replace(
10001049
/^ARG NEMOCLAW_PROVIDER_KEY=.*$/m,
@@ -2734,6 +2783,15 @@ async function createSandbox(
27342783
};
27352784
}
27362785
}
2786+
// Pull the base image and resolve its digest so the Dockerfile is pinned to
2787+
// exactly what we just fetched. This prevents stale :latest tags from
2788+
// silently reusing a cached old image after NemoClaw upgrades (#1904).
2789+
const resolved = pullAndResolveBaseImageDigest();
2790+
if (resolved) {
2791+
console.log(` Pinning base image to ${resolved.digest.slice(0, 19)}...`);
2792+
} else {
2793+
console.warn(" Warning: could not pull base image from registry; using cached :latest.");
2794+
}
27372795
patchStagedDockerfile(
27382796
stagedDockerfile,
27392797
model,
@@ -2745,6 +2803,7 @@ async function createSandbox(
27452803
activeMessagingChannels,
27462804
messagingAllowedIds,
27472805
discordGuilds,
2806+
resolved ? resolved.ref : null,
27482807
);
27492808
// Only pass non-sensitive env vars to the sandbox. Credentials flow through
27502809
// OpenShell providers — the gateway injects them as placeholders and the L7
@@ -5468,6 +5527,9 @@ module.exports = {
54685527
getInstalledOpenshellVersion,
54695528
getBlueprintMinOpenshellVersion,
54705529
getBlueprintMaxOpenshellVersion,
5530+
pullAndResolveBaseImageDigest,
5531+
SANDBOX_BASE_IMAGE,
5532+
SANDBOX_BASE_TAG,
54715533
versionGte,
54725534
getRequestedModelHint,
54735535
getRequestedProviderHint,

src/nemoclaw.ts

Lines changed: 114 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ const GLOBAL_COMMANDS = new Set([
8080
"uninstall",
8181
"credentials",
8282
"backup-all",
83+
"upgrade-sandboxes",
8384
"help",
8485
"--help",
8586
"-h",
@@ -1601,24 +1602,33 @@ function _rebuildLog(msg) {
16011602
console.error(` ${D}[rebuild ${new Date().toISOString()}] ${msg}${R}`);
16021603
}
16031604

1604-
async function sandboxRebuild(sandboxName, args = []) {
1605+
async function sandboxRebuild(sandboxName, args = [], opts = {}) {
16051606
const verbose =
16061607
args.includes("--verbose") ||
16071608
args.includes("-v") ||
16081609
process.env.NEMOCLAW_REBUILD_VERBOSE === "1";
16091610
const log = verbose ? _rebuildLog : () => {};
16101611
const skipConfirm = args.includes("--yes") || args.includes("--force");
1612+
// When called from upgradeSandboxes in a loop, throwOnError prevents
1613+
// process.exit from aborting the entire batch on the first failure.
1614+
const bail = opts.throwOnError
1615+
? (msg, code = 1) => {
1616+
throw new Error(msg);
1617+
}
1618+
: (_msg, code = 1) => process.exit(code);
16111619
const sb = registry.getSandbox(sandboxName);
16121620
if (!sb) {
16131621
console.error(` Sandbox '${sandboxName}' not found in registry.`);
1614-
process.exit(1);
1622+
bail(`Sandbox '${sandboxName}' not found in registry.`);
1623+
return;
16151624
}
16161625

16171626
// Multi-agent guard (temporary — until swarm lands)
16181627
if (sb.agents && sb.agents.length > 1) {
16191628
console.error(" Multi-agent sandbox rebuild is not yet supported.");
16201629
console.error(" Back up state manually and recreate with `nemoclaw onboard`.");
1621-
process.exit(1);
1630+
bail("Multi-agent sandbox rebuild is not yet supported.");
1631+
return;
16221632
}
16231633

16241634
const agent = agentRuntime.getSessionAgent(sandboxName);
@@ -1660,7 +1670,8 @@ async function sandboxRebuild(sandboxName, args = []) {
16601670
if (!liveNames.has(sandboxName)) {
16611671
console.error(` Sandbox '${sandboxName}' is not running. Cannot back up state.`);
16621672
console.error(" Start it first or recreate with `nemoclaw onboard --recreate-sandbox`.");
1663-
process.exit(1);
1673+
bail(`Sandbox '${sandboxName}' is not running.`);
1674+
return;
16641675
}
16651676

16661677
// Step 2: Backup
@@ -1679,7 +1690,8 @@ async function sandboxRebuild(sandboxName, args = []) {
16791690
console.error(` Failed: ${backup.failedDirs.join(", ")}`);
16801691
}
16811692
console.error(" Aborting rebuild to prevent data loss.");
1682-
process.exit(1);
1693+
bail("Failed to back up sandbox state.");
1694+
return;
16831695
}
16841696
console.log(` ${G}\u2713${R} State backed up (${backup.backedUpDirs.length} directories)`);
16851697
console.log(` Backup: ${backup.manifest.backupPath}`);
@@ -1705,7 +1717,8 @@ async function sandboxRebuild(sandboxName, args = []) {
17051717
if (deleteResult.status !== 0 && !alreadyGone) {
17061718
console.error(" Failed to delete sandbox. Aborting rebuild.");
17071719
console.error(" State backup is preserved at: " + backup.manifest.backupPath);
1708-
process.exit(deleteResult.status || 1);
1720+
bail("Failed to delete sandbox.", deleteResult.status || 1);
1721+
return;
17091722
}
17101723
registry.removeSandbox(sandboxName);
17111724
log(
@@ -1813,6 +1826,95 @@ async function sandboxRebuild(sandboxName, args = []) {
18131826
}
18141827
}
18151828

1829+
// ── Upgrade sandboxes (#1904) ────────────────────────────────────
1830+
// Detect sandboxes running stale agent versions and offer to rebuild them.
1831+
1832+
/**
 * Detect registered sandboxes whose agent version is stale and rebuild the
 * ones that are currently running.
 *
 * Flags read from `args`:
 *   --check  list stale sandboxes and return without rebuilding
 *   --auto   rebuild without prompting (non-interactive installer contexts)
 *   --yes    rebuild without prompting
 *
 * Stopped stale sandboxes are reported and skipped. Each rebuild runs with
 * `throwOnError: true` so one failure does not abort the remaining rebuilds.
 *
 * Sets `process.exitCode = 1` when the live sandbox list cannot be queried
 * or when at least one rebuild fails, so callers such as
 * `nemoclaw upgrade-sandboxes --auto || warn ...` can detect the failure.
 *
 * @param {string[]} [args=[]] Command-line arguments following the command name.
 * @returns {Promise<void>}
 */
async function upgradeSandboxes(args = []) {
  const checkOnly = args.includes("--check");
  const auto = args.includes("--auto");
  const skipConfirm = auto || args.includes("--yes");

  const sandboxes = registry.listSandboxes().sandboxes;
  if (sandboxes.length === 0) {
    console.log("  No sandboxes found in the registry.");
    return;
  }

  // Query live sandboxes so we can tell the user which are running
  const liveResult = captureOpenshell(["sandbox", "list"], { ignoreError: true });
  if (liveResult.status !== 0) {
    console.error("  Failed to query running sandboxes from OpenShell.");
    console.error("  Ensure OpenShell is running: openshell status");
    // Surface the failure through the exit status instead of exiting 0.
    process.exitCode = 1;
    return;
  }
  const liveNames = parseLiveSandboxNames(liveResult.output || "");

  // Classify sandboxes as stale or current
  const stale = [];
  for (const sb of sandboxes) {
    const versionCheck = sandboxVersion.checkAgentVersion(sb.name);
    if (versionCheck.isStale) {
      stale.push({
        name: sb.name,
        current: versionCheck.sandboxVersion,
        expected: versionCheck.expectedVersion,
        running: liveNames.has(sb.name),
      });
    }
  }

  if (stale.length === 0) {
    console.log("  All sandboxes are up to date.");
    return;
  }

  console.log(`\n  ${B}Stale sandboxes:${R}`);
  for (const s of stale) {
    const status = s.running ? `${G}running${R}` : `${D}stopped${R}`;
    console.log(`    ${s.name}  v${s.current || "?"} → v${s.expected}  (${status})`);
  }
  console.log("");

  if (checkOnly) {
    console.log(`  ${stale.length} sandbox(es) need upgrading.`);
    console.log("  Run `nemoclaw upgrade-sandboxes` to rebuild them.");
    return;
  }

  // Only running sandboxes are rebuilt; stopped ones are reported and skipped.
  const rebuildable = stale.filter((s) => s.running);
  const stopped = stale.filter((s) => !s.running);
  if (stopped.length > 0) {
    console.log(`  ${D}Skipping ${stopped.length} stopped sandbox(es) — start them first.${R}`);
  }
  if (rebuildable.length === 0) {
    console.log("  No running stale sandboxes to rebuild.");
    return;
  }

  let rebuilt = 0;
  let failed = 0;
  for (const s of rebuildable) {
    if (!skipConfirm) {
      const answer = await askPrompt(`  Rebuild '${s.name}'? [y/N]: `);
      const reply = answer.trim().toLowerCase();
      if (reply !== "y" && reply !== "yes") {
        console.log(`  Skipped '${s.name}'.`);
        continue;
      }
    }
    try {
      // Confirmation was already handled above, so always pass --yes.
      await sandboxRebuild(s.name, ["--yes"], { throwOnError: true });
      rebuilt++;
    } catch (err) {
      console.error(`  ${YW}\u26a0${R} Failed to rebuild '${s.name}': ${err.message}`);
      failed++;
    }
  }

  console.log("");
  if (rebuilt > 0) console.log(`  ${G}\u2713${R} ${rebuilt} sandbox(es) rebuilt.`);
  if (failed > 0) {
    console.log(`  ${YW}\u26a0${R} ${failed} sandbox(es) failed — see errors above.`);
    // Report partial failure to the caller without aborting the process here.
    process.exitCode = 1;
  }
}
1917+
18161918
// ── Pre-upgrade backup ───────────────────────────────────────────
18171919

18181920
// ── Snapshot ─────────────────────────────────────────────────────
@@ -2033,6 +2135,9 @@ function help() {
20332135
${G}Backup:${R}
20342136
nemoclaw backup-all Back up all sandbox state before upgrade
20352137
2138+
${G}Upgrade:${R}
2139+
nemoclaw upgrade-sandboxes Detect and rebuild stale sandboxes ${D}(--check, --auto)${R}
2140+
20362141
Cleanup:
20372142
nemoclaw uninstall [flags] Run uninstall.sh (local only; no remote fallback)
20382143
@@ -2098,6 +2203,9 @@ const [cmd, ...args] = process.argv.slice(2);
20982203
case "backup-all":
20992204
backupAll();
21002205
break;
2206+
case "upgrade-sandboxes":
2207+
await upgradeSandboxes(args);
2208+
break;
21012209
case "--version":
21022210
case "-v": {
21032211
console.log(`nemoclaw v${getVersion()}`);

0 commit comments

Comments
 (0)