fix(state): never silently downgrade WAL to DELETE on transient EIO

steveonjava · kshitijk4poor · commit 5c49cd0ed060 · 2026-05-27T14:31:55.000-07:00
apply_wal_with_fallback() treated "disk i/o error" as a permanent WAL-incompatibility marker, identical to "locking protocol" (NFS) and "not authorized" (FUSE). But EIO during PRAGMA journal_mode=WAL is typically TRANSIENT — page-cache pressure, brief lock contention, recoverable storage hiccups — not a permanent filesystem property. Treating transient EIO as a permanent downgrade signal produces the mixed-journal-mode-across-processes corruption pattern: 1. Process A opens kanban.db, hits transient EIO on the WAL pragma, silently downgrades to journal_mode=DELETE. 2. Process B (no EIO) opens the same file moments later and successfully sets journal_mode=WAL. 3. A writes rollback-journal frames while B writes WAL frames. SQLite documents this as unsupported and corrupts the file: https://www.sqlite.org/wal.html ("all connections to the same database must use the same locking protocol"). This was the root cause of repeated kanban.db corruption on hosts with multiple gateway processes plus CLI invocations against the same DB (observed pattern: corruption shortly after gateway startup, after the process logged "WAL journal_mode unsupported on this filesystem (disk I/O error) — falling back to journal_mode=DELETE"). The fallback warning told the truth — fallback DID happen — but the premise ("unsupported on this filesystem") was wrong; the EIO was a one-shot event and sibling processes successfully used WAL. Fix has two layers: 1. Remove "disk i/o error" from _WAL_INCOMPAT_MARKERS. EIO now re-raises so callers can retry instead of silently corrupting the DB. The two remaining markers ("locking protocol", "not authorized") are deterministic per filesystem so they remain safe permanent-downgrade signals. 2. Belt-and-suspenders: before downgrading on ANY marker match, peek the on-disk journal mode. If the header says WAL, refuse to downgrade and re-raise the original error. This guards against any future addition to _WAL_INCOMPAT_MARKERS turning out to be transient in some environment we haven't yet seen. Tests: - tests/test_hermes_state_wal_fallback.py: * Flipped test_falls_back_on_disk_io_error → test_reraises_on_disk_io_error asserting EIO is re-raised, not silently swallowed. * Added test_does_not_downgrade_when_disk_says_wal covering the on-disk-header safety guard for the existing legitimate markers. - tests/hermes_cli/test_kanban_db.py: * test_connect_falls_back_to_delete_on_locking_protocol now uses a truly-fresh DB (instead of the kanban_home fixture which pre-inits in WAL). On NFS the very first process touching the file legitimately downgrades; on a file already in WAL the new guard correctly refuses. A standalone reproducer lives at /tmp/kanban-stress/repro_bugD_eio_wal_downgrade.py (not committed): without fix the DB silently flips from WAL to DELETE mid-process; with fix the EIO surfaces and the file stays WAL. Refs: Bug D in the kanban-corruption investigation series (Bugs A and C shipped in ebe7374f3 and e02147d5e respectively). Bug D explains every corruption incident this week including those that survived A's single-dispatcher mitigation, because every CLI invocation is a separate process whose WAL pragma can transiently fail.
diff --git a/hermes_state.py b/hermes_state.py
@@ -54,7 +54,6 @@
 _WAL_INCOMPAT_MARKERS = (
     "locking protocol",       # SQLITE_PROTOCOL on NFS/SMB
     "not authorized",         # Some FUSE mounts block WAL pragma outright
-    "disk i/o error",         # Flaky network FS during WAL setup
 )
 
 # Last SessionDB() init error, per-process.  Surfaced in /resume and
@@ -125,6 +124,27 @@ def format_session_db_unavailable(prefix: str = "Session database not available"
     return f"{prefix}: {cause}{hint}."
 
 
+def _on_disk_journal_mode(conn: sqlite3.Connection) -> Optional[str]:
+    """Read the journal mode from the SQLite DB header on disk.
+
+    Returns the mode string (e.g. ``"wal"``, ``"delete"``), or ``None``
+    if the value cannot be determined (new DB, or PRAGMA read failed).
+    """
+    try:
+        row = conn.execute("PRAGMA journal_mode").fetchone()
+    except sqlite3.OperationalError:
+        return None
+    if row is None:
+        return None
+    mode = row[0]
+    if isinstance(mode, bytes):  # defensive: sqlite3 occasionally returns bytes
+        try:
+            mode = mode.decode("ascii")
+        except UnicodeDecodeError:
+            return None
+    return str(mode).strip().lower() if mode is not None else None
+
+
 def apply_wal_with_fallback(
     conn: sqlite3.Connection,
     *,
@@ -147,6 +167,8 @@ def apply_wal_with_fallback(
 
     Shared by :class:`SessionDB` and ``hermes_cli.kanban_db.connect`` so
     both databases get identical fallback behavior.
+
+    Never downgrades to DELETE if the on-disk DB header reports WAL — see _on_disk_journal_mode.
     """
     try:
         conn.execute("PRAGMA journal_mode=WAL")
@@ -156,6 +178,10 @@ def apply_wal_with_fallback(
         if not any(marker in msg for marker in _WAL_INCOMPAT_MARKERS):
             # Unrelated OperationalError — don't silently swallow.
             raise
+        # Don't downgrade if another process already set WAL on disk.
+        existing = _on_disk_journal_mode(conn)
+        if existing == "wal":
+            raise
         _log_wal_fallback_once(db_label, exc)
         conn.execute("PRAGMA journal_mode=DELETE")
         return "delete"
diff --git a/tests/hermes_cli/test_kanban_core_functionality.py b/tests/hermes_cli/test_kanban_core_functionality.py
@@ -36,6 +36,15 @@ def kanban_home(tmp_path, monkeypatch):
     home.mkdir()
     monkeypatch.setenv("HERMES_HOME", str(home))
     monkeypatch.setattr(Path, "home", lambda: tmp_path)
+    # Disable the detect_crashed_workers grace period for legacy tests in
+    # this file that claim a task and immediately expect
+    # ``detect_crashed_workers`` to act on it. The grace period (30s by
+    # default, see ``DEFAULT_CRASH_GRACE_SECONDS``) prevents the
+    # multi-dispatcher reap race in production; setting it to 0 here
+    # restores the pre-fix instant-reclaim semantics these tests were
+    # written against. The grace-period itself is covered by dedicated
+    # tests in tests/hermes_cli/test_kanban_db.py.
+    monkeypatch.setenv("HERMES_KANBAN_CRASH_GRACE_SECONDS", "0")
     kb.init_db()
     return home
 
diff --git a/tests/hermes_cli/test_kanban_db.py b/tests/hermes_cli/test_kanban_db.py
@@ -2097,17 +2097,31 @@ def test_latest_summaries_batch_omits_tasks_without_summary(kanban_home):
 # NFS / network-filesystem fallback (see hermes_state.apply_wal_with_fallback)
 # ---------------------------------------------------------------------------
 
-def test_connect_falls_back_to_delete_on_locking_protocol(kanban_home, caplog):
+def test_connect_falls_back_to_delete_on_locking_protocol(tmp_path, monkeypatch, caplog):
     """kanban_db.connect() must handle ``locking protocol`` on NFS/SMB.
 
     Without this fallback, the gateway's kanban dispatcher crashes every
     60s and the kanban migration (``consecutive_failures`` ADD COLUMN) is
     retried forever — which is what the real-world user report shows
     (see hermes-agent issue #22032).
+
+    NOTE: We do NOT use the ``kanban_home`` fixture here because that
+    fixture pre-initializes the DB via ``kb.init_db()`` — putting the
+    file in WAL on disk. The Bug D safety guard now refuses to downgrade
+    to DELETE when the on-disk header is already WAL, so testing the
+    NFS-fallback path requires a truly-fresh DB file (NFS scenario in
+    production: first connection of the first process ever to touch the
+    file, where downgrading is safe because nobody else has WAL state
+    yet).
     """
     import sqlite3 as _sqlite3
     from unittest.mock import patch as _patch
 
+    home = tmp_path / ".hermes"
+    home.mkdir()
+    monkeypatch.setenv("HERMES_HOME", str(home))
+    monkeypatch.setattr(Path, "home", lambda: tmp_path)
+
     # Clear module cache so a fresh connect() is attempted
     kb._INITIALIZED_PATHS.clear()
 
diff --git a/tests/test_hermes_state_wal_fallback.py b/tests/test_hermes_state_wal_fallback.py
@@ -110,15 +110,72 @@ def test_falls_back_on_not_authorized(self, tmp_path):
         assert mode == "delete"
         conn.close()
 
-    def test_falls_back_on_disk_io_error(self, tmp_path):
-        """Flaky network FS → disk I/O error → still fall back."""
+    def test_reraises_on_disk_io_error(self, tmp_path):
+        """Transient EIO from ``PRAGMA journal_mode=WAL`` must NOT silently
+        downgrade to DELETE.
+
+        Regression for "Bug D": treating transient EIO as a permanent
+        WAL-incompat marker produced the mixed-journal-mode-across-processes
+        corruption pattern (process A downgrades to DELETE, sibling
+        processes successfully set WAL, SQLite corrupts the file because
+        the two locking protocols are documented as incompatible). EIO is
+        usually transient (page-cache pressure, lock contention, brief
+        storage hiccups); the right behavior is to re-raise so the caller
+        can retry, not to walk the DB into a permanently downgraded state.
+        """
         conn, _ = _open_blocking(
             tmp_path / "flaky.db", reason="disk I/O error", isolation_level=None
         )
-        mode = apply_wal_with_fallback(conn)
-        assert mode == "delete"
+        with pytest.raises(sqlite3.OperationalError, match="disk I/O error"):
+            apply_wal_with_fallback(conn)
         conn.close()
 
+    def test_does_not_downgrade_when_disk_says_wal(self, tmp_path):
+        """Belt-and-suspenders: even if a marker matches, refuse to
+        downgrade when the on-disk DB header is already WAL.
+
+        Prevents a future addition to ``_WAL_INCOMPAT_MARKERS`` from
+        accidentally reintroducing the mixed-journal-mode corruption
+        pattern. We construct a DB already in WAL on disk, then open a
+        new connection whose ``PRAGMA journal_mode=WAL`` raises one of
+        the legit markers — the function must still re-raise (refusing
+        the downgrade) because the on-disk file is WAL.
+        """
+        # Prime the file in WAL mode using a normal connection
+        primer = sqlite3.connect(
+            str(tmp_path / "already-wal.db"), isolation_level=None
+        )
+        try:
+            primer.execute("PRAGMA journal_mode=WAL")
+            primer.execute("CREATE TABLE t (x INTEGER)")
+            primer.execute("INSERT INTO t VALUES (1)")
+            assert (
+                primer.execute("PRAGMA journal_mode").fetchone()[0].lower() == "wal"
+            )
+        finally:
+            primer.close()
+
+        # New connection whose WAL pragma raises "locking protocol" — a
+        # marker that WOULD normally trigger downgrade. With the on-disk
+        # guard, we must instead re-raise.
+        conn, _ = _open_blocking(
+            tmp_path / "already-wal.db",
+            reason="locking protocol",
+            isolation_level=None,
+        )
+        with pytest.raises(sqlite3.OperationalError, match="locking protocol"):
+            apply_wal_with_fallback(conn)
+        conn.close()
+
+        # And the file is STILL WAL on disk — nothing got rewritten
+        check = sqlite3.connect(str(tmp_path / "already-wal.db"))
+        try:
+            assert (
+                check.execute("PRAGMA journal_mode").fetchone()[0].lower() == "wal"
+            )
+        finally:
+            check.close()
+
     def test_reraises_unrelated_operational_error(self, tmp_path):
         """Non-WAL-compat errors must NOT be silently swallowed by the fallback."""
         conn, _ = _open_blocking(