Skip to content

Commit dc98314

Browse files
steveonjavakshitijk4poor
authored andcommitted
fix(kanban): skip redundant WAL pragma on already-WAL connections
apply_wal_with_fallback() issued PRAGMA journal_mode=WAL on every call, including connections to DBs already in WAL mode. This triggered the WAL init code path, causing SQLite to acquire EXCLUSIVE, checkpoint, and unlink kanban.db-{wal,shm}. Other open connections received (deleted) FDs and raised sqlite3.OperationalError: disk I/O error. Add a cheap read probe (PRAGMA journal_mode, no flock/checkpoint/unlink) before the set-pragma path. If already wal, return early. The set-pragma and DELETE fallback paths are unchanged. Closes #31158. Addresses root cause that PRs #32226 and #32322 attempted via connection-sharing/caching approaches.
1 parent ffdc937 commit dc98314

3 files changed

Lines changed: 251 additions & 15 deletions

File tree

hermes_state.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,15 @@ def apply_wal_with_fallback(
170170
171171
Never downgrades to DELETE if the on-disk DB header reports WAL — see _on_disk_journal_mode.
172172
"""
173+
# Read-only probe — no flock, no checkpoint, no WAL/SHM unlink.
174+
# Skipping the set-pragma prevents WAL-init from unlinking files other connections hold open.
175+
try:
176+
current_mode = conn.execute("PRAGMA journal_mode").fetchone()
177+
if current_mode and current_mode[0] == "wal":
178+
return "wal"
179+
except sqlite3.OperationalError:
180+
pass
181+
173182
try:
174183
conn.execute("PRAGMA journal_mode=WAL")
175184
return "wal"

tests/test_hermes_state.py

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3021,3 +3021,223 @@ def test_v10_to_v11_upgrade_backfills_tool_fields(self, tmp_path):
30213021
finally:
30223022
session_db.close()
30233023

3024+
3025+
# ---------------------------------------------------------------------------
3026+
# apply_wal_with_fallback — read-only probe tests
3027+
# ---------------------------------------------------------------------------
3028+
3029+
3030+
class TestApplyWalProbe:
3031+
"""Unit tests for the journal_mode probe in apply_wal_with_fallback."""
3032+
3033+
def test_skips_set_pragma_when_already_wal(self, tmp_path):
3034+
"""Already-WAL connection must not trigger the set-pragma."""
3035+
import sqlite3
3036+
from hermes_state import apply_wal_with_fallback
3037+
3038+
class _TracingConn(sqlite3.Connection):
3039+
def __init__(self, *a, **kw):
3040+
super().__init__(*a, **kw)
3041+
self.executed = []
3042+
3043+
def execute(self, sql, params=()):
3044+
self.executed.append(sql)
3045+
return super().execute(sql, params)
3046+
3047+
db_path = tmp_path / "wal.db"
3048+
# Prime the file into WAL mode first.
3049+
with sqlite3.connect(str(db_path)) as seed:
3050+
seed.execute("PRAGMA journal_mode=WAL")
3051+
3052+
conn = _TracingConn(str(db_path))
3053+
try:
3054+
result = apply_wal_with_fallback(conn)
3055+
finally:
3056+
conn.close()
3057+
3058+
assert result == "wal"
3059+
# Only the probe should have fired; the set-pragma must NOT appear.
3060+
assert any("PRAGMA journal_mode" == sql.strip() for sql in conn.executed), (
3061+
"probe PRAGMA should have run"
3062+
)
3063+
assert not any("journal_mode=WAL" in sql for sql in conn.executed), (
3064+
"set-pragma must not run when already in WAL mode"
3065+
)
3066+
3067+
def test_sets_wal_on_fresh_connection(self, tmp_path):
3068+
"""Probe sees 'delete', then set-pragma runs and returns 'wal'."""
3069+
import sqlite3
3070+
from hermes_state import apply_wal_with_fallback
3071+
3072+
class _TracingConn(sqlite3.Connection):
3073+
def __init__(self, *a, **kw):
3074+
super().__init__(*a, **kw)
3075+
self.executed = []
3076+
3077+
def execute(self, sql, params=()):
3078+
self.executed.append(sql)
3079+
return super().execute(sql, params)
3080+
3081+
db_path = tmp_path / "fresh.db"
3082+
conn = _TracingConn(str(db_path))
3083+
try:
3084+
result = apply_wal_with_fallback(conn)
3085+
finally:
3086+
conn.close()
3087+
3088+
assert result == "wal"
3089+
assert any("journal_mode=WAL" in sql for sql in conn.executed), (
3090+
"set-pragma must fire on a fresh (non-WAL) connection"
3091+
)
3092+
3093+
def test_apply_wal_concurrent_connects_no_eio(self, tmp_path):
3094+
"""20 threads calling connect() on the same DB must not see disk I/O error."""
3095+
import sys
3096+
import threading
3097+
import sqlite3
3098+
from hermes_state import apply_wal_with_fallback
3099+
3100+
db_path = tmp_path / "concurrent.db"
3101+
errors = []
3102+
3103+
def _connect_cycle():
3104+
for _ in range(5):
3105+
try:
3106+
conn = sqlite3.connect(str(db_path))
3107+
apply_wal_with_fallback(conn)
3108+
conn.close()
3109+
except sqlite3.OperationalError as exc:
3110+
if "disk i/o error" in str(exc).lower():
3111+
errors.append(exc)
3112+
3113+
threads = [threading.Thread(target=_connect_cycle) for _ in range(20)]
3114+
for t in threads:
3115+
t.start()
3116+
for t in threads:
3117+
t.join()
3118+
3119+
assert not errors, f"disk I/O errors from concurrent connects: {errors}"
3120+
3121+
# Linux-only: no (deleted) WAL/SHM FDs should accumulate.
3122+
if sys.platform == "linux":
3123+
import os
3124+
3125+
fd_dir = f"/proc/{os.getpid()}/fd"
3126+
deleted_fds = []
3127+
for fd_name in os.listdir(fd_dir):
3128+
try:
3129+
target = os.readlink(os.path.join(fd_dir, fd_name))
3130+
if "(deleted)" in target and (
3131+
"wal" in target.lower() or "shm" in target.lower()
3132+
):
3133+
deleted_fds.append(target)
3134+
except OSError:
3135+
pass
3136+
assert not deleted_fds, f"stale deleted WAL/SHM FDs: {deleted_fds}"
3137+
3138+
def test_fallback_to_delete_still_works(self, tmp_path):
3139+
"""When set-pragma raises a WAL-incompat error, falls back to DELETE."""
3140+
import sqlite3
3141+
from hermes_state import apply_wal_with_fallback
3142+
3143+
class _IncompatConn(sqlite3.Connection):
3144+
def __init__(self, *a, **kw):
3145+
super().__init__(*a, **kw)
3146+
self._call_count = 0
3147+
3148+
def execute(self, sql, params=()):
3149+
self._call_count += 1
3150+
# First call is the read probe; let it return "delete".
3151+
# Second call is the set-pragma; raise a WAL-incompat error.
3152+
if "journal_mode=WAL" in sql:
3153+
raise sqlite3.OperationalError("locking protocol")
3154+
return super().execute(sql, params)
3155+
3156+
db_path = tmp_path / "incompat.db"
3157+
conn = _IncompatConn(str(db_path))
3158+
try:
3159+
result = apply_wal_with_fallback(conn, db_label="test.db")
3160+
finally:
3161+
conn.close()
3162+
3163+
assert result == "delete"
3164+
3165+
def test_probe_failure_falls_through_to_set_pragma(self, tmp_path):
3166+
"""When the read probe raises OperationalError, fall through to set-pragma."""
3167+
import sqlite3
3168+
from hermes_state import apply_wal_with_fallback
3169+
3170+
class _ProbeFails(sqlite3.Connection):
3171+
def __init__(self, *a, **kw):
3172+
super().__init__(*a, **kw)
3173+
self._first = True
3174+
3175+
def execute(self, sql, params=()):
3176+
if self._first and "journal_mode" in sql and "WAL" not in sql:
3177+
self._first = False
3178+
raise sqlite3.OperationalError("simulated probe failure")
3179+
return super().execute(sql, params)
3180+
3181+
db_path = tmp_path / "probe_fail.db"
3182+
conn = _ProbeFails(str(db_path))
3183+
try:
3184+
result = apply_wal_with_fallback(conn)
3185+
finally:
3186+
conn.close()
3187+
3188+
# Despite probe failure, set-pragma must still run and succeed.
3189+
assert result == "wal"
3190+
3191+
def test_no_downgrade_from_wal_to_delete_on_eio(self, tmp_path):
3192+
"""OperationalError NOT in _WAL_INCOMPAT_MARKERS must propagate, not downgrade."""
3193+
import sqlite3
3194+
import pytest
3195+
from hermes_state import apply_wal_with_fallback
3196+
3197+
class _EIOConn(sqlite3.Connection):
3198+
def __init__(self, *a, **kw):
3199+
super().__init__(*a, **kw)
3200+
self._first = True
3201+
3202+
def execute(self, sql, params=()):
3203+
# Let the probe succeed (returns "delete" for fresh DB).
3204+
if "journal_mode=WAL" in sql:
3205+
raise sqlite3.OperationalError("some unexpected hardware failure")
3206+
return super().execute(sql, params)
3207+
3208+
db_path = tmp_path / "eio.db"
3209+
conn = _EIOConn(str(db_path))
3210+
try:
3211+
with pytest.raises(
3212+
sqlite3.OperationalError, match="some unexpected hardware failure"
3213+
):
3214+
apply_wal_with_fallback(conn)
3215+
finally:
3216+
conn.close()
3217+
3218+
def test_returns_wal_not_delete_from_probe(self, tmp_path):
3219+
"""Early-return only on 'wal'; 'delete' or 'memory' must fall through to set-pragma."""
3220+
import sqlite3
3221+
from hermes_state import apply_wal_with_fallback
3222+
3223+
class _TracingConn(sqlite3.Connection):
3224+
def __init__(self, *a, **kw):
3225+
super().__init__(*a, **kw)
3226+
self.executed = []
3227+
3228+
def execute(self, sql, params=()):
3229+
self.executed.append(sql)
3230+
return super().execute(sql, params)
3231+
3232+
# Fresh DB is in "delete" mode — probe returns "delete", must NOT early-return.
3233+
db_path = tmp_path / "delete_mode.db"
3234+
conn = _TracingConn(str(db_path))
3235+
try:
3236+
result = apply_wal_with_fallback(conn)
3237+
finally:
3238+
conn.close()
3239+
3240+
assert result == "wal"
3241+
assert any("journal_mode=WAL" in sql for sql in conn.executed), (
3242+
"set-pragma must fire when probe returns 'delete'"
3243+
)

tests/test_hermes_state_wal_fallback.py

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -131,15 +131,17 @@ def test_reraises_on_disk_io_error(self, tmp_path):
131131
conn.close()
132132

133133
def test_does_not_downgrade_when_disk_says_wal(self, tmp_path):
134-
"""Belt-and-suspenders: even if a marker matches, refuse to
135-
downgrade when the on-disk DB header is already WAL.
136-
137-
Prevents a future addition to ``_WAL_INCOMPAT_MARKERS`` from
138-
accidentally reintroducing the mixed-journal-mode corruption
139-
pattern. We construct a DB already in WAL on disk, then open a
140-
new connection whose ``PRAGMA journal_mode=WAL`` raises one of
141-
the legit markers — the function must still re-raise (refusing
142-
the downgrade) because the on-disk file is WAL.
134+
"""Refuse to downgrade an already-WAL DB even if the set-pragma path
135+
would have raised a downgrade-eligible marker.
136+
137+
With the WAL-skip patch, the read-only probe short-circuits before
138+
``PRAGMA journal_mode=WAL`` ever runs on an already-WAL connection,
139+
so the set-pragma path is unreachable here and ``attempts`` stays 0.
140+
Either outcome (skip-via-probe OR re-raise-on-disk-check) preserves
141+
the property this test guards: we never silently DELETE-downgrade
142+
a WAL-mode file. The on-disk guard remains in place as
143+
belt-and-suspenders for any future code path that bypasses the
144+
probe.
143145
"""
144146
# Prime the file in WAL mode using a normal connection
145147
primer = sqlite3.connect(
@@ -155,16 +157,21 @@ def test_does_not_downgrade_when_disk_says_wal(self, tmp_path):
155157
finally:
156158
primer.close()
157159

158-
# New connection whose WAL pragma raises "locking protocol" — a
159-
# marker that WOULD normally trigger downgrade. With the on-disk
160-
# guard, we must instead re-raise.
161-
conn, _ = _open_blocking(
160+
# New connection whose set-WAL pragma would raise "locking protocol"
161+
# if it were ever called. With the WAL-skip patch the probe sees
162+
# journal_mode=wal and returns early, so set-WAL is never attempted.
163+
conn, attempts = _open_blocking(
162164
tmp_path / "already-wal.db",
163165
reason="locking protocol",
164166
isolation_level=None,
165167
)
166-
with pytest.raises(sqlite3.OperationalError, match="locking protocol"):
167-
apply_wal_with_fallback(conn)
168+
result = apply_wal_with_fallback(conn)
169+
assert result == "wal", (
170+
"must report wal mode (either skipped via probe or refused downgrade)"
171+
)
172+
assert attempts[0] == 0, (
173+
"set-WAL pragma must not run when the on-disk header already says wal"
174+
)
168175
conn.close()
169176

170177
# And the file is STILL WAL on disk — nothing got rewritten

0 commit comments

Comments
 (0)