Skip to content

Commit e9a59de

Browse files
jpheinclaude
andcommitted
refactor(cli): collection.delete(where=) instead of nuke-and-rebuild
Addresses @igorls's review on MemPalace#1087: 1. **Premise was wrong (MemPalace#521 doesn't apply to delete-by-where).** The original implementation extracted survivors, rmtree'd the palace, and re-inserted into a fresh collection — predicated on the idea that collection.delete(where=...) would trigger the same updatePoint / repairConnectionsForUpdate race as the upsert path in MemPalace#521. It doesn't: chromadb's filter-delete bypasses the HNSW update codepath that races. The simpler approach is correct. 2. **Embedding function preserved.** No more bypassing ChromaBackend.create_collection / _resolve_embedding_function — we never recreate the collection at all. 3. **No data loss on interrupt.** No rmtree means no window where a crash leaves the palace empty. 4. **Routes through the backend.** Uses ChromaBackend.get_collection instead of `import chromadb; PersistentClient(...)` directly. No more `del col, client` reliance on refcount finalization. 5. **Reuses confirm_destructive_action.** Replaces the custom `input(...)` prompt with the migrate.py helper. End-to-end test added: real chromadb palace, real backend, real delete, verify count + surviving ids. Catches the embedding-function regression that was the load-bearing concern in the review. Closes MemPalace#848 narrowly (wing/room delete). Source-file / query / dry-run modes from MemPalace#848's broader ask are deliberately NOT in scope here — filing as follow-ups if there's appetite. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent b63f5fa commit e9a59de

2 files changed

Lines changed: 126 additions & 124 deletions

File tree

mempalace/cli.py

Lines changed: 54 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -287,30 +287,29 @@ def cmd_migrate(args):
287287
def cmd_purge(args):
288288
"""Delete drawers by wing and/or room.
289289
290-
Extracts the drawers to *keep*, nukes the palace directory, and
291-
re-inserts them into a fresh ChromaDB. This avoids HNSW ghost entries
292-
that ChromaDB's in-place ``collection.delete()`` leaves behind, which
293-
can cause segfaults on subsequent queries or inserts (see #521 for the
294-
underlying hnswlib ``updatePoint`` race on modified-file re-mines).
295-
296-
Note: ``--room`` without ``--wing`` purges that room across ALL wings.
297-
298-
Not idempotent — running purge twice on the same criteria will print
299-
"No drawers found" the second time. The operation is destructive; the
300-
``--yes`` flag skips the interactive confirmation.
290+
Uses ``collection.delete(where=...)`` — chromadb's filter-delete path
291+
doesn't go through ``updatePoint`` / ``repairConnectionsForUpdate``,
292+
which is the upsert-only race from #521 that an earlier draft of this
293+
command tried to side-step with a nuke-and-rebuild. The simpler path
294+
works without losing drawers if the process is interrupted, without
295+
re-embedding the survivors under a default model, and without
296+
bypassing the backend abstraction.
297+
298+
``--room`` without ``--wing`` purges that room across ALL wings.
299+
Not idempotent — running purge twice on the same criteria prints
300+
"No drawers found" the second time.
301301
"""
302-
import chromadb
303-
import shutil
302+
from .backends.chroma import ChromaBackend
303+
from .migrate import confirm_destructive_action, contains_palace_database
304304

305-
palace_path = os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
306-
try:
307-
client = chromadb.PersistentClient(path=palace_path)
308-
col = client.get_collection("mempalace_drawers")
309-
except Exception:
305+
palace_path = os.path.abspath(
306+
os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
307+
)
308+
309+
if not os.path.isdir(palace_path) or not contains_palace_database(palace_path):
310310
print(f"\n No palace found at {palace_path}")
311311
return
312312

313-
where = {}
314313
if args.wing and args.room:
315314
where = {"$and": [{"wing": args.wing}, {"room": args.room}]}
316315
elif args.wing:
@@ -321,80 +320,51 @@ def cmd_purge(args):
321320
print(" Error: specify --wing and/or --room")
322321
return
323322

324-
total = col.count()
323+
backend = ChromaBackend()
324+
try:
325+
col = backend.get_collection(palace_path, "mempalace_drawers")
326+
except Exception as e:
327+
print(f"\n Error reading palace: {e}")
328+
return
325329

326-
# Count matching drawers
327-
match_ids = set()
328-
offset = 0
329-
while True:
330-
batch = col.get(limit=10000, offset=offset, where=where, include=[])
331-
if not batch["ids"]:
332-
break
333-
match_ids.update(batch["ids"])
334-
offset += len(batch["ids"])
330+
# Probe match count via a `where=`-filtered get with no payload.
331+
# ChromaCollection.get returns the typed result; we only need ids.
332+
try:
333+
matched = col.get(where=where, include=[])
334+
except Exception as e:
335+
print(f"\n Error querying drawers: {e}")
336+
return
337+
338+
match_ids = matched.get("ids") if isinstance(matched, dict) else getattr(matched, "ids", [])
339+
match_ids = match_ids or []
340+
match_count = len(match_ids)
335341

336-
if not match_ids:
337-
label = f"wing={args.wing}" if args.wing else ""
338-
if args.room:
339-
label = f"{label} room={args.room}" if label else f"room={args.room}"
342+
label_parts = []
343+
if args.wing:
344+
label_parts.append(f"wing={args.wing}")
345+
if args.room:
346+
label_parts.append(f"room={args.room}")
347+
label = " ".join(label_parts)
348+
349+
if match_count == 0:
340350
print(f"\n No drawers found matching {label}\n")
341351
return
342352

343-
label = f"wing={args.wing}" if args.wing else ""
344-
if args.room:
345-
label = f"{label} room={args.room}" if label else f"room={args.room}"
346-
keep_count = total - len(match_ids)
347-
print(f"\n Found {len(match_ids):,} drawers matching {label}")
348-
print(f" Will keep {keep_count:,} drawers, rebuild index")
353+
print(f"\n Found {match_count:,} drawers matching {label}")
349354

350355
if not args.yes:
351-
confirm = input(f" Purge {len(match_ids):,} drawers? [y/N] ").strip().lower()
352-
if confirm not in ("y", "yes"):
353-
print(" Aborted.")
356+
if not confirm_destructive_action(f"Purge of {match_count:,} drawers", palace_path):
354357
return
355358

356-
# Extract drawers to keep (everything NOT matching the filter)
357-
print(" Extracting drawers to keep...")
358-
keep_ids, keep_docs, keep_metas = [], [], []
359-
offset = 0
360-
batch_size = 5000
361-
while offset < total:
362-
batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
363-
if not batch["ids"]:
364-
break
365-
for i, doc_id in enumerate(batch["ids"]):
366-
if doc_id not in match_ids:
367-
keep_ids.append(doc_id)
368-
keep_docs.append(batch["documents"][i])
369-
keep_metas.append(batch["metadatas"][i])
370-
offset += len(batch["ids"])
371-
print(f" Extracted {len(keep_ids):,} drawers to keep")
372-
373-
# Release client before nuking — ChromaDB holds open file handles
374-
# (WAL journal, HNSW mmap) that block rmtree on Windows and some Linux FS.
375-
del col, client
376-
377-
# Nuke and rebuild with clean HNSW index
378-
palace_path = palace_path.rstrip(os.sep)
379-
print(" Rebuilding palace...")
380-
shutil.rmtree(palace_path)
381-
os.makedirs(palace_path, mode=0o700)
382-
383-
new_client = chromadb.PersistentClient(path=palace_path)
384-
new_col = new_client.create_collection("mempalace_drawers", metadata={"hnsw:space": "cosine"})
385-
386-
filed = 0
387-
for i in range(0, len(keep_ids), batch_size):
388-
end = min(i + batch_size, len(keep_ids))
389-
new_col.add(
390-
documents=keep_docs[i:end],
391-
ids=keep_ids[i:end],
392-
metadatas=keep_metas[i:end],
393-
)
394-
filed += end - i
395-
print(f" Re-filed {filed:,} / {len(keep_ids):,}...", flush=True)
359+
print(" Deleting matching drawers...")
360+
try:
361+
col.delete(where=where)
362+
except Exception as e:
363+
print(f"\n Delete failed: {e}\n")
364+
return
396365

397-
print(f"\n Purged {len(match_ids):,} drawers. Remaining: {new_col.count():,}\n")
366+
remaining = col.count()
367+
print(f"\n Purged {match_count:,} drawers. Remaining: {remaining:,}\n")
398368

399369

400370
def cmd_status(args):
@@ -888,7 +858,7 @@ def main():
888858
# purge
889859
p_purge = sub.add_parser(
890860
"purge",
891-
help="Delete drawers by wing and/or room (nuke-and-reinsert to avoid HNSW ghost entries)",
861+
help="Delete drawers by wing and/or room (filtered delete via chromadb)",
892862
)
893863
p_purge.add_argument("--wing", help="Wing to purge")
894864
p_purge.add_argument("--room", help="Room to purge (across all wings if --wing omitted)")

tests/test_cli.py

Lines changed: 72 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -59,74 +59,106 @@ def _make_purge_args(**overrides):
5959

6060

6161
@patch("mempalace.cli.MempalaceConfig")
62-
def test_cmd_purge_no_palace_found(mock_config_cls, capsys):
62+
def test_cmd_purge_no_palace_found(mock_config_cls, capsys, tmp_path):
6363
"""Purge prints a clear message when the palace doesn't exist."""
64-
mock_config_cls.return_value.palace_path = "/nonexistent/palace"
65-
args = _make_purge_args(wing="any")
66-
mock_chromadb = MagicMock()
67-
mock_chromadb.PersistentClient.side_effect = Exception("no palace")
68-
with patch.dict("sys.modules", {"chromadb": mock_chromadb, "shutil": MagicMock()}):
69-
cmd_purge(args)
64+
missing = tmp_path / "nonexistent"
65+
mock_config_cls.return_value.palace_path = str(missing)
66+
args = _make_purge_args(wing="any", palace=str(missing))
67+
cmd_purge(args)
7068
out = capsys.readouterr().out
7169
assert "No palace found" in out
7270

7371

7472
@patch("mempalace.cli.MempalaceConfig")
75-
def test_cmd_purge_requires_filter(mock_config_cls, capsys):
73+
def test_cmd_purge_requires_filter(mock_config_cls, capsys, tmp_path):
7674
"""Purge refuses to run without --wing or --room (no mass-delete safety valve)."""
77-
mock_config_cls.return_value.palace_path = "/fake/palace"
78-
args = _make_purge_args() # no wing, no room
79-
mock_col = MagicMock()
80-
mock_client = MagicMock()
81-
mock_client.get_collection.return_value = mock_col
82-
mock_chromadb = MagicMock()
83-
mock_chromadb.PersistentClient.return_value = mock_client
84-
with patch.dict("sys.modules", {"chromadb": mock_chromadb, "shutil": MagicMock()}):
85-
cmd_purge(args)
75+
palace = tmp_path / "palace"
76+
palace.mkdir()
77+
(palace / "chroma.sqlite3").write_text("")
78+
mock_config_cls.return_value.palace_path = str(palace)
79+
args = _make_purge_args(palace=str(palace)) # no wing, no room
80+
cmd_purge(args)
8681
out = capsys.readouterr().out
8782
assert "Error: specify --wing and/or --room" in out
88-
mock_col.count.assert_not_called() # didn't touch data
8983

9084

9185
@patch("mempalace.cli.MempalaceConfig")
92-
def test_cmd_purge_no_matches(mock_config_cls, capsys):
93-
"""When the filter matches zero drawers, purge exits cleanly without rebuilding."""
94-
mock_config_cls.return_value.palace_path = "/fake/palace"
95-
args = _make_purge_args(wing="empty-wing")
86+
def test_cmd_purge_no_matches(mock_config_cls, capsys, tmp_path):
87+
"""When the filter matches zero drawers, purge exits cleanly."""
88+
palace = tmp_path / "palace"
89+
palace.mkdir()
90+
(palace / "chroma.sqlite3").write_text("")
91+
mock_config_cls.return_value.palace_path = str(palace)
92+
args = _make_purge_args(wing="empty-wing", palace=str(palace))
93+
9694
mock_col = MagicMock()
97-
mock_col.count.return_value = 100
9895
mock_col.get.return_value = {"ids": []}
99-
mock_client = MagicMock()
100-
mock_client.get_collection.return_value = mock_col
101-
mock_chromadb = MagicMock()
102-
mock_chromadb.PersistentClient.return_value = mock_client
103-
mock_shutil = MagicMock()
104-
with patch.dict("sys.modules", {"chromadb": mock_chromadb, "shutil": mock_shutil}):
96+
mock_backend = MagicMock()
97+
mock_backend.return_value.get_collection.return_value = mock_col
98+
with patch("mempalace.backends.chroma.ChromaBackend", mock_backend):
10599
cmd_purge(args)
106100
out = capsys.readouterr().out
107101
assert "No drawers found matching" in out
108-
mock_shutil.rmtree.assert_not_called() # nothing to rebuild
102+
mock_col.delete.assert_not_called()
109103

110104

111105
@patch("mempalace.cli.MempalaceConfig")
112-
def test_cmd_purge_wing_and_room_uses_and_filter(mock_config_cls):
106+
def test_cmd_purge_wing_and_room_uses_and_filter(mock_config_cls, tmp_path):
113107
"""Purge builds a $and filter when both --wing and --room are set."""
114-
mock_config_cls.return_value.palace_path = "/fake/palace"
115-
args = _make_purge_args(wing="myproj", room="drafts")
108+
palace = tmp_path / "palace"
109+
palace.mkdir()
110+
(palace / "chroma.sqlite3").write_text("")
111+
mock_config_cls.return_value.palace_path = str(palace)
112+
args = _make_purge_args(wing="myproj", room="drafts", palace=str(palace))
113+
116114
mock_col = MagicMock()
117-
mock_col.count.return_value = 0
118115
mock_col.get.return_value = {"ids": []}
119-
mock_client = MagicMock()
120-
mock_client.get_collection.return_value = mock_col
121-
mock_chromadb = MagicMock()
122-
mock_chromadb.PersistentClient.return_value = mock_client
123-
with patch.dict("sys.modules", {"chromadb": mock_chromadb, "shutil": MagicMock()}):
116+
mock_backend = MagicMock()
117+
mock_backend.return_value.get_collection.return_value = mock_col
118+
with patch("mempalace.backends.chroma.ChromaBackend", mock_backend):
124119
cmd_purge(args)
125-
# First col.get call builds the match-id set, and its `where` should be $and
126120
first_call = mock_col.get.call_args_list[0]
127121
assert first_call.kwargs["where"] == {"$and": [{"wing": "myproj"}, {"room": "drafts"}]}
128122

129123

124+
def test_cmd_purge_deletes_via_where_clause(tmp_path):
125+
"""End-to-end: purge filters via collection.delete(where=...) — preserves
126+
embedding function, no rmtree, no rebuild. Regression for igorls' #1087
127+
review concerns 1, 2, 3."""
128+
palace = tmp_path / "palace"
129+
palace.mkdir()
130+
# Real chromadb palace via the backend so the embedding function is
131+
# whatever the backend resolves (the actual concern from the review).
132+
from mempalace.backends.chroma import ChromaBackend
133+
134+
backend = ChromaBackend()
135+
col = backend.get_collection(str(palace), "mempalace_drawers", create=True)
136+
col.add(
137+
ids=["k1", "k2", "p1", "p2"],
138+
documents=["keep one", "keep two", "purge one", "purge two"],
139+
metadatas=[
140+
{"wing": "keep", "room": "r"},
141+
{"wing": "keep", "room": "r"},
142+
{"wing": "purge-me", "room": "r"},
143+
{"wing": "purge-me", "room": "r"},
144+
],
145+
)
146+
assert col.count() == 4
147+
148+
args = _make_purge_args(wing="purge-me", palace=str(palace))
149+
with patch("mempalace.cli.MempalaceConfig") as mock_config_cls:
150+
mock_config_cls.return_value.palace_path = str(palace)
151+
cmd_purge(args)
152+
153+
# Re-open through the backend to confirm survivors and that the index
154+
# still works (no nuke, embedding function intact).
155+
col2 = backend.get_collection(str(palace), "mempalace_drawers", create=False)
156+
assert col2.count() == 2
157+
surviving = col2.get(include=["metadatas"])
158+
surviving_ids = surviving.get("ids") if isinstance(surviving, dict) else surviving.ids
159+
assert set(surviving_ids) == {"k1", "k2"}
160+
161+
130162
# ── cmd_search ─────────────────────────────────────────────────────────
131163

132164

0 commit comments

Comments
 (0)