Skip to content

Commit aa84fa4

Browse files
jpheinclaude
andcommitted
feat(repair): preserve embeddings on rebuild — cherry-pick of upstream MemPalace#1367 (embeddings portion only)
Cherry-picks the `_extract_drawers` embeddings-preserve fix from upstream PR MemPalace#1367 (@zhapostolski), per the cherry-pick evaluation in #31 and @messelink's review guidance: take the embeddings fix, SKIP the divergence-floor cap (which would re-create _refresh_vector_disabled_flag false-positives on our sync_threshold= 50000 palace per MemPalace#1287's design). What the embeddings-preserve fix does: - _extract_drawers now includes "embeddings" in the col.get() call - Returns a 4-tuple: ids, docs, metas, embeddings (or None when any embedding slot is missing — HNSW unreadable case) - _rebuild_collection_via_temp accepts an all_embeddings= kwarg and passes embeddings through to upsert() when available - rebuild_index threads embeddings from _extract_drawers into _rebuild_collection_via_temp When embeddings ARE available (the common case), rebuild skips ONNX recomputation entirely. On a 150K+ palace this cuts rebuild from hours to minutes. Fallback to recompute when any embedding is None (the bug condition we're trying to repair). Updates two call sites in mempalace/cli.py (the cmd_repair path) and mempalace/repair.py (the rebuild_index path) for the new 4-tuple signature. Adds 3 tests: - test_extract_drawers_preserves_embeddings_when_all_present - test_extract_drawers_falls_back_to_none_when_any_embedding_missing - test_extract_drawers_embeddings_absent_in_batch_signals_recompute Plus updates the existing 5 _extract_drawers tests to unpack the new 4-tuple. All 75 tests pass. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 18606af commit aa84fa4

3 files changed

Lines changed: 127 additions & 13 deletions

File tree

mempalace/cli.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1378,8 +1378,13 @@ def cmd_repair(args):
13781378
# Extract all drawers in batches
13791379
print("\n Extracting drawers...")
13801380
batch_size = 5000
1381-
all_ids, all_docs, all_metas = _extract_drawers(col, total, batch_size)
1382-
print(f" Extracted {len(all_ids)} drawers")
1381+
all_ids, all_docs, all_metas, all_embeddings = _extract_drawers(col, total, batch_size)
1382+
emb_status = (
1383+
"with stored embeddings"
1384+
if all_embeddings is not None
1385+
else "without embeddings (will recompute)"
1386+
)
1387+
print(f" Extracted {len(all_ids)} drawers ({emb_status})")
13831388

13841389
# ── #1208 guard ──────────────────────────────────────────────────
13851390
# Cross-check against the SQLite ground truth before doing anything
@@ -1422,6 +1427,7 @@ def cmd_repair(args):
14221427
batch_size,
14231428
collection_name=collection_name,
14241429
progress=print,
1430+
all_embeddings=all_embeddings,
14251431
)
14261432
except RebuildCollectionError as e:
14271433
print(f" Repair failed: {e}")

mempalace/repair.py

Lines changed: 58 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -130,12 +130,32 @@ def _paginate_ids(col, where=None):
130130

131131

132132
def _extract_drawers(col, total: int, batch_size: int):
133+
"""Extract drawers from the source collection for a rebuild.
134+
135+
Returns ``(ids, documents, metadatas, embeddings)`` — embeddings is a
136+
list aligned with ids when every slot has a vector, or ``None`` when
137+
any slot is missing (HNSW segment unreadable; caller must let
138+
ChromaDB recompute during upsert).
139+
140+
Embeddings-preserve fix cherry-picked from MemPalace/mempalace#1367
141+
(Zharko Apostolski) — the embeddings portion only, NOT the
142+
divergence-floor cap (per @messelink's review on jphein/mempalace#31:
143+
the divergence cap would false-positive on our sync_threshold=50000
144+
palace). Without this, _extract_drawers omitted embeddings entirely
145+
and rebuild had to recompute every vector through ONNX, multi-hour
146+
cost on a 150K+ palace.
147+
"""
133148
all_ids = []
134149
all_docs = []
135150
all_metas = []
151+
all_embeddings = []
136152
offset = 0
137153
while offset < total:
138-
batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
154+
batch = col.get(
155+
limit=batch_size,
156+
offset=offset,
157+
include=["documents", "metadatas", "embeddings"],
158+
)
139159
if not batch["ids"]:
140160
break
141161
all_ids.extend(batch["ids"])
@@ -151,8 +171,22 @@ def _extract_drawers(col, total: int, batch_size: int):
151171
for m in batch["metadatas"]
152172
]
153173
all_metas.extend(sanitized_metas)
174+
# embeddings may be None when the HNSW segment is unreadable (the
175+
# exact scenario we are trying to repair). Fall back to None so the
176+
# caller can pass embeddings=None and let ChromaDB recompute them.
177+
batch_embeddings = batch.get("embeddings")
178+
if batch_embeddings is not None and any(e is None for e in batch_embeddings):
179+
batch_embeddings = None
180+
all_embeddings.extend(
181+
batch_embeddings
182+
if batch_embeddings is not None
183+
else [None] * len(batch["ids"])
184+
)
154185
offset += len(batch["ids"])
155-
return all_ids, all_docs, all_metas
186+
# If any embedding slot is None the whole list is unusable for direct
187+
# upsert — signal that to the caller with None so ChromaDB recomputes.
188+
has_all = all_embeddings and all(e is not None for e in all_embeddings)
189+
return all_ids, all_docs, all_metas, all_embeddings if has_all else None
156190

157191

158192
def _verify_collection_count(col, expected: int, label: str) -> None:
@@ -194,12 +228,18 @@ def _rebuild_collection_via_temp(
194228
batch_size: int,
195229
collection_name: Optional[str] = None,
196230
progress=print,
231+
all_embeddings=None,
197232
) -> int:
198233
expected = len(all_ids)
199234
collection_name = collection_name or _drawers_collection_name()
200235
temp_name = f"{collection_name}__repair_tmp"
201236
live_replaced = False
202237

238+
if all_embeddings is not None:
239+
progress(" Re-using stored embeddings (no recomputation needed).")
240+
else:
241+
progress(" Stored embeddings unavailable — will recompute during rebuild.")
242+
203243
try:
204244
_delete_collection_if_exists(backend, palace_path, temp_name)
205245

@@ -210,7 +250,10 @@ def _rebuild_collection_via_temp(
210250
batch_ids = all_ids[i : i + batch_size]
211251
batch_docs = all_docs[i : i + batch_size]
212252
batch_metas = all_metas[i : i + batch_size]
213-
temp_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
253+
kwargs = {"documents": batch_docs, "ids": batch_ids, "metadatas": batch_metas}
254+
if all_embeddings is not None:
255+
kwargs["embeddings"] = all_embeddings[i : i + batch_size]
256+
temp_col.upsert(**kwargs)
214257
staged += len(batch_ids)
215258
progress(f" Staged {staged}/{expected} drawers...")
216259
_verify_collection_count(temp_col, expected, "temporary rebuild")
@@ -225,7 +268,10 @@ def _rebuild_collection_via_temp(
225268
batch_ids = all_ids[i : i + batch_size]
226269
batch_docs = all_docs[i : i + batch_size]
227270
batch_metas = all_metas[i : i + batch_size]
228-
new_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
271+
kwargs = {"documents": batch_docs, "ids": batch_ids, "metadatas": batch_metas}
272+
if all_embeddings is not None:
273+
kwargs["embeddings"] = all_embeddings[i : i + batch_size]
274+
new_col.upsert(**kwargs)
229275
rebuilt += len(batch_ids)
230276
progress(f" Re-filed {rebuilt}/{expected} drawers...")
231277
_verify_collection_count(new_col, expected, "rebuilt live collection")
@@ -755,8 +801,13 @@ def rebuild_index(
755801
# Extract all drawers in batches
756802
progress("\n Extracting drawers...")
757803
batch_size = 5000
758-
all_ids, all_docs, all_metas = _extract_drawers(col, total, batch_size)
759-
progress(f" Extracted {len(all_ids)} drawers")
804+
all_ids, all_docs, all_metas, all_embeddings = _extract_drawers(col, total, batch_size)
805+
emb_status = (
806+
"with stored embeddings"
807+
if all_embeddings is not None
808+
else "without embeddings (will recompute)"
809+
)
810+
progress(f" Extracted {len(all_ids)} drawers ({emb_status})")
760811

761812
# ── #1208 guard ──────────────────────────────────────────────────
762813
# Refuse to ``delete_collection`` + rebuild when extraction looks
@@ -793,6 +844,7 @@ def rebuild_index(
793844
batch_size,
794845
collection_name=collection_name,
795846
progress=progress,
847+
all_embeddings=all_embeddings,
796848
)
797849
except RebuildCollectionError as e:
798850
progress(f"\n ERROR during rebuild: {e}")

tests/test_repair.py

Lines changed: 61 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ def test_extract_drawers_preserves_valid_metadata():
8787
"documents": ["doc1", "doc2"],
8888
"metadatas": [{"wing": "a", "room": "1"}, {"wing": "b", "room": "2"}],
8989
}
90-
all_ids, all_docs, all_metas = repair._extract_drawers(col, total=2, batch_size=2)
90+
all_ids, all_docs, all_metas, _ = repair._extract_drawers(col, total=2, batch_size=2)
9191
assert all_ids == ["id1", "id2"]
9292
assert all_docs == ["doc1", "doc2"]
9393
assert all_metas == [{"wing": "a", "room": "1"}, {"wing": "b", "room": "2"}]
@@ -106,7 +106,7 @@ def test_extract_drawers_sanitizes_none_metadata():
106106
"documents": ["doc1", "doc2", "doc3"],
107107
"metadatas": [{"wing": "a"}, None, {"wing": "c"}],
108108
}
109-
_, _, all_metas = repair._extract_drawers(col, total=3, batch_size=3)
109+
_, _, all_metas, _ = repair._extract_drawers(col, total=3, batch_size=3)
110110
assert all_metas[0] == {"wing": "a"}
111111
assert all_metas[1] == {"_repaired_empty_meta": True}
112112
assert all_metas[2] == {"wing": "c"}
@@ -124,7 +124,7 @@ def test_extract_drawers_sanitizes_empty_dict_metadata():
124124
"documents": ["doc1", "doc2"],
125125
"metadatas": [{}, {"wing": "b"}],
126126
}
127-
_, _, all_metas = repair._extract_drawers(col, total=2, batch_size=2)
127+
_, _, all_metas, _ = repair._extract_drawers(col, total=2, batch_size=2)
128128
assert all_metas[0] == {"_repaired_empty_meta": True}
129129
assert all_metas[1] == {"wing": "b"}
130130

@@ -142,7 +142,7 @@ def test_extract_drawers_sanitization_preserves_alignment():
142142
"documents": ["d1", "d2", "d3", "d4"],
143143
"metadatas": [None, {"k": "v"}, {}, None],
144144
}
145-
all_ids, all_docs, all_metas = repair._extract_drawers(col, total=4, batch_size=4)
145+
all_ids, all_docs, all_metas, _ = repair._extract_drawers(col, total=4, batch_size=4)
146146
assert len(all_ids) == len(all_docs) == len(all_metas) == 4
147147
assert all_ids == ["id1", "id2", "id3", "id4"]
148148
assert all_metas[0] == {"_repaired_empty_meta": True}
@@ -159,11 +159,67 @@ def test_extract_drawers_multiple_batches():
159159
{"ids": ["id3"], "documents": ["d3"], "metadatas": [{}]},
160160
{"ids": [], "documents": [], "metadatas": []},
161161
]
162-
all_ids, all_docs, all_metas = repair._extract_drawers(col, total=3, batch_size=2)
162+
all_ids, all_docs, all_metas, _ = repair._extract_drawers(col, total=3, batch_size=2)
163163
assert all_ids == ["id1", "id2", "id3"]
164164
assert all_metas == [{"a": 1}, {"_repaired_empty_meta": True}, {"_repaired_empty_meta": True}]
165165

166166

167+
# ── embeddings preservation (upstream #1367 cherry-pick) ────────────
168+
169+
170+
def test_extract_drawers_preserves_embeddings_when_all_present():
171+
"""When every drawer has an embedding, return them aligned with ids.
172+
173+
The fast-path of upstream #1367: re-uses stored vectors during repair
174+
instead of forcing ChromaDB to recompute via ONNX. Saves hours on
175+
large palaces.
176+
"""
177+
col = MagicMock()
178+
col.get.return_value = {
179+
"ids": ["id1", "id2"],
180+
"documents": ["d1", "d2"],
181+
"metadatas": [{"a": 1}, {"b": 2}],
182+
"embeddings": [[0.1] * 384, [0.2] * 384],
183+
}
184+
_, _, _, all_embeddings = repair._extract_drawers(col, total=2, batch_size=2)
185+
assert all_embeddings is not None
186+
assert len(all_embeddings) == 2
187+
assert all_embeddings[0] == [0.1] * 384
188+
189+
190+
def test_extract_drawers_falls_back_to_none_when_any_embedding_missing():
191+
"""If any drawer's embedding is None (HNSW unreadable), return None.
192+
193+
Signals to the rebuild caller that ChromaDB must recompute embeddings
194+
rather than try to upsert with partial vectors. The whole-or-nothing
195+
semantics mean we never end up with a half-vectored collection.
196+
"""
197+
col = MagicMock()
198+
col.get.return_value = {
199+
"ids": ["id1", "id2", "id3"],
200+
"documents": ["d1", "d2", "d3"],
201+
"metadatas": [{}, {}, {}],
202+
"embeddings": [[0.1] * 384, None, [0.3] * 384],
203+
}
204+
_, _, _, all_embeddings = repair._extract_drawers(col, total=3, batch_size=3)
205+
assert all_embeddings is None, "any None forces caller to recompute all"
206+
207+
208+
def test_extract_drawers_embeddings_absent_in_batch_signals_recompute():
209+
"""When chromadb returns no `embeddings` key at all, the result is None."""
210+
col = MagicMock()
211+
col.get.return_value = {
212+
"ids": ["id1"],
213+
"documents": ["d1"],
214+
"metadatas": [{}],
215+
# 'embeddings' key absent entirely (chromadb may return None for
216+
# this on unreadable HNSW)
217+
"embeddings": None,
218+
}
219+
_, _, _, all_embeddings = repair._extract_drawers(col, total=1, batch_size=1)
220+
assert all_embeddings is None
221+
222+
167223
# ── scan_palace ───────────────────────────────────────────────────────
168224

169225

0 commit comments

Comments
 (0)