Skip to content

Commit 558d327

Browse files
jphein and claude committed
feat(search): recency decay weighting + mempalace prune --stale-days CLI (#158)
Recency is an opt-in, recall-preserving ranking signal: a bounded exponential-decay distance shift on drawer age (mempalace.recency), gated off by default via PALACE_RECENCY_BOOST, half-life tunable via PALACE_RECENCY_HALFLIFE_DAYS. The shift can reorder neighbours but is capped so it never displaces a relevant drawer out of the result set. `mempalace prune --stale-days N` removes drawers older than N days from an optional wing/room scope. Dry-run by default; deletion requires --confirm. Drawers with no parseable filed_at are treated as ageless and are never pruned. Fully local — no network, no external API. Upstream tracks Weibull decay in MemPalace#1032 (informational). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 8fa3ffe commit 558d327

4 files changed

Lines changed: 582 additions & 1 deletion

File tree

mempalace/cli.py

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1931,6 +1931,155 @@ def cmd_purge(args):
19311931
print(f"\n Purged {match_count:,} drawers. Remaining: {remaining:,}\n")
19321932

19331933

1934+
def cmd_prune(args):
    """Delete drawers older than ``--stale-days N`` (dry-run by default).

    Age is the span between a drawer's ``filed_at`` timestamp and now. Unlike
    ``purge``'s metadata-equality filter, the staleness predicate is a string
    timestamp that chromadb ``where=`` can't range-compare reliably, so we
    fetch candidate metadata and decide age in Python (``mempalace.recency``),
    then delete by explicit id list.

    Safety: this is the only command that destroys data on a *time* predicate
    rather than an explicit selection, so it is **dry-run by default**. Nothing
    is deleted unless ``--confirm`` is passed. A drawer with no parseable
    ``filed_at`` is treated as ageless and is **never** pruned — we never
    delete a drawer we can't date.

    Output: in ``--json`` mode exactly ONE JSON object is printed per
    invocation, so stdout can be fed straight to a JSON parser. The object
    always carries the scan summary (``stale_days``, ``scope``, ``scanned``,
    ``stale``, ``undated_skipped``, ``confirmed``, ``deleted``); a confirmed
    delete adds ``remaining``, and a failed delete adds ``error``. Errors that
    occur before the scan completes print a lone ``{"error": ...}`` object.

    Args:
        args: Parsed CLI namespace. Reads ``stale_days``, ``palace``, ``wing``
            and ``room`` directly, and ``json`` / ``confirm`` via ``getattr``
            with a ``False`` default.

    Returns:
        None. All results are reported on stdout.
    """
    from datetime import datetime, timezone

    from .backends.base import PalaceRef
    from .backends.chroma import ChromaBackend
    from .migrate import contains_palace_database
    from .recency import age_days

    want_json = getattr(args, "json", False)
    stale_days = args.stale_days
    confirm = getattr(args, "confirm", False)

    if stale_days is None or stale_days <= 0:
        msg = "--stale-days must be a positive integer"
        print(json.dumps({"error": msg}) if want_json else f" Error: {msg}")
        return

    palace_path = os.path.abspath(
        os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
    )

    if not os.path.isdir(palace_path) or not contains_palace_database(palace_path):
        if want_json:
            print(json.dumps({"error": "no palace database", "palace": palace_path}))
        else:
            _print_retired_local_palace_or_default(palace_path)
        return

    # Optional wing/room scope — without it, prune spans the whole palace.
    clauses = []
    if args.wing:
        clauses.append({"wing": args.wing})
    if args.room:
        clauses.append({"room": args.room})
    where = None
    if clauses:
        where = clauses[0] if len(clauses) == 1 else {"$and": clauses}

    backend = ChromaBackend()
    try:
        col = backend.get_collection(
            palace=PalaceRef(id=palace_path, local_path=palace_path),
            collection_name="mempalace_drawers",
        )
    except Exception as e:
        print(json.dumps({"error": str(e)}) if want_json else f"\n Error reading palace: {e}")
        return

    # Pull ids + metadata for the scope; age is decided in Python.
    try:
        got = (
            col.get(where=where, include=["metadatas"]) if where else col.get(include=["metadatas"])
        )
    except Exception as e:
        print(json.dumps({"error": str(e)}) if want_json else f"\n Error querying drawers: {e}")
        return

    # The result may be a plain dict or an object exposing the same fields as
    # attributes; accept both and normalise missing values to empty lists.
    if isinstance(got, dict):
        all_ids = got.get("ids") or []
        all_metas = got.get("metadatas") or []
    else:
        all_ids = getattr(got, "ids", []) or []
        all_metas = getattr(got, "metadatas", []) or []

    # One shared "now" so every drawer is aged against the same instant.
    now = datetime.now(timezone.utc)
    stale_ids = []
    undated = 0
    for did, meta in zip(all_ids, all_metas):
        age = age_days(meta or {}, now=now)
        if age is None:
            # Undatable drawers are counted and kept, never deleted.
            undated += 1
            continue
        if age >= stale_days:
            stale_ids.append(did)

    scope_parts = []
    if args.wing:
        scope_parts.append(f"wing={args.wing}")
    if args.room:
        scope_parts.append(f"room={args.room}")
    scope = " ".join(scope_parts) if scope_parts else "entire palace"

    # Single JSON payload for the whole run. It is printed once, at whichever
    # exit point is reached, after being amended with the outcome — emitting
    # it here AND again after the delete would put two documents on stdout.
    summary = {
        "stale_days": stale_days,
        "scope": scope,
        "scanned": len(all_ids),
        "stale": len(stale_ids),
        "undated_skipped": undated,
        "confirmed": bool(confirm),
        "deleted": 0,
    }

    if not want_json:
        print(f"\n Scanned {len(all_ids):,} drawers in {scope}")
        print(f" {len(stale_ids):,} older than {stale_days} days; {undated:,} undated (kept)")

    if not stale_ids:
        print(json.dumps(summary) if want_json else " Nothing to prune.\n")
        return

    if not confirm:
        if want_json:
            print(json.dumps(summary))
        else:
            print(
                f"\n DRY RUN — nothing deleted. Re-run with --confirm to delete "
                f"{len(stale_ids):,} drawers.\n"
            )
        return

    try:
        col.delete(ids=stale_ids)
    except Exception as e:
        if want_json:
            # "deleted" stays 0: the delete call did not complete.
            summary["error"] = str(e)
            print(json.dumps(summary))
        else:
            print(f"\n Delete failed: {e}\n")
        return

    remaining = col.count()
    if want_json:
        summary["deleted"] = len(stale_ids)
        summary["remaining"] = remaining
        print(json.dumps(summary))
    else:
        print(f"\n Pruned {len(stale_ids):,} drawers. Remaining: {remaining:,}\n")
2081+
2082+
19342083
def cmd_rename_wing(args):
19352084
want_json = getattr(args, "json", False)
19362085
from_wing = args.from_wing
@@ -3527,6 +3676,24 @@ def main():
35273676
)
35283677
p_purge.add_argument("--yes", "-y", action="store_true", help="Skip confirmation prompt")
35293678

3679+
p_prune = sub.add_parser(
3680+
"prune",
3681+
help="Delete drawers older than --stale-days N (dry-run unless --confirm)",
3682+
)
3683+
p_prune.add_argument(
3684+
"--stale-days",
3685+
type=int,
3686+
required=True,
3687+
help="Prune drawers whose filed_at is older than this many days",
3688+
)
3689+
p_prune.add_argument("--wing", help="Limit prune to this wing")
3690+
p_prune.add_argument("--room", help="Limit prune to this room")
3691+
p_prune.add_argument(
3692+
"--confirm",
3693+
action="store_true",
3694+
help="Actually delete (without this flag, prune only reports a dry-run count)",
3695+
)
3696+
35303697
p_rename_wing = sub.add_parser(
35313698
"rename-wing",
35323699
help="Rename all drawers from one wing to another (atomic on postgres)",
@@ -3699,6 +3866,7 @@ def _nonneg_int(value: str) -> int:
36993866
"migrate": cmd_migrate,
37003867
"migrate-to-postgres": cmd_migrate_to_postgres,
37013868
"purge": cmd_purge,
3869+
"prune": cmd_prune,
37023870
"rename-wing": cmd_rename_wing,
37033871
"rooms": cmd_rooms,
37043872
"status": cmd_status,

mempalace/recency.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
"""Recency weighting for search results (#158).
2+
3+
Recency is an opt-in *ranking signal*, never a gate. A drawer's age — the
4+
span between its ``filed_at`` timestamp and now — produces a small, bounded
5+
distance adjustment in the searcher: newer drawers get nudged up, older ones
6+
left where they are. The adjustment is capped so it can reorder neighbors but
7+
can never push a relevant drawer out of the result set — 100% recall is the
8+
design requirement, so recency reorders, never excludes.
9+
10+
The signal is exponential decay (half-life based), the same shape upstream
11+
tracks for Weibull decay in MemPalace/mempalace#1032: a drawer one half-life
12+
old keeps half its maximum boost, two half-lives old a quarter, and so on. A
13+
drawer with no parseable timestamp gets zero adjustment (treated as ageless,
14+
never penalized).
15+
16+
Pure functions, no I/O. The searcher owns reading ``filed_at`` from metadata
17+
and applying the score; this module owns the math.
18+
"""
19+
20+
from __future__ import annotations
21+
22+
from datetime import datetime, timezone
23+
24+
# Metadata key under which a drawer's filing timestamp is stored. The value is
# parsed as an ISO-8601 string by ``_parse_filed_at``.
FILED_AT_KEY = "filed_at"

# Maximum upward nudge for a freshly-filed drawer, in cosine-distance units.
# Sits below the weakest closet rung (0.04) and at the rating step (0.03) so a
# recency signal tilts order without overpowering semantic match or an
# explicit human rating.
RECENCY_DISTANCE_MAX = 0.03

# Age (in days) at which the boost has decayed to half its maximum. Chosen so
# a drawer stays "fresh enough to nudge" for a couple of months, then fades —
# tunable via PALACE_RECENCY_HALFLIFE_DAYS. This is the default half-life used
# by ``recency_distance_adjustment`` when the caller passes none.
RECENCY_HALFLIFE_DAYS = 30.0
36+
37+
38+
def _parse_filed_at(value) -> datetime | None:
    """Parse a ``filed_at`` metadata value into an aware UTC datetime.

    Drawers store ``datetime.now().isoformat()`` (naive *local* time), so a
    value without an offset is interpreted in the system's local timezone and
    then converted to UTC. Stamping it as UTC directly would skew the drawer's
    age by the machine's UTC offset. Externally-edited or imported rows may
    carry a ``Z`` suffix or an explicit offset; those are converted to UTC as
    well, so every non-``None`` result has a zero UTC offset.

    Args:
        value: Raw metadata value. Anything that is not a non-blank string is
            rejected.

    Returns:
        An aware ``datetime``, or ``None`` when the value is missing, blank or
        unparseable, so the caller can treat the drawer as ageless rather than
        crashing.
    """
    if not isinstance(value, str):
        return None
    text = value.strip()
    if not text:
        return None
    # fromisoformat on older Pythons (e.g. 3.9) rejects a trailing 'Z';
    # normalize to +00:00.
    if text.endswith("Z"):
        text = text[:-1] + "+00:00"
    try:
        dt = datetime.fromisoformat(text)
    except ValueError:
        return None
    try:
        # astimezone() treats a naive datetime as system-local time, which is
        # how drawers are written; aware values are simply shifted to UTC.
        return dt.astimezone(timezone.utc)
    except (OverflowError, OSError, ValueError):
        # Conversion can fail for dates at the edge of the representable range
        # or outside what the platform's local-time functions support. Fall
        # back to the un-normalized value (naive → labelled UTC) instead of
        # discarding an otherwise parseable timestamp.
        return dt if dt.tzinfo is not None else dt.replace(tzinfo=timezone.utc)
59+
60+
61+
def age_days(meta: dict | None, now: datetime | None = None) -> float | None:
    """Return how many days have passed since a drawer's ``filed_at``.

    Args:
        meta: Drawer metadata; the timestamp is read from ``FILED_AT_KEY``.
        now: Reference instant. Defaults to the current UTC time; a naive
            value is labelled as UTC.

    Returns:
        The age in (fractional) days, or ``None`` when the metadata is empty
        or its timestamp cannot be parsed. A future-dated timestamp yields
        ``0.0`` rather than a negative age, so clock skew never turns the
        recency signal into a penalty.
    """
    filed = _parse_filed_at(meta.get(FILED_AT_KEY)) if meta else None
    if filed is None:
        return None

    reference = now if now else datetime.now(timezone.utc)
    if reference.tzinfo is None:
        reference = reference.replace(tzinfo=timezone.utc)

    elapsed_days = (reference - filed).total_seconds() / 86400.0
    if elapsed_days > 0.0:
        return elapsed_days
    return 0.0
77+
78+
79+
def recency_distance_adjustment(
    meta: dict | None,
    now: datetime | None = None,
    halflife_days: float = RECENCY_HALFLIFE_DAYS,
    max_shift: float = RECENCY_DISTANCE_MAX,
) -> float:
    """Bounded cosine-distance shift for a drawer's recency.

    Exponential decay: a drawer ``halflife_days`` old keeps half the maximum
    shift, ``2*halflife_days`` a quarter, and so on. For finite positive
    settings the result is always in ``[-max_shift, 0.0]`` — a value to be
    *added* to the effective distance, so a fresh drawer (large boost) yields
    a more-negative number and moves *up*. A drawer with no parseable
    timestamp yields 0.0 (ageless).

    ``max_shift`` or ``halflife_days`` that is non-positive **or NaN**
    disables the signal (returns 0.0) so a misconfigured weight can't invert
    or poison ranking.

    Args:
        meta: Drawer metadata passed through to ``age_days``.
        now: Reference instant passed through to ``age_days``.
        halflife_days: Age at which the shift has decayed to half of
            ``max_shift``.
        max_shift: Magnitude of the shift for a drawer of age zero.

    Returns:
        The (non-positive) amount to add to the drawer's distance.
    """
    # Phrased as "not strictly positive" rather than "<= 0": every comparison
    # against NaN is False, so a NaN setting would slip past a `<= 0.0` test
    # and turn the returned adjustment into NaN.
    if not (max_shift > 0.0 and halflife_days > 0.0):
        return 0.0
    age = age_days(meta, now=now)
    if age is None:
        return 0.0
    decay = 0.5 ** (age / halflife_days)
    return -max_shift * decay

mempalace/searcher.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from .backends import CollectionNotInitializedError, PalaceNotFoundError
2121
from .palace import get_closets_collection, get_collection
2222
from .ratings import net_rating, rating_distance_adjustment
23+
from .recency import RECENCY_HALFLIFE_DAYS, recency_distance_adjustment
2324

2425
# Closet pointer line format: "topic|entities|→drawer_id_a,drawer_id_b"
2526
# Multiple lines may join with newlines inside one closet document.
@@ -298,6 +299,32 @@ def _rating_boost_enabled() -> bool:
298299
return os.environ.get("PALACE_RATING_BOOST", "1").strip() != "0"
299300

300301

302+
def _recency_boost_enabled() -> bool:
    """Report whether the recency ranking signal (#158) is switched on.

    The signal ships dark: it is active only when ``PALACE_RECENCY_BOOST`` is
    exactly ``"1"`` (surrounding whitespace ignored). An unset variable or any
    other value leaves it off. The environment is consulted on every call
    rather than cached, the same way the rating gate reads its own variable.
    """
    flag = os.environ.get("PALACE_RECENCY_BOOST", "0")
    return flag.strip() == "1"
311+
312+
313+
def _recency_halflife_days() -> float:
    """Half-life (days) for the recency decay, from the environment.

    Reads ``PALACE_RECENCY_HALFLIFE_DAYS`` on every call. Falls back to
    ``RECENCY_HALFLIFE_DAYS`` when the variable is unset, blank, unparseable,
    or parses to NaN. A non-positive value is returned as-is; it disables the
    signal in ``recency_distance_adjustment``. An infinite value is also
    returned as-is.

    Returns:
        The configured half-life in days, or the module default.
    """
    import math

    raw = os.environ.get("PALACE_RECENCY_HALFLIFE_DAYS", "").strip()
    if not raw:
        return RECENCY_HALFLIFE_DAYS
    try:
        value = float(raw)
    except ValueError:
        return RECENCY_HALFLIFE_DAYS
    # float() happily parses "nan". NaN is not "<= 0", so it would not be
    # treated as a disabled signal downstream and would instead flow into the
    # decay arithmetic; treat it like any other unparseable setting.
    if math.isnan(value):
        return RECENCY_HALFLIFE_DAYS
    return value
326+
327+
301328
def build_where_filter(
302329
wing: str = None,
303330
room: str = None,
@@ -1967,12 +1994,22 @@ def search_memories( # noqa: C901 — fork-only fallback orchestration; complex
19671994
if _rating_boost_enabled():
19681995
rating_adj = rating_distance_adjustment(meta)
19691996

1997+
# Recency adjustment (#158): newer drawers get a small upward nudge via
1998+
# exponential decay on age. Bounded and capped (mempalace.recency) so
1999+
# it reorders neighbors but never displaces a relevant drawer out of
2000+
# the result set — recall is preserved. Off by default; gated by
2001+
# PALACE_RECENCY_BOOST (set "1" to enable), half-life configurable via
2002+
# PALACE_RECENCY_HALFLIFE_DAYS.
2003+
recency_adj = 0.0
2004+
if _recency_boost_enabled():
2005+
recency_adj = recency_distance_adjustment(meta, halflife_days=_recency_halflife_days())
2006+
19702007
# Clamp to the valid cosine-distance range [0, 2]. When a strong
19712008
# closet boost (up to 0.40) exceeds the raw distance, the subtraction
19722009
# can go negative — which (a) yields ``similarity > 1.0`` downstream
19732010
# and (b) makes the sort key land *below* ordinary positive distances,
19742011
# inverting the ranking so the best hybrid matches sort last.
1975-
effective_dist = max(0.0, min(2.0, dist - boost + rating_adj))
2012+
effective_dist = max(0.0, min(2.0, dist - boost + rating_adj + recency_adj))
19762013
entry = {
19772014
"drawer_id": drawer_id,
19782015
"text": doc,

0 commit comments

Comments
 (0)