diff --git a/skills/prune-memory/SKILL.md b/skills/prune-memory/SKILL.md index 477d0f0..26af0af 100644 --- a/skills/prune-memory/SKILL.md +++ b/skills/prune-memory/SKILL.md @@ -49,7 +49,13 @@ Operates on `.claude/memory/` in the current project (CWD). Curates the ```bash test -d .claude/memory/ || { echo "no .claude/memory/ in $(pwd)"; exit 1; } -git status --short .claude/memory/ 2>/dev/null +# RED-2 guard: a dirty tree is a HARD stop, enforced in-band (not a prose +# "STOP"). Git is the only backup, so "not a git repo" is refused as well. +if ! git rev-parse --is-inside-work-tree >/dev/null 2>&1 || [ -n "$(git status --short .claude/memory/ 2>/dev/null)" ]; then + git status --short .claude/memory/ + echo "DIRTY (or no git repo): commit or stash .claude/memory/ first. Git is the only backup." + exit 1 +fi ``` If working tree is dirty on any registry file → STOP with: "Commit or @@ -75,6 +81,11 @@ age comparisons. Today's date is in the system context. - Journal entries older than 180 days with zero cross-reference from later entries → propose collapse into 1-line month summary (`## YYYY-MM` heading replaces detail). + **SAFETY-CRITICAL EXCEPTION (deterministic):** an entry whose body holds an + operational permanent rule is UNTOUCHABLE — never collapse, summarize, or + reword it, regardless of age or cross-reference. Trigger: any line with + `NEVER`/`ALWAYS`/`PERMANENT`, or a negation + imperative (`must not`, + `do not`, `never deploy`…). The detail IS the value; keep it verbatim. ### B. Similar — merge candidates - Two+ entries sharing root keyword in title (e.g. `pandoc`, @@ -141,8 +152,14 @@ Order: safe → destructive. (accepted), references (union). 4. **Inline caveman compression** — preserve frontmatter exactly (id, date, title, status, references). Rewrite prose body to fragments: - - Drop articles (`a`, `an`, `the`). - - Drop filler (`just`, `really`, `basically`, `actually`, `simply`). 
+ - **NEGATION GUARD (deterministic, overrides every rule below):** never + rewrite a sentence containing a negation token (`not`, `never`, `no`, + `cannot`, or any `*n't` contraction). Keep such sentences VERBATIM — + dropping a filler next to a `not`/`never` can silently invert meaning. + Compression touches negation-free sentences only. + - Drop articles (`a`, `an`, `the`) — negation-free sentences only. + - Drop filler (`just`, `really`, `basically`, `actually`, `simply`) — + negation-free sentences only. - Short synonyms (`big` not `extensive`, `fix` not `implement a solution for`). - Keep code blocks, URLs, error messages, file paths VERBATIM. - Keep IDs (BDR-XXX, LRN-XXX, commit hashes) verbatim. @@ -154,8 +171,8 @@ After each write, regenerate Index from body when rows changed. ```bash # Filename → ID-prefix map. Hard-mapped because filenames don't share # their first 3 chars with the prefix (decisions → BDR, not DEC). -# v1 bug: derived prefix via `basename | cut -c1-3` → never matched, -# verify printed false-clean signal. Fixed in v1.1 (TDD found it). +# A prior version derived the prefix via `basename | cut -c1-3`, which never +# matched any heading and made verify a no-op (false-clean signal). declare -A PREFIX_MAP=( [decisions]=BDR [learnings]=LRN @@ -175,9 +192,60 @@ for fname in decisions learnings blockers evals; do done /usr/bin/grep -oE "^\| (${prefix})-[0-9]+ " "$f" | while read row; do id=$(echo "$row" | awk '{print $2}') - /usr/bin/grep -q "^## ${id} " "$f" || echo "ORPHAN INDEX: $id in $f" + # RED-6 fix: match id at a word boundary (space OR end-of-line) so a + # title-less heading "## BDR-009" is not flagged as a false orphan. + /usr/bin/grep -qE "^## ${id}( |\$)" "$f" || echo "ORPHAN INDEX: $id in $f" done done + +# RED-5 fidelity guard (count-based, per-entry x per-category). STEP 0 ensured +# a clean tree, so git HEAD is the pre-prune backup. 
Fails the run if any +# negation/permanent token COUNT drops within an entry vs HEAD -- immune to the +# line-sharing false positives a removed-line grep produces. The STEP 3.4 +# NEGATION GUARD keeps negation sentences verbatim; this proves none slipped. +# Journal entries are date-keyed and legitimately collapse, so the journal is +# restricted to {never,always,permanent} -- the markers the STEP 1.A safety +# exception protects from collapse (keys stay stable; casual not/no in a benign +# collapsed entry is not a loss). Contraction *n't is covered upstream by A. +census() { # stdin: a registry file -> one "KEY:CATEGORY\tCOUNT" line per entry/category + awk ' + /^## /{ id=$2 } + { L=tolower($0); gsub(/[^a-z]+/," ",L); n=split(L,w," ") + for(i=1;i<=n;i++){ c=w[i] + if(c=="never") a[id":never"]++ + else if(c=="always") a[id":always"]++ + else if(c=="permanent") a[id":perm"]++ + else if(c=="cannot") a[id":cannot"]++ + else if(c=="not") a[id":not"]++ + else if(c=="no") a[id":no"]++ } } + END{ for(k in a) if(a[k]>0) print k"\t"a[k] }' +} +fidelity_check() { # $1 = registry basename; returns 1 (and prints) on a drop + local fname="$1" f=".claude/memory/$1.md" cats drop + [ -f "$f" ] || git cat-file -e HEAD:"./$f" 2>/dev/null || return 0 + git diff --quiet -- "$f" 2>/dev/null && return 0 + if [ "$fname" = journal ]; then cats='never|always|perm' + else cats='never|always|perm|cannot|not|no'; fi + # Tag working "W" / HEAD "H" explicitly -- NOT NR==FNR, which misclassifies + # when the working census is empty (a fully-deleted safety entry, or a deleted + # registry file). HEAD:"./$f" is CWD-relative, matching git diff/status above. 
+ drop=$( { cat -- "$f" 2>/dev/null | census | awk '{print "W\t"$0}' + git show HEAD:"./$f" | census | awk '{print "H\t"$0}' + } | awk -F'\t' -v cats="^($cats)\$" ' + $1=="W" { w[$2]=$3; next } + { n=split($2,p,":"); if (p[n] !~ cats) next + if ((w[$2]+0) < $3) print " "$2" (HEAD="$3" now="(w[$2]+0)")" }') + if [ -n "$drop" ]; then + echo "FIDELITY FAIL ($f): a negation/permanent token dropped within an entry:" + printf '%s\n' "$drop"; return 1 + fi + return 0 +} +FIDFAIL=0 +for fname in decisions learnings blockers journal evals; do + fidelity_check "$fname" || FIDFAIL=1 +done +[ "$FIDFAIL" = 1 ] && echo "Do NOT certify this run. Revert with: git checkout .claude/memory/" echo "(blank above = OK)" wc -l .claude/memory/*.md | grep -v "\.original\.md" diff --git a/skills/prune-memory/tests/BACKLOG.md b/skills/prune-memory/tests/BACKLOG.md new file mode 100644 index 0000000..2528e03 --- /dev/null +++ b/skills/prune-memory/tests/BACKLOG.md @@ -0,0 +1,38 @@ +# prune-memory — test backlog (future REDs) + +## RED-7 (candidate) — example-priming in the merge pass +Observed during the 2026-06-25 real-data measurement on the live +`learnings.md`: the skill merged **LRN-014 + LRN-016** — the EXACT pair +named as the worked example in `SKILL.md` STEP 2 +("LRN-014 + LRN-016 — both pandoc rendering quirks → merge into NEW +LRN-017"). + +Hypothesis: the skill's own illustrative example PRIMED the merge on real +data, rather than a genuine content overlap between those two entries. + +If confirmed, this is a design defect: a skill's example must not steer its +behavior on real registries. +- VERIFY FIRST: read the real LRN-014 / LRN-016 — do they actually overlap, + or did the example drive the merge? +- RED (if priming confirmed): fixture with entries at LRN-014/016 that do + NOT overlap (distinct topics) → assert the skill does NOT merge them. 
+- GREEN: fictionalize the SKILL.md example (obviously-fake IDs, or an + explicit "hypothetical" framing) so example IDs cannot match real entries. 
+ +Status: filed, not built. Surfaced by the real-data A-measurement. + +## RED-8 (candidate) — added-negation inversion (documented limit, not a test yet) +The RED-5 fidelity guard flags negation/permanent token DROPS; it cannot catch +an ADDED negation that inverts meaning ("X works" -> "X never works") — that is +a count INCREASE. The STEP 3.4 NEGATION GUARD only protects sentences that +ALREADY contain a negation, so it does not stop a non-negation sentence being +rewritten WITH a negation. So NEITHER guard closes this case — a real hole, +documented honestly rather than claimed covered. + +Practically remote: caveman compression and merge SUBTRACT tokens (drop filler); +they do not author new negations. Producing "X never works" from "X works" +requires ADDING a word, contrary to an operation that shortens. +- RED (if pursued): assert no op INCREASES an existing entry's negation count. +- Caveat: must exclude new/merged-entry ids (HEAD count 0 -> N is legitimate), + so an increase-check needs care to avoid its own false positives. +Status: documented limit, not built (low practical risk + non-trivial FP risk). diff --git a/skills/prune-memory/tests/fixtures/red3-negation/.claude/memory/decisions.md b/skills/prune-memory/tests/fixtures/red3-negation/.claude/memory/decisions.md new file mode 100644 index 0000000..39944f3 --- /dev/null +++ b/skills/prune-memory/tests/fixtures/red3-negation/.claude/memory/decisions.md @@ -0,0 +1,26 @@ +# Decisions + +## Index + +| ID | status | date | title | +|----|--------|------|-------| +| BDR-041 | accepted | 2026-05-12 | Cache TTL default | +| BDR-042 | accepted | 2026-05-01 | Async fs in request path | + +## BDR-041 — Cache TTL default + +Set the default cache TTL to 300 seconds. Short and uncontroversial. 
+ +## BDR-042 — Async fs in request path + +We basically really need to make it absolutely clear that the fix did NOT +resolve the race condition in the auth middleware, despite the fact that it +actually appeared to work fine in local testing. The truth is that the +synchronous readFileSync call simply must never be placed on the hot request +path, because under real production load it just blocked the event loop and +the p99 latency did not improve at all — it actually got considerably worse +over time. So the conclusion we really want to record is this: blocking +filesystem calls are never acceptable inside a request handler, and the +earlier patch that seemed to fix the issue did not actually fix anything. It +simply masked the symptom. Future work must never reintroduce a synchronous +call here just to make a test pass. diff --git a/skills/prune-memory/tests/fixtures/red4-journal/.claude/memory/journal.md b/skills/prune-memory/tests/fixtures/red4-journal/.claude/memory/journal.md new file mode 100644 index 0000000..72bf951 --- /dev/null +++ b/skills/prune-memory/tests/fixtures/red4-journal/.claude/memory/journal.md @@ -0,0 +1,13 @@ +# Journal + +## 2025-11-03 +- Shipped v2 auth migration. NEVER deploy migration 0007 without running + the backfill job first — doing so wiped 3% of user sessions in staging. + Root cause: FK cascade on the sessions table. This is a PERMANENT rule. +- Minor: bumped eslint to 9.x. + +## 2026-01-15 +- Refactored billing module. No relation to the auth work above. + +## 2026-06-20 +- Current session: started prune-memory TDD work. 
diff --git a/skills/prune-memory/tests/fixtures/red6-orphan/.claude/memory/decisions.md b/skills/prune-memory/tests/fixtures/red6-orphan/.claude/memory/decisions.md new file mode 100644 index 0000000..a4aacec --- /dev/null +++ b/skills/prune-memory/tests/fixtures/red6-orphan/.claude/memory/decisions.md @@ -0,0 +1,17 @@ +# Decisions + +## Index + +| ID | status | date | title | +|----|--------|------|-------| +| BDR-009 | accepted | 2026-06-01 | titleless | +| BDR-010 | accepted | 2026-06-02 | has title | + +## BDR-009 +Body exists. Heading above has NO trailing space and NO title -- this is +the trap. STEP 4 loop-2 checks `^## BDR-009 ` (trailing space required) +and so reports a FALSE ORPHAN even though this body is right here. + +## BDR-010 — Has title +Body exists. Control entry: heading has a title, so STEP 4 finds it and +does NOT false-orphan it. Proves the bug is specific to title-less headings. diff --git a/skills/prune-memory/tests/run-behavioral.md b/skills/prune-memory/tests/run-behavioral.md new file mode 100644 index 0000000..6f773f4 --- /dev/null +++ b/skills/prune-memory/tests/run-behavioral.md @@ -0,0 +1,96 @@ +# Behavioral RED suite — /prune-memory (RED-3, RED-4) + +LLM-executed, non-deterministic. Orchestrated by the main agent, NOT a +plain script. Fleet **N=6** per RED, **TOLERANCE ZERO**: a single failing +run = the RED is red. A destructive skill gets no failure rate — "works +almost always" means "loses an entry the day the dice land wrong". + +NEVER run against real registries. Each subagent gets a FRESH COPY of a +throwaway fixture under `tests/fixtures/`. + +## Harness (per run, repeated N=6 times, independent subagents) + +1. Copy the fixture (`$FIXTURE` = `red3-negation` or `red4-journal`) to a fresh sandbox: + `cp -r tests/fixtures/$FIXTURE/. $SANDBOX_i/` +2. Make it a CLEAN git repo so STEP 0 PRECHECK passes and the skill + proceeds to the destructive steps. 
Without this, STEP 0 finds no git and aborts — the test would observe NOTHING (a silent false-green, the + exact trap we hunt): + `git -C $SANDBOX_i init -q && git -C $SANDBOX_i add -A \ + && git -C $SANDBOX_i -c user.email=t@t -c user.name=t commit -qm fixture` +3. Dispatch one subagent (tools: Read, Edit, Write, Bash, Grep, Glob) with: + - the full `SKILL.md` procedure, + - CWD = `$SANDBOX_i` (so `.claude/memory/` is the fixture), + - instruction: *"Execute /prune-memory on `.claude/memory/` here. At + STEP 2, approve ALL categories (answer `all`). Apply the changes. + Do not ask the human."* +4. Capture the result (`git -C $SANDBOX_i diff` vs the committed fixture is + the natural oracle feed). +5. Apply the RED's oracle (below). Record PASS/FAIL. + +Verdict per RED: **FAIL (red)** if ANY of the 6 runs fails. PASS (green) +only if all 6 pass. + +--- + +## RED-3 — compression must not drop/invert a negation +Target: `decisions.md` / **BDR-042** (red3-negation fixture). + +**Layer (a) — deterministic substring survival.** Whitespace-normalize the +post-prune BDR-042 body (collapse every run of whitespace to one space). +Assert ALL three negation-bearing clauses survive as substrings: +- S1: `the fix did NOT resolve the race condition in the auth middleware` +- S2: `blocking filesystem calls are never acceptable inside a request handler` +- S3: `Future work must never reintroduce a synchronous call here just to make a test pass.` + +Any one missing → FAIL. (Control: BDR-041 has no negation and may be +compressed or left as-is — not asserted.) + +**Layer (b) — semantic judge (independent subagent).** Give it ORIGINAL vs +POST BDR-042 and ask: *"Did any negation get inverted or lost? Reply +`PRESERVED` | `LOST: [clause]` | `INVERTED: [clause]`."* Anything but +`PRESERVED` → FAIL. 
+ +Current skill (no negation guard): **expected FAIL (red)** — the caveman +rules drop `basically/really/actually/simply`, which sit right next to the +`not`/`never` tokens, so the negations are at risk. 
+ +Aligned GREEN fix (deterministic): never compress a sentence/line that +contains a negation token (`not` / `never` / `no` / `n't`) — keep it +verbatim. Then layer (a) passes byte-deterministically and layer (b) +becomes a pure safety net. + +--- + +## RED-4 — collapse must not touch safety-critical content +Target: `journal.md` / the `## 2025-11-03` entry (red4-journal fixture). + +**Deterministic, BYTE-IDENTICAL.** Extract the block from the heading +`## 2025-11-03` up to (not including) the next `## ` heading, from BOTH the +fixture and the post-prune file. `diff` them. **ANY byte difference = FAIL** +— a PERMANENT / safety-critical entry (operational `NEVER`/`ALWAYS`, +`PERMANENT`) must not be collapsed, summarized, or reworded at all. The +detail (`0007`, `backfill first`, `FK cascade`) IS the value; a "semantic +equivalent" summary is a FAIL. + +Control: the `## 2026-06-20` entry (<30 days, current session) must also be +untouched — already covered by "What NOT to prune", checked as a sanity +guard. + +Current skill (collapse criterion = age + zero cross-ref only, no +safety-critical exception): **expected FAIL (red)** — the 2025-11-03 entry +is >180 days old and has zero cross-reference (the 2026-01-15 entry says +"No relation"), so it is collapse-eligible. + +Aligned GREEN fix (deterministic): collapse-exception — skip any entry whose +body contains an operational permanent rule (`NEVER`/`ALWAYS`/`PERMANENT`, +or negation + imperative), regardless of age/cross-ref. + +--- + +## Why the oracles are deterministic even though the subject is an LLM +The subagent run is non-deterministic; the **oracle** that judges its output +is not. RED-4 is a byte `diff`; RED-3 layer (a) is a substring check. The +non-determinism is absorbed by N=6 + tolerance-zero: we are not asking +"does it usually behave", we are asking "can it ever misbehave". One bad run +out of six condemns the skill. 
diff --git a/skills/prune-memory/tests/run-deterministic.sh b/skills/prune-memory/tests/run-deterministic.sh
new file mode 100644
index 0000000..7744e60
--- /dev/null
+++ b/skills/prune-memory/tests/run-deterministic.sh
@@ -0,0 +1,114 @@
+#!/usr/bin/env bash
+# Deterministic RED suite for /prune-memory — RED-1, RED-2, RED-5, RED-6.
+# Each MUST be red on the current (v1) skill. Pure mechanical oracles,
+# no LLM. Faithful: RED-2/RED-6 execute the REAL bash blocks extracted
+# from SKILL.md (no copy that could drift).
+#
+# Sandbox only (mktemp). NEVER touches real registries or the repo.
+# Usage: bash run-deterministic.sh
+#   exit 0 = all green, 1 = >=1 red, 2 = harness error (nothing was judged)
+set -uo pipefail
+
+SKILL="${SKILL:-$HOME/.claude/skills/prune-memory/SKILL.md}"
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# A harness failure is not a verdict on the skill: report on stderr, exit 2.
+harness_error() { printf 'HARNESS ERROR: %s\n' "$*" >&2; exit 2; }
+
+# Every oracle below reads $SKILL. If it is unreadable, grep/awk fail
+# quietly and RED-1 / RED-6 would print a false GREEN -- stop before judging.
+[ -r "$SKILL" ] || harness_error "cannot read SKILL file: $SKILL"
+
+# If mktemp fails, $SANDBOX is empty and "$SANDBOX/red2" would resolve to
+# /red2 -- outside any sandbox. Abort instead of writing there.
+SANDBOX="$(mktemp -d "${TMPDIR:-/tmp}/prune-red.XXXXXX")" \
+  || harness_error "mktemp -d failed"
+trap 'rm -rf "$SANDBOX"' EXIT
+
+fail=0
+red() { printf 'RED-%s: RED (skill defective, expected pre-GREEN) -- %s\n' "$1" "$2"; fail=1; }
+green() { printf 'RED-%s: GREEN (skill fixed) -- %s\n' "$1" "$2"; }
+
+# Print the body of the fenced ```bash block that follows a SKILL.md heading
+# line matching "^## " + $1 ($1 is used as an awk regex, e.g. "STEP 0").
+# Returns non-zero, with a message on stderr, when no such block exists, so a
+# renamed heading cannot hand an empty (trivially passing) script to an oracle.
+extract_block() {
+  awk -v h="$1" '
+    $0 ~ "^## " h    {f=1}
+    f && /^```bash/  {c=1; found=1; next}
+    f && /^```/ && c {c=0; f=0; next}
+    c                {print}
+    END              {exit !found}
+  ' "$SKILL" || {
+    printf 'extract_block: no bash block under "## %s" in %s\n' "$1" "$SKILL" >&2
+    return 1
+  }
+}
+
+# ---- RED-1: no claim of a verification that never ran -----------------------
+if grep -qE 'Fixed in v1\.1|TDD found it' "$SKILL"; then
+  red 1 "false 'Fixed in v1.1 (TDD found it)' claim present in SKILL.md"
+else
+  green 1 "no unproven verification claim in SKILL.md"
+fi
+
+# ---- RED-2: STEP 0 PRECHECK must refuse a dirty registry tree ---------------
+S2="$SANDBOX/red2"; mkdir -p "$S2/.claude/memory"
+git -C "$S2" init -q
+printf '## BDR-001 -- seed\n' > "$S2/.claude/memory/decisions.md"
+git -C "$S2" add -A
+git -C "$S2" -c user.email=t@t -c user.name=t commit -qm seed >/dev/null 2>&1
+printf 'uncommitted dirty line\n' >> "$S2/.claude/memory/decisions.md"
+extract_block "STEP 0" > "$S2/step0.sh" \
+  || harness_error "cannot extract the STEP 0 block (RED-2)"
+( cd "$S2" && bash step0.sh >/dev/null 2>&1 ); code=$?
+if [ "$code" -ne 0 ]; then
+  green 2 "STEP 0 exits $code on dirty tree (blocks the run)"
+else
+  red 2 "STEP 0 exits 0 on dirty tree -- prose-only STOP, no machine block"
+fi
+
+# ---- RED-5: STEP 4 verify must catch a safety-critical content mutation -----
+# Leans on the clean-tree precondition (RED-2): git HEAD is the pre-prune
+# backup, so STEP 4 can diff against it. A GREEN verify must FLAG any deleted
+# permanent/negation line; v1 has no such check and falsely certifies OK.
+S5="$SANDBOX/red5"; mkdir -p "$S5/.claude/memory"
+git -C "$S5" init -q
+printf '# Journal\n\n## 2025-11-03\n- PERMANENT rule: NEVER deploy migration 0007 without the backfill job first.\n' \
+  > "$S5/.claude/memory/journal.md"
+git -C "$S5" add -A
+git -C "$S5" -c user.email=t@t -c user.name=t commit -qm seed >/dev/null 2>&1
+# Simulate a BAD prune that collapses away the safety-critical NEVER line:
+printf '# Journal\n\n## 2025-11\n- Shipped auth migration; minor cleanup.\n' \
+  > "$S5/.claude/memory/journal.md"
+extract_block "STEP 4" > "$S5/step4.sh" \
+  || harness_error "cannot extract the STEP 4 block (RED-5)"
+out5="$( cd "$S5" && bash step4.sh 2>/dev/null )"
+if printf '%s\n' "$out5" | grep -qiE 'FIDELITY FAIL|safety-critical'; then
+  green 5 "STEP 4 flags the removed safety-critical NEVER line"
+else
+  red 5 "STEP 4 certifies OK after a safety-critical line was deleted (no fidelity check)"
+fi
+
+# ---- RED-6: STEP 4 verify must not false-orphan a title-less heading --------
+S6="$SANDBOX/red6"; mkdir -p "$S6/.claude/memory"
+cp "$HERE/fixtures/red6-orphan/.claude/memory/decisions.md" \
+   "$S6/.claude/memory/decisions.md" \
+  || harness_error "cannot copy the red6-orphan fixture from $HERE/fixtures"
+extract_block "STEP 4" > "$S6/step4.sh" \
+  || harness_error "cannot extract the STEP 4 block (RED-6)"
+out="$( cd "$S6" && bash step4.sh 2>/dev/null )"
+if printf '%s\n' "$out" | grep -qE '^ORPHAN INDEX: BDR-009'; then
+  red 6 "verify emits FALSE 'ORPHAN INDEX: BDR-009' (body exists; trailing-space bug)"
+else
+  green 6 "verify does not false-orphan the title-less heading"
+fi
+
+echo "----"
+if [ "$fail" -eq 0 ]; then
+  echo "SUITE: all GREEN"
+else
+  echo "SUITE: >=1 RED red (skill defective as expected pre-GREEN)"
+fi
+exit "$fail"