RED-7 (example-priming): the STEP-2 worked example named live IDs (LRN-014 + LRN-016) and modeled merging them — but they are complementary (header-ids vs checkbox-CSS), a merge the skill's own rule forbids. Live IDs in an example prime the skill to act on those exact entries on real data. Fictionalized the whole STEP-2 example to 9xx IDs (cannot match a live registry); the merge example now models a same-concept merge. Closed by a DETERMINISTIC test (run-deterministic.sh RED-7: the example must carry only 9xx ids) per LRN-046, not a flaky behavioral fixture. The test caught its own ugrep false-green first (a leading-dash pattern parsed as an option) — fixed via /usr/bin/grep, the same dodge the skill's verify already uses at line 189. RED-8 (added-negation inversion): re-reviewed, consciously accepted as a documented limit in BACKLOG — remote (compression subtracts tokens), and an FP-safe increase check is non-trivial (needs the HEAD entry-id set to exclude legit new/merged 0->N); a noisy guard is worse than the honest limit on a destructive skill (LRN-047). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_01C6bUdvHnajCNzgVQefZowj
108 lines
4.8 KiB
Bash
108 lines
4.8 KiB
Bash
#!/usr/bin/env bash
# Deterministic RED suite for /prune-memory — RED-1, RED-2, RED-5, RED-6, RED-7.
#
# Each check MUST be red on the current (v1) skill. The oracles are purely
# mechanical (no LLM). Faithful: RED-2, RED-5 and RED-6 execute the REAL bash
# blocks extracted from SKILL.md (no copy that could drift).
#
# Sandbox only (mktemp). NEVER touches real registries or the repo.
# Usage: bash run-deterministic.sh   (exit 0 = all green, 1 = >=1 red)

# -u        : expanding an unset variable is an error.
# pipefail  : a pipeline fails when any of its stages fails.
# -e is not enabled: checks such as RED-2 read a command's non-zero exit
# status as their verdict, so a failing command must not abort the suite.
set -uo pipefail

# Skill file under test; export SKILL=/path/to/SKILL.md to override the default.
SKILL="${SKILL:-$HOME/.claude/skills/prune-memory/SKILL.md}"

# Absolute directory of this script; fixtures are looked up relative to it.
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Private scratch directory for every check. Abort (exit 2 = harness error,
# distinct from 1 = a red check) when it cannot be created: with an empty
# SANDBOX the per-check paths would resolve to /red2, /red5, ... and the suite
# would write outside the sandbox.
SANDBOX="$(mktemp -d "${TMPDIR:-/tmp}/prune-red.XXXXXX")" || SANDBOX=""
if [ -z "$SANDBOX" ] || [ ! -d "$SANDBOX" ]; then
  printf 'run-deterministic: cannot create a sandbox under %s\n' "${TMPDIR:-/tmp}" >&2
  exit 2
fi

# Remove the sandbox on every exit path.
trap 'rm -rf -- "$SANDBOX"' EXIT

# Suite verdict: 0 while every check is green; red() flips it to 1 and the
# script exits with this value.
fail=0

# red ID MESSAGE
#   Report check RED-<ID> as red on stdout and mark the whole suite as failed.
red() {
  printf 'RED-%s: RED (skill defective, expected pre-GREEN) -- %s\n' "$1" "$2"
  fail=1
}

# green ID MESSAGE
#   Report check RED-<ID> as green on stdout; leaves the suite verdict alone.
green() {
  printf 'RED-%s: GREEN (skill fixed) -- %s\n' "$1" "$2"
}

# extract_block HEADING
#   Print the body of the first fenced ```bash block found under the
#   "## <HEADING>" section of $SKILL (fence lines themselves are not printed).
#   HEADING is used as a regular expression anchored after "## ".
#
#   The search is bounded to the section: it stops at the next "## " heading,
#   so a section without a bash block yields empty output instead of silently
#   returning a later section's block. Non-bash fences inside the section are
#   skipped whole, so a "## ..." line inside an example is not taken for a
#   heading.
# Globals:  SKILL (read) - path of the Markdown file to scan.
# Outputs:  the block body on stdout; nothing when no block is found.
# Returns:  awk's exit status (non-zero when $SKILL cannot be read).
extract_block() {
  awk -v h="$1" '
    # c: inside the captured bash block.
    c {
      if (/^```bash/) next                       # same as before: skipped, capture continues
      if (/^```/) { c = 0; f = 0; next }         # closing fence ends this section
      print
      next
    }
    # x: inside a non-bash fence within the wanted section.
    f && x { if (/^```/) x = 0; next }
    # f: inside the wanted section. Any level-2 heading opens or closes it.
    /^## / { f = ($0 ~ ("^## " h)); x = 0; next }
    f && /^```bash/ { c = 1; next }
    f && /^```/     { x = 1 }
  ' "$SKILL"
}

# ---- RED-1: no claim of a verification that never ran -----------------------
# grep exits 0 on a match, 1 on no match and >1 on an error (e.g. SKILL.md is
# missing or unreadable). Only a clean "no match" may be reported green; an
# error must not be mistaken for the absence of the claim.
grep -qE 'Fixed in v1\.1|TDD found it' "$SKILL"
case $? in
  0) red 1 "false 'Fixed in v1.1 (TDD found it)' claim present in SKILL.md" ;;
  1) green 1 "no unproven verification claim in SKILL.md" ;;
  *) red 1 "HARNESS ERROR: cannot read SKILL.md at $SKILL -- refusing a false GREEN" ;;
esac

# ---- RED-2: STEP 0 PRECHECK must refuse a dirty registry tree ---------------
# Build a throwaway repo with one committed registry file, dirty it, then run
# the real STEP 0 block inside it. A non-zero exit means STEP 0 blocks the run.
S2="$SANDBOX/red2"

# Seed the repo. The commit is made hermetic (identity and gpgsign passed on
# the command line) so a global git config cannot make it fail. If seeding
# fails anyway, the check is reported red rather than judged on a repo that is
# not in the intended state.
if mkdir -p "$S2/.claude/memory" &&
   git -C "$S2" init -q &&
   printf '## BDR-001 -- seed\n' > "$S2/.claude/memory/decisions.md" &&
   git -C "$S2" add -A &&
   git -C "$S2" -c user.email=t@t -c user.name=t -c commit.gpgsign=false \
     commit -qm seed >/dev/null 2>&1; then
  # Make the tracked registry file dirty.
  printf 'uncommitted dirty line\n' >> "$S2/.claude/memory/decisions.md"
  extract_block "STEP 0" > "$S2/step0.sh"
  # Subshell keeps the cd local; only the exit status matters.
  ( cd "$S2" && bash step0.sh >/dev/null 2>&1 ); code=$?
  if [ "$code" -ne 0 ]; then
    green 2 "STEP 0 exits $code on dirty tree (blocks the run)"
  else
    red 2 "STEP 0 exits 0 on dirty tree -- prose-only STOP, no machine block"
  fi
else
  red 2 "HARNESS ERROR: could not seed the sandbox git repo -- refusing to judge STEP 0"
fi

# ---- RED-5: STEP 4 verify must catch a safety-critical content mutation -----
# Leans on the clean-tree precondition (RED-2): git HEAD is the pre-prune
# backup, so STEP 4 can diff against it. A GREEN verify must FLAG any deleted
# permanent/negation line; v1 has no such check and falsely certifies OK.
S5="$SANDBOX/red5"

# Seed HEAD with a journal that carries a safety-critical NEVER line. The
# commit is hermetic (identity and gpgsign passed on the command line): without
# a HEAD commit a fixed STEP 4 would have nothing to diff against and this
# check could never turn green.
if mkdir -p "$S5/.claude/memory" &&
   git -C "$S5" init -q &&
   printf '# Journal\n\n## 2025-11-03\n- PERMANENT rule: NEVER deploy migration 0007 without the backfill job first.\n' \
     > "$S5/.claude/memory/journal.md" &&
   git -C "$S5" add -A &&
   git -C "$S5" -c user.email=t@t -c user.name=t -c commit.gpgsign=false \
     commit -qm seed >/dev/null 2>&1; then
  # Simulate a BAD prune that collapses away the safety-critical NEVER line:
  printf '# Journal\n\n## 2025-11\n- Shipped auth migration; minor cleanup.\n' \
    > "$S5/.claude/memory/journal.md"
  extract_block "STEP 4" > "$S5/step4.sh"
  out5="$( cd "$S5" && bash step4.sh 2>/dev/null )"
  if printf '%s\n' "$out5" | grep -qiE 'FIDELITY FAIL|safety-critical'; then
    green 5 "STEP 4 flags the removed safety-critical NEVER line"
  else
    red 5 "STEP 4 certifies OK after a safety-critical line was deleted (no fidelity check)"
  fi
else
  red 5 "HARNESS ERROR: could not seed the sandbox git repo -- refusing to judge STEP 4"
fi

# ---- RED-6: STEP 4 verify must not false-orphan a title-less heading --------
# Runs the real STEP 4 block against a fixture registry and fails if it reports
# BDR-009 as an orphan index entry.
#
# This check is green when a line is ABSENT from the output, so anything that
# empties the output would look green. The two harness failures that can do
# that (fixture not copied, no STEP 4 block extracted) are reported red.
S6="$SANDBOX/red6"
if mkdir -p "$S6/.claude/memory" &&
   cp "$HERE/fixtures/red6-orphan/.claude/memory/decisions.md" \
      "$S6/.claude/memory/decisions.md"; then
  extract_block "STEP 4" > "$S6/step4.sh"
  if [ ! -s "$S6/step4.sh" ]; then
    red 6 "HARNESS ERROR: no bash block extracted for STEP 4 -- refusing a false GREEN"
  else
    out="$( cd "$S6" && bash step4.sh 2>/dev/null )"
    if printf '%s\n' "$out" | grep -qE '^ORPHAN INDEX: BDR-009'; then
      red 6 "verify emits FALSE 'ORPHAN INDEX: BDR-009' (body exists; trailing-space bug)"
    else
      green 6 "verify does not false-orphan the title-less heading"
    fi
  fi
else
  red 6 "HARNESS ERROR: fixture red6-orphan could not be copied -- refusing a false GREEN"
fi

# ---- RED-7: STEP 2 plan example must use FICTIONAL ids, never live registry ids
# Live ids in the worked example PRIME the skill to act on those exact entries on
# real data (observed 2026-06-25: it merged the example's LRN-014 + LRN-016 on the
# live learnings.md, though they are complementary, not overlapping). Fictional ids
# (9xx) cannot match a real registry. Reads SKILL.md only — sandbox-safe.
#
# The id scan is a single awk pass rather than a grep pipeline. It therefore
# depends neither on which grep is installed (the system grep may be ugrep,
# which misparses a leading-dash pattern as an option) nor on a grep binary
# existing at a fixed path; either failure would empty the result and produce a
# FALSE GREEN.

# red7_live_ids
#   Read text on stdin and print every BDR-/LRN-/BLK-/EVAL- id whose number is
#   not exactly 9xx (three digits, leading 9), sorted, de-duplicated and
#   space-separated with a trailing space. Prints nothing when all ids are 9xx.
red7_live_ids() {
  awk '
    {
      rest = $0
      while (match(rest, /(BDR|LRN|BLK|EVAL)-[0-9]+/)) {
        id = substr(rest, RSTART, RLENGTH)
        # The dash anchors the number, so 4-digit ids such as LRN-1914 are
        # not accepted as fictional.
        if (id !~ /-9[0-9][0-9]$/) print id
        rest = substr(rest, RSTART + RLENGTH)
      }
    }
  ' | sort -u | tr '\n' ' '
}

# The worked example runs from the "PRUNE PLAN" line through the
# "Approve per category" line. stdin is /dev/null so awk can never block on it.
ex7="$(awk '/^PRUNE PLAN/{f=1} f{print} /^Approve per category/{f=0; exit}' "$SKILL" </dev/null)"
if [ -z "$ex7" ]; then
  # Nothing was scanned, so nothing was proven: do not report green.
  red 7 "STEP 2 example not found in SKILL.md ('PRUNE PLAN' .. 'Approve per category') -- refusing a false GREEN"
else
  bad7="$(printf '%s\n' "$ex7" | red7_live_ids)"
  if [ -n "${bad7// /}" ]; then
    red 7 "STEP 2 example uses LIVE-range ids (prime real-data ops): ${bad7% }"
  else
    green 7 "STEP 2 example uses only fictional (9xx) ids"
  fi
fi

# ---- Summary ----------------------------------------------------------------
# Print the overall verdict and exit with it: 0 when no check called red(),
# 1 otherwise.
echo "----"
if [ "$fail" -eq 0 ]; then
  echo "SUITE: all GREEN"
else
  echo "SUITE: >=1 RED (skill defective as expected pre-GREEN)"
fi
exit "$fail"