chore(graphify): bump skill to 0.7.0
- GitHub URL clone support (single + multi-repo merge)
- uv tool detection ahead of pipx shebang
- Optional Kimi K2.6 backend via MOONSHOT_API_KEY
- Discrete confidence rubric (0.55/0.65/0.75/0.85/0.95) replaces continuous range
- Aggregated community view for graphs >5000 nodes
- Manifest persistence after --update so next run diffs against current state
- Video file support, rationale-as-attribute (not separate node)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
d5677f67c9
commit
d50763e04c
@ -1 +1 @@
|
|||||||
0.4.23
|
0.7.0
|
||||||
@ -1,6 +1,6 @@
|
|||||||
---
|
---
|
||||||
name: graphify
|
name: graphify
|
||||||
description: "any input (code, docs, papers, images) - knowledge graph - clustered communities - HTML + JSON + audit report"
|
description: "any input (code, docs, papers, images, videos) to knowledge graph. Use when user asks any question about a codebase, documents, or project content - especially if graphify-out/ exists, treat the question as a /graphify query."
|
||||||
trigger: /graphify
|
trigger: /graphify
|
||||||
---
|
---
|
||||||
|
|
||||||
@ -13,6 +13,9 @@ Turn any folder of files into a navigable knowledge graph with community detecti
|
|||||||
```
|
```
|
||||||
/graphify # full pipeline on current directory → Obsidian vault
|
/graphify # full pipeline on current directory → Obsidian vault
|
||||||
/graphify <path> # full pipeline on specific path
|
/graphify <path> # full pipeline on specific path
|
||||||
|
/graphify https://github.com/<owner>/<repo> # clone repo then run full pipeline on it
|
||||||
|
/graphify https://github.com/<owner>/<repo> --branch <branch> # clone a specific branch
|
||||||
|
/graphify <url1> <url2> ... # clone multiple repos, build each, merge into one cross-repo graph
|
||||||
/graphify <path> --mode deep # thorough extraction, richer INFERRED edges
|
/graphify <path> --mode deep # thorough extraction, richer INFERRED edges
|
||||||
/graphify <path> --update # incremental - re-extract only new/changed files
|
/graphify <path> --update # incremental - re-extract only new/changed files
|
||||||
/graphify <path> --directed # build directed graph (preserves edge direction: source→target)
|
/graphify <path> --directed # build directed graph (preserves edge direction: source→target)
|
||||||
@ -57,25 +60,60 @@ Use it for:
|
|||||||
|
|
||||||
If no path was given, use `.` (current directory). Do not ask the user for a path.
|
If no path was given, use `.` (current directory). Do not ask the user for a path.
|
||||||
|
|
||||||
|
If the path argument starts with `https://github.com/` or `http://github.com/`, treat it as a GitHub URL — run Step 0 before anything else, then continue with the resolved local path.
|
||||||
|
|
||||||
Follow these steps in order. Do not skip steps.
|
Follow these steps in order. Do not skip steps.
|
||||||
|
|
||||||
|
### Step 0 - Clone GitHub repo(s) (only if a GitHub URL was given)
|
||||||
|
|
||||||
|
**Single repo:**
|
||||||
|
```bash
|
||||||
|
LOCAL_PATH=$(graphify clone <github-url> [--branch <branch>])
|
||||||
|
# Use LOCAL_PATH as the target for all subsequent steps
|
||||||
|
```
|
||||||
|
|
||||||
|
**Multiple repos (cross-repo graph):**
|
||||||
|
```bash
|
||||||
|
# Clone each repo, run the full pipeline on each, then merge
|
||||||
|
graphify clone <url1> # → ~/.graphify/repos/<owner1>/<repo1>
|
||||||
|
graphify clone <url2> # → ~/.graphify/repos/<owner2>/<repo2>
|
||||||
|
# Run /graphify on each local path to produce their graph.json files
|
||||||
|
# Then merge:
|
||||||
|
graphify merge-graphs \
|
||||||
|
~/.graphify/repos/<owner1>/<repo1>/graphify-out/graph.json \
|
||||||
|
~/.graphify/repos/<owner2>/<repo2>/graphify-out/graph.json \
|
||||||
|
--out graphify-out/cross-repo-graph.json
|
||||||
|
```
|
||||||
|
|
||||||
|
Graphify clones into `~/.graphify/repos/<owner>/<repo>` and reuses existing clones on repeat runs. Each node in the merged graph carries a `repo` attribute so you can filter by origin.
|
||||||
|
|
||||||
### Step 1 - Ensure graphify is installed
|
### Step 1 - Ensure graphify is installed
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Detect the correct Python interpreter (handles pipx, venv, system installs)
|
# Detect the correct Python interpreter (handles uv tool, pipx, venv, system installs)
|
||||||
|
PYTHON=""
|
||||||
GRAPHIFY_BIN=$(which graphify 2>/dev/null)
|
GRAPHIFY_BIN=$(which graphify 2>/dev/null)
|
||||||
if [ -n "$GRAPHIFY_BIN" ]; then
|
# 1. uv tool installs — most reliable on modern Mac/Linux
|
||||||
PYTHON=$(head -1 "$GRAPHIFY_BIN" | tr -d '#!')
|
if [ -z "$PYTHON" ] && command -v uv >/dev/null 2>&1; then
|
||||||
case "$PYTHON" in
|
_UV_PY=$(uv tool run graphifyy python -c "import sys; print(sys.executable)" 2>/dev/null)
|
||||||
*[!a-zA-Z0-9/_.-]*) PYTHON="python3" ;;
|
if [ -n "$_UV_PY" ]; then PYTHON="$_UV_PY"; fi
|
||||||
esac
|
|
||||||
else
|
|
||||||
PYTHON="python3"
|
|
||||||
fi
|
fi
|
||||||
|
# 2. Read shebang from graphify binary (pipx and direct pip installs)
|
||||||
|
if [ -z "$PYTHON" ] && [ -n "$GRAPHIFY_BIN" ]; then
|
||||||
|
_SHEBANG=$(head -1 "$GRAPHIFY_BIN" | tr -d '#!')
|
||||||
|
case "$_SHEBANG" in
|
||||||
|
*[!a-zA-Z0-9/_.-]*) ;;
|
||||||
|
*) "$_SHEBANG" -c "import graphify" 2>/dev/null && PYTHON="$_SHEBANG" ;;
|
||||||
|
esac
|
||||||
|
fi
|
||||||
|
# 3. Fall back to python3
|
||||||
|
if [ -z "$PYTHON" ]; then PYTHON="python3"; fi
|
||||||
"$PYTHON" -c "import graphify" 2>/dev/null || "$PYTHON" -m pip install graphifyy -q 2>/dev/null || "$PYTHON" -m pip install graphifyy -q --break-system-packages 2>&1 | tail -3
|
"$PYTHON" -c "import graphify" 2>/dev/null || "$PYTHON" -m pip install graphifyy -q 2>/dev/null || "$PYTHON" -m pip install graphifyy -q --break-system-packages 2>&1 | tail -3
|
||||||
# Write interpreter path for all subsequent steps (persists across invocations)
|
# Write interpreter path for all subsequent steps (persists across invocations)
|
||||||
mkdir -p graphify-out
|
mkdir -p graphify-out
|
||||||
"$PYTHON" -c "import sys; open('graphify-out/.graphify_python', 'w').write(sys.executable)"
|
"$PYTHON" -c "import sys; open('graphify-out/.graphify_python', 'w').write(sys.executable)"
|
||||||
|
# Save scan root so `graphify update` (no args) knows where to look next time
|
||||||
|
echo "$(cd INPUT_PATH && pwd)" > graphify-out/.graphify_root
|
||||||
```
|
```
|
||||||
|
|
||||||
If the import succeeds, print nothing and move straight to Step 2.
|
If the import succeeds, print nothing and move straight to Step 2.
|
||||||
@ -162,7 +200,12 @@ After transcription:
|
|||||||
|
|
||||||
**Before starting:** note whether `--mode deep` was given. You must pass `DEEP_MODE=true` to every subagent in Step B2 if it was. Track this from the original invocation - do not lose it.
|
**Before starting:** note whether `--mode deep` was given. You must pass `DEEP_MODE=true` to every subagent in Step B2 if it was. Track this from the original invocation - do not lose it.
|
||||||
|
|
||||||
This step has two parts: **structural extraction** (deterministic, free) and **semantic extraction** (Claude, costs tokens).
|
This step has two parts: **structural extraction** (deterministic, free) and **semantic extraction** (LLM, costs tokens).
|
||||||
|
|
||||||
|
**Before dispatching subagents:** check whether `MOONSHOT_API_KEY` is set. If it is NOT set, print this one-liner to the user:
|
||||||
|
> Tip: set `MOONSHOT_API_KEY` to use Kimi K2.6 for semantic extraction — 3x cheaper, richer graphs (`pip install 'graphifyy[kimi]'`).
|
||||||
|
|
||||||
|
Print it once, then continue. If `MOONSHOT_API_KEY` IS set, use `graphify.llm.extract_corpus_parallel(files, backend="kimi")` for semantic extraction instead of dispatching Claude subagents.
|
||||||
|
|
||||||
**Run Part A (AST) and Part B (semantic) in parallel. Dispatch all semantic subagents AND start AST extraction in the same message. Both can run simultaneously since they operate on different file types. Merge results in Part C as before.**
|
**Run Part A (AST) and Part B (semantic) in parallel. Dispatch all semantic subagents AND start AST extraction in the same message. Both can run simultaneously since they operate on different file types. Merge results in Part C as before.**
|
||||||
|
|
||||||
@ -264,7 +307,8 @@ Rules:
|
|||||||
|
|
||||||
Code files: focus on semantic edges AST cannot find (call relationships, shared data, arch patterns).
|
Code files: focus on semantic edges AST cannot find (call relationships, shared data, arch patterns).
|
||||||
Do not re-extract imports - AST already has those.
|
Do not re-extract imports - AST already has those.
|
||||||
Doc/paper files: extract named concepts, entities, citations. Also extract rationale — sections that explain WHY a decision was made, trade-offs chosen, or design intent. These become nodes with `rationale_for` edges pointing to the concept they explain.
|
Doc/paper files: extract named concepts, entities, citations. For rationale (WHY decisions were made, trade-offs, design intent): store the explanation as a `rationale` attribute on the relevant concept node — do NOT create a separate rationale node or fragment node for it. Only create a node for something that is itself a named entity or concept. `file_type:"rationale"` is for concept-like nodes (ideas, principles, mechanisms, design patterns), not for the rationale text itself. Do NOT invent file_types like `concept` — valid values are only `code|document|paper|image|rationale`.
|
||||||
|
Code files: when adding `calls` edges, source MUST be the caller (the function/class doing the calling), target MUST be the callee. Never reverse this direction.
|
||||||
Image files: use vision to understand what the image IS - do not just OCR.
|
Image files: use vision to understand what the image IS - do not just OCR.
|
||||||
UI screenshot: layout patterns, design decisions, key elements, purpose.
|
UI screenshot: layout patterns, design decisions, key elements, purpose.
|
||||||
Chart: metric, trend/insight, data source.
|
Chart: metric, trend/insight, data source.
|
||||||
@ -293,16 +337,22 @@ If a file has YAML frontmatter (--- ... ---), copy source_url, captured_at, auth
|
|||||||
|
|
||||||
confidence_score is REQUIRED on every edge - never omit it, never use 0.5 as a default:
|
confidence_score is REQUIRED on every edge - never omit it, never use 0.5 as a default:
|
||||||
- EXTRACTED edges: confidence_score = 1.0 always
|
- EXTRACTED edges: confidence_score = 1.0 always
|
||||||
- INFERRED edges: reason about each edge individually.
|
- INFERRED edges: pick exactly ONE value from this set — never 0.5:
|
||||||
Direct structural evidence (shared data structure, clear dependency): 0.8-0.9.
|
0.95 direct structural evidence (shared data structure, named cross-file reference).
|
||||||
Reasonable inference with some uncertainty: 0.6-0.7.
|
0.85 strong inference (clear functional alignment, no direct symbol link).
|
||||||
Weak or speculative: 0.4-0.5. Most edges should be 0.6-0.9, not 0.5.
|
0.75 reasonable inference (shared problem domain + similar shape, requires interpretation).
|
||||||
|
0.65 weak inference (thematically related, no shape evidence).
|
||||||
|
0.55 speculative but plausible (surface-level co-occurrence only).
|
||||||
|
Models follow discrete rubrics better than continuous ranges; the bimodal
|
||||||
|
distribution observed in production (>50% at 0.5, >40% at 0.85+) shows the
|
||||||
|
range guidance is being collapsed to a binary. If no value above fits, mark
|
||||||
|
the edge AMBIGUOUS rather than picking 0.4 or below.
|
||||||
- AMBIGUOUS edges: 0.1-0.3
|
- AMBIGUOUS edges: 0.1-0.3
|
||||||
|
|
||||||
Node ID format: lowercase, only `[a-z0-9_]`, no dots or slashes. Format: `{stem}_{entity}` where stem is the filename without extension and entity is the symbol name, both normalized (lowercase, non-alphanumeric chars replaced with `_`). Example: `src/auth/session.py` + `ValidateToken` → `session_validatetoken`. This must match the ID the AST extractor generates so cross-references between code and semantic nodes connect correctly.
|
Node ID format: lowercase, only `[a-z0-9_]`, no dots or slashes. Format: `{stem}_{entity}` where stem is the filename without extension and entity is the symbol name, both normalized (lowercase, non-alphanumeric chars replaced with `_`). Example: `src/auth/session.py` + `ValidateToken` → `session_validatetoken`. This must match the ID the AST extractor generates so cross-references between code and semantic nodes connect correctly. CRITICAL: never append chunk numbers, sequence numbers, or any suffix to an ID (no `_c1`, `_c2`, `_chunk2`, etc.). IDs must be deterministic from the file stem and entity name alone — the same entity must always produce the same ID regardless of which chunk processes it.
|
||||||
|
|
||||||
Output exactly this JSON (no other text):
|
Output exactly this JSON (no other text):
|
||||||
{"nodes":[{"id":"session_validatetoken","label":"Human Readable Name","file_type":"code|document|paper|image","source_file":"relative/path","source_location":null,"source_url":null,"captured_at":null,"author":null,"contributor":null}],"edges":[{"source":"node_id","target":"node_id","relation":"calls|implements|references|cites|conceptually_related_to|shares_data_with|semantically_similar_to|rationale_for","confidence":"EXTRACTED|INFERRED|AMBIGUOUS","confidence_score":1.0,"source_file":"relative/path","source_location":null,"weight":1.0}],"hyperedges":[{"id":"snake_case_id","label":"Human Readable Label","nodes":["node_id1","node_id2","node_id3"],"relation":"participate_in|implement|form","confidence":"EXTRACTED|INFERRED","confidence_score":0.75,"source_file":"relative/path"}],"input_tokens":0,"output_tokens":0}
|
{"nodes":[{"id":"session_validatetoken","label":"Human Readable Name","file_type":"code|document|paper|image|rationale","source_file":"relative/path","source_location":null,"source_url":null,"captured_at":null,"author":null,"contributor":null}],"edges":[{"source":"node_id","target":"node_id","relation":"calls|implements|references|cites|conceptually_related_to|shares_data_with|semantically_similar_to|rationale_for","confidence":"EXTRACTED|INFERRED|AMBIGUOUS","confidence_score":1.0,"source_file":"relative/path","source_location":null,"weight":1.0}],"hyperedges":[{"id":"snake_case_id","label":"Human Readable Label","nodes":["node_id1","node_id2","node_id3"],"relation":"participate_in|implement|form","confidence":"EXTRACTED|INFERRED","confidence_score":0.75,"source_file":"relative/path"}],"input_tokens":0,"output_tokens":0}
|
||||||
```
|
```
|
||||||
|
|
||||||
**Step B3 - Collect, cache, and merge**
|
**Step B3 - Collect, cache, and merge**
|
||||||
@ -315,6 +365,30 @@ Wait for all subagents. For each result:
|
|||||||
|
|
||||||
If more than half the chunks failed or are missing, stop and tell the user to re-run and ensure `subagent_type="general-purpose"` is used.
|
If more than half the chunks failed or are missing, stop and tell the user to re-run and ensure `subagent_type="general-purpose"` is used.
|
||||||
|
|
||||||
|
Merge all chunk files into `graphify-out/.graphify_semantic_new.json`. **After each Agent call completes, read the real token counts from the Agent tool result's `usage` field and write them back into the chunk JSON before merging** — the chunk JSON itself always has placeholder zeros. Once every chunk has its real token counts, run:
|
||||||
|
```bash
|
||||||
|
$(cat graphify-out/.graphify_python) -c "
|
||||||
|
import json, glob
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
chunks = sorted(glob.glob('graphify-out/.graphify_chunk_*.json'))
|
||||||
|
all_nodes, all_edges, all_hyperedges = [], [], []
|
||||||
|
total_in, total_out = 0, 0
|
||||||
|
for c in chunks:
|
||||||
|
d = json.loads(Path(c).read_text())
|
||||||
|
all_nodes += d.get('nodes', [])
|
||||||
|
all_edges += d.get('edges', [])
|
||||||
|
all_hyperedges += d.get('hyperedges', [])
|
||||||
|
total_in += d.get('input_tokens', 0)
|
||||||
|
total_out += d.get('output_tokens', 0)
|
||||||
|
Path('graphify-out/.graphify_semantic_new.json').write_text(json.dumps({
|
||||||
|
'nodes': all_nodes, 'edges': all_edges, 'hyperedges': all_hyperedges,
|
||||||
|
'input_tokens': total_in, 'output_tokens': total_out,
|
||||||
|
}, indent=2))
|
||||||
|
print(f'Merged {len(chunks)} chunks: {total_in:,} in / {total_out:,} out tokens')
|
||||||
|
"
|
||||||
|
```
|
||||||
|
|
||||||
Save new results to cache:
|
Save new results to cache:
|
||||||
```bash
|
```bash
|
||||||
$(cat graphify-out/.graphify_python) -c "
|
$(cat graphify-out/.graphify_python) -c "
|
||||||
@ -542,8 +616,30 @@ G = build_from_json(extraction)
|
|||||||
communities = {int(k): v for k, v in analysis['communities'].items()}
|
communities = {int(k): v for k, v in analysis['communities'].items()}
|
||||||
labels = {int(k): v for k, v in labels_raw.items()}
|
labels = {int(k): v for k, v in labels_raw.items()}
|
||||||
|
|
||||||
if G.number_of_nodes() > 5000:
|
NODE_LIMIT = 5000
|
||||||
print(f'Graph has {G.number_of_nodes()} nodes - too large for HTML viz. Use Obsidian vault instead.')
|
if G.number_of_nodes() > NODE_LIMIT:
|
||||||
|
from collections import Counter
|
||||||
|
print(f'Graph has {G.number_of_nodes()} nodes (above {NODE_LIMIT} limit). Building aggregated community view...')
|
||||||
|
node_to_community = {nid: cid for cid, members in communities.items() for nid in members}
|
||||||
|
import networkx as nx_meta
|
||||||
|
meta = nx_meta.Graph()
|
||||||
|
for cid, members in communities.items():
|
||||||
|
meta.add_node(str(cid), label=labels.get(cid, f'Community {cid}'))
|
||||||
|
edge_counts = Counter()
|
||||||
|
for u, v in G.edges():
|
||||||
|
cu, cv = node_to_community.get(u), node_to_community.get(v)
|
||||||
|
if cu is not None and cv is not None and cu != cv:
|
||||||
|
edge_counts[(min(cu, cv), max(cu, cv))] += 1
|
||||||
|
for (cu, cv), w in edge_counts.items():
|
||||||
|
meta.add_edge(str(cu), str(cv), weight=w, relation=f'{w} cross-community edges', confidence='AGGREGATED')
|
||||||
|
if meta.number_of_nodes() > 1:
|
||||||
|
meta_communities = {cid: [str(cid)] for cid in communities}
|
||||||
|
member_counts = {cid: len(members) for cid, members in communities.items()}
|
||||||
|
to_html(meta, meta_communities, 'graphify-out/graph.html', community_labels=labels or None, member_counts=member_counts)
|
||||||
|
print(f'graph.html written (aggregated: {meta.number_of_nodes()} community nodes, {meta.number_of_edges()} cross-community edges)')
|
||||||
|
print('Tip: run with --obsidian for full node-level detail.')
|
||||||
|
else:
|
||||||
|
print('Single community — aggregated view not useful. Skipping graph.html.')
|
||||||
else:
|
else:
|
||||||
to_html(G, communities, 'graphify-out/graph.html', community_labels=labels or None)
|
to_html(G, communities, 'graphify-out/graph.html', community_labels=labels or None)
|
||||||
print('graph.html written - open in any browser, no server needed')
|
print('graph.html written - open in any browser, no server needed')
|
||||||
@ -738,7 +834,7 @@ cost_path.write_text(json.dumps(cost, indent=2))
|
|||||||
print(f'This run: {input_tok:,} input tokens, {output_tok:,} output tokens')
|
print(f'This run: {input_tok:,} input tokens, {output_tok:,} output tokens')
|
||||||
print(f'All time: {cost[\"total_input_tokens\"]:,} input, {cost[\"total_output_tokens\"]:,} output ({len(cost[\"runs\"])} runs)')
|
print(f'All time: {cost[\"total_input_tokens\"]:,} input, {cost[\"total_output_tokens\"]:,} output ({len(cost[\"runs\"])} runs)')
|
||||||
"
|
"
|
||||||
rm -f graphify-out/.graphify_detect.json graphify-out/.graphify_extract.json graphify-out/.graphify_ast.json graphify-out/.graphify_semantic.json graphify-out/.graphify_analysis.json graphify-out/.graphify_labels.json
|
rm -f graphify-out/.graphify_detect.json graphify-out/.graphify_extract.json graphify-out/.graphify_ast.json graphify-out/.graphify_semantic.json graphify-out/.graphify_analysis.json graphify-out/.graphify_chunk_*.json
|
||||||
rm -f graphify-out/.needs_update 2>/dev/null || true
|
rm -f graphify-out/.needs_update 2>/dev/null || true
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -857,7 +953,10 @@ deleted = set(incremental.get('deleted_files', []))
|
|||||||
if deleted:
|
if deleted:
|
||||||
to_remove = [n for n, d in G_existing.nodes(data=True) if d.get('source_file') in deleted]
|
to_remove = [n for n, d in G_existing.nodes(data=True) if d.get('source_file') in deleted]
|
||||||
G_existing.remove_nodes_from(to_remove)
|
G_existing.remove_nodes_from(to_remove)
|
||||||
print(f'Pruned {len(to_remove)} ghost nodes from {len(deleted)} deleted file(s)')
|
if to_remove:
|
||||||
|
print(f'Pruned {len(to_remove)} ghost node(s) from {len(deleted)} deleted file(s) — drift detected and corrected.')
|
||||||
|
else:
|
||||||
|
print(f'{len(deleted)} file(s) deleted since last run, but no ghost nodes were present in the graph — no drift.')
|
||||||
|
|
||||||
# Merge: new nodes/edges into existing graph
|
# Merge: new nodes/edges into existing graph
|
||||||
G_existing.update(G_new)
|
G_existing.update(G_new)
|
||||||
@ -873,6 +972,14 @@ merged_out = {
|
|||||||
}
|
}
|
||||||
Path('graphify-out/.graphify_extract.json').write_text(json.dumps(merged_out))
|
Path('graphify-out/.graphify_extract.json').write_text(json.dumps(merged_out))
|
||||||
print(f'[graphify update] Merged extraction written ({len(merged_out[\"nodes\"])} nodes, {len(merged_out[\"edges\"])} edges)')
|
print(f'[graphify update] Merged extraction written ({len(merged_out[\"nodes\"])} nodes, {len(merged_out[\"edges\"])} edges)')
|
||||||
|
|
||||||
|
# Save manifest with the CURRENT full file list so the next --update
|
||||||
|
# diffs against today's filesystem state, not the prior --update's
|
||||||
|
# baseline. Without this, deleted files get reported as ghosts again
|
||||||
|
# on every subsequent --update until a full rebuild runs.
|
||||||
|
from graphify.detect import save_manifest
|
||||||
|
save_manifest(incremental['files'])
|
||||||
|
print('[graphify update] Manifest saved.')
|
||||||
"
|
"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user