{
  "$schema": "./schema.json",
  "license": "CC-BY-4.0",
  "patterns": [
    {
      "id": "black-box-opaqueness",
      "name": "Black-Box Opaqueness",
      "aliases": [
        "Opaque Agent",
        "No-Trace Agent"
      ],
      "category": "anti-patterns",
      "intent": "Anti-pattern: ship an agent without traces, decision logs, or provenance, then debug from user reports.",
      "context": "Schedule pressure or framework defaults push the agent into production with no observability hooks.",
      "problem": "When (not if) the agent does something wrong, there is no record of why; debugging is reduced to reproduction attempts.",
      "forces": [
        "Observability has a cost (storage, dev time).",
        "Frameworks differ in trace quality.",
        "Privacy and trace coverage tension."
      ],
      "solution": "Don't. Add traces, decision logs, and provenance from day one. See provenance-ledger, decision-log, lineage-tracking.",
      "consequences": {
        "benefits": [],
        "liabilities": [
          "Debugging time stretches to weeks.",
          "Compliance posture is unanswerable.",
          "Stakeholder trust erodes."
        ]
      },
      "constrains": "By definition, this anti-pattern imposes no useful constraint; the missing constraint is the failure mode.",
      "known_uses": [
        {
          "system": "Default state of un-instrumented LangChain projects circa 2023",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "provenance-ledger",
          "relation": "alternative-to"
        },
        {
          "pattern": "decision-log",
          "relation": "alternative-to"
        },
        {
          "pattern": "lineage-tracking",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "repo",
          "title": "ai-standards/ai-design-patterns (Black-Box Opaqueness)",
          "url": "https://github.com/ai-standards/ai-design-patterns"
        }
      ],
      "status_in_practice": "deprecated",
      "tags": [
        "anti-pattern",
        "observability"
      ]
    },
    {
      "id": "hallucinated-citations",
      "name": "Hallucinated Citations",
      "aliases": [
        "Fake URLs",
        "Invented References"
      ],
      "category": "anti-patterns",
      "intent": "Anti-pattern: let the model emit citations as free text and trust them.",
      "context": "An agent answers with citations but the citation pipeline is the model itself; no source-id discipline.",
      "problem": "The model invents URLs, paper titles, and author names. Hallucinated citations look authoritative until clicked.",
      "forces": [
        "Real citations require source ids and a retrieval pipeline.",
        "Models trained on academic text are particularly fluent at fabricating citations.",
        "End users do not check."
      ],
      "solution": "Don't. Wire citations to retrieved-source ids. See citation-streaming, naive-rag, contextual-retrieval. Validate URLs before display.",
      "consequences": {
        "benefits": [],
        "liabilities": [
          "Trust collapse on first user verification.",
          "Legal / regulatory exposure in regulated domains."
        ]
      },
      "constrains": "By definition, this anti-pattern imposes no useful constraint; the missing constraint is the failure mode.",
      "known_uses": [
        {
          "system": "Notable lawyer's brief incident, 2023 (filed hallucinated cases)",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "citation-streaming",
          "relation": "alternative-to"
        },
        {
          "pattern": "naive-rag",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "spec",
          "title": "OWASP LLM09: Misinformation",
          "year": 2025,
          "url": "https://genai.owasp.org/llmrisk/llm092025-misinformation/"
        }
      ],
      "status_in_practice": "deprecated",
      "tags": [
        "anti-pattern",
        "citation",
        "hallucination"
      ]
    },
    {
      "id": "hallucinated-tools",
      "name": "Hallucinated Tools",
      "aliases": [
        "Phantom Tool Calls",
        "Imagined Functions"
      ],
      "category": "anti-patterns",
      "intent": "Anti-pattern: trust the model to invoke only the tools it has been given, then debug calls to functions that do not exist.",
      "context": "The agent is given a tool palette, but the host accepts whatever the model emits without validation.",
      "problem": "The model invents tool names. The host either crashes, silently drops the call, or worse, dispatches to a similar-named real tool.",
      "forces": [
        "Validation feels redundant when providers offer typed tool calls.",
        "Provider-side validation is not always strict.",
        "Logging fails to surface 'tool does not exist' as a first-class event."
      ],
      "solution": "Don't trust. Validate every tool call against the registered palette before dispatch. Reject unknown names with a typed error the agent can react to. See tool-use, structured-output.",
      "consequences": {
        "benefits": [],
        "liabilities": [
          "Silent failures.",
          "Wrong actions executed by similar-named tools."
        ]
      },
      "constrains": "By definition, this anti-pattern imposes no useful constraint; the missing constraint is the failure mode.",
      "known_uses": [
        {
          "system": "Common in pre-2024 agent integrations",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "tool-use",
          "relation": "alternative-to"
        },
        {
          "pattern": "structured-output",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Tool use with Claude",
          "year": 2025,
          "url": "https://docs.claude.com/en/docs/agents-and-tools/tool-use/overview"
        }
      ],
      "status_in_practice": "deprecated",
      "tags": [
        "anti-pattern",
        "tool-use"
      ]
    },
    {
      "id": "hero-agent",
      "name": "Hero Agent",
      "aliases": [
        "Mega-Prompt Agent",
        "God Agent"
      ],
      "category": "anti-patterns",
      "intent": "Anti-pattern: stuff every capability into one agent with one giant prompt.",
      "context": "An agent product grows; new capabilities are added by appending to the system prompt and the tool palette of a single agent.",
      "problem": "The agent's prompt becomes a monolith. Tools conflict. The model is confused about which path to take. Cheap requests pay expensive prices.",
      "forces": [
        "Specialisation requires routing or multi-agent infrastructure that does not yet exist.",
        "Splitting feels like premature optimisation.",
        "One-prompt is fastest to ship and slowest to maintain."
      ],
      "solution": "Don't. Once the prompt exceeds a few hundred lines or the tool count exceeds about a dozen, extract specialists. See routing, supervisor, multi-model-routing.",
      "consequences": {
        "benefits": [],
        "liabilities": [
          "Quality regressions on each new capability.",
          "Cost ballooning.",
          "Debugging the agent becomes archaeology."
        ]
      },
      "constrains": "By definition, this anti-pattern imposes no useful constraint; the missing constraint is the failure mode.",
      "known_uses": [
        {
          "system": "Common in early-stage AI products",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "routing",
          "relation": "alternative-to"
        },
        {
          "pattern": "supervisor",
          "relation": "alternative-to"
        },
        {
          "pattern": "multi-model-routing",
          "relation": "alternative-to"
        },
        {
          "pattern": "tool-explosion",
          "relation": "complements"
        },
        {
          "pattern": "prompt-bloat",
          "relation": "complements"
        },
        {
          "pattern": "sop-encoded-multi-agent",
          "relation": "alternative-to"
        },
        {
          "pattern": "cross-domain-agent-network",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "repo",
          "title": "ai-standards/ai-design-patterns (Hero Agent)",
          "url": "https://github.com/ai-standards/ai-design-patterns"
        }
      ],
      "status_in_practice": "deprecated",
      "tags": [
        "anti-pattern",
        "monolith"
      ]
    },
    {
      "id": "hidden-mode-switching",
      "name": "Hidden Mode Switching",
      "aliases": [
        "Silent Model Swap",
        "Undisclosed Routing"
      ],
      "category": "anti-patterns",
      "intent": "Anti-pattern: silently swap the underlying model between requests without disclosing the change to users or operators.",
      "context": "Cost or capacity pressure pushes a product to route some requests to cheaper models; the routing is invisible.",
      "problem": "Reproducibility breaks; users notice quality changes they cannot diagnose; trust erodes.",
      "forces": [
        "Cost arbitrage feels too good to disclose.",
        "Per-request model disclosure adds UI complexity.",
        "Hidden routing complicates eval gates."
      ],
      "solution": "Don't. Disclose model identity per response. Use multi-model-routing transparently. Make routing decisions inspectable.",
      "consequences": {
        "benefits": [],
        "liabilities": [
          "Trust erosion when users discover the swap.",
          "Reproducibility broken across requests.",
          "Eval results become misleading."
        ]
      },
      "constrains": "By definition, this anti-pattern imposes no useful constraint; the missing constraint is the failure mode.",
      "known_uses": [
        {
          "system": "GPT-4 -> GPT-4o auto-router incident, 2024",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "multi-model-routing",
          "relation": "alternative-to"
        },
        {
          "pattern": "lineage-tracking",
          "relation": "alternative-to"
        },
        {
          "pattern": "model-card",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "Building Effective Agents",
          "authors": "Anthropic",
          "year": 2024,
          "url": "https://www.anthropic.com/engineering/building-effective-agents"
        }
      ],
      "status_in_practice": "deprecated",
      "tags": [
        "anti-pattern",
        "routing",
        "disclosure"
      ]
    },
    {
      "id": "infinite-debate",
      "name": "Infinite Debate",
      "aliases": [
        "Stuck Multi-Agent",
        "Convergence Failure",
        "Agents Stuck Talking",
        "Multi-Agent Loop"
      ],
      "category": "anti-patterns",
      "intent": "Anti-pattern: launch multi-agent debate without a termination rule and watch the agents loop forever.",
      "context": "Debate or consensus patterns are added without explicit halt conditions; the agents argue until the cost cap kicks in.",
      "problem": "Debate without termination converges only by accident. Real cost grows linearly while progress stalls.",
      "forces": [
        "Consensus heuristics are easy to game.",
        "Round caps cut off legitimate convergence.",
        "Judge agents become the new bottleneck."
      ],
      "solution": "Don't. Add a round cap and a termination predicate. Pair debate with a judge or aggregator. See debate, step-budget, the-stop-hook.",
      "consequences": {
        "benefits": [],
        "liabilities": [
          "Cost blow-up.",
          "User-visible non-termination."
        ]
      },
      "constrains": "By definition, this anti-pattern imposes no useful constraint; the missing constraint is the failure mode.",
      "known_uses": [
        {
          "system": "Early multi-agent demos in 2023-2024",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "debate",
          "relation": "alternative-to"
        },
        {
          "pattern": "step-budget",
          "relation": "alternative-to"
        },
        {
          "pattern": "stop-hook",
          "relation": "alternative-to"
        },
        {
          "pattern": "communicative-dehallucination",
          "relation": "conflicts-with"
        }
      ],
      "references": [
        {
          "type": "repo",
          "title": "ai-standards/ai-design-patterns (Infinite Debate)",
          "url": "https://github.com/ai-standards/ai-design-patterns"
        }
      ],
      "status_in_practice": "deprecated",
      "tags": [
        "anti-pattern",
        "multi-agent",
        "termination"
      ]
    },
    {
      "id": "naive-rag-first",
      "name": "Naive-RAG-First",
      "aliases": [
        "RAG-By-Default",
        "Vector-Store-First"
      ],
      "category": "anti-patterns",
      "intent": "Anti-pattern: reach for naive RAG before checking whether the knowledge actually needs retrieval.",
      "context": "A team adds RAG because the field talks about RAG, even when the agent's information lives in a database, an API, or a small static document.",
      "problem": "Vector indexes are added where a SQL query, a tool call, or a system prompt would suffice. Cost and complexity rise; quality often drops because retrieval is the wrong shape.",
      "forces": [
        "RAG is on every reference architecture.",
        "Vector stores feel like a moat.",
        "Tool use is sometimes harder to build than RAG."
      ],
      "solution": "Don't reach for RAG first. Check whether the knowledge lives in a tool (database, API, search service), a scoped system prompt, or a small inlined document. Only adopt RAG when those genuinely do not work. See tool-use, naive-rag for when it does.",
      "consequences": {
        "benefits": [],
        "liabilities": [
          "Architectural complexity that pays for nothing.",
          "Retrieval misses that a SQL query would not.",
          "Embedding maintenance burden."
        ]
      },
      "constrains": "By definition, this anti-pattern imposes no useful constraint; the missing constraint is the failure mode.",
      "known_uses": [
        {
          "system": "Common in 2023-2024 enterprise AI projects",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "naive-rag",
          "relation": "conflicts-with",
          "note": "RAG is fine; RAG-first is not."
        },
        {
          "pattern": "tool-use",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "Marco Nissen, Working with the models",
          "year": 2026,
          "url": "https://substack.com/@marconissen"
        }
      ],
      "status_in_practice": "deprecated",
      "tags": [
        "anti-pattern",
        "rag",
        "architecture"
      ]
    },
    {
      "id": "perma-beta",
      "name": "Perma-Beta",
      "aliases": [
        "Forever Beta",
        "Eval Vacuum"
      ],
      "category": "anti-patterns",
      "intent": "Anti-pattern: ship the agent in 'beta' indefinitely so that quality regressions are someone else's problem.",
      "context": "An agent is launched without an eval harness and stays in beta because removing the label would create accountability for quality.",
      "problem": "Beta becomes a permanent excuse. Without an eval harness, quality is a guess and regressions are invisible.",
      "forces": [
        "Eval harnesses cost time to build.",
        "GA promises commit to quality bars.",
        "Beta lets product move fast."
      ],
      "solution": "Don't. Build the eval harness and exit beta. See eval-harness, llm-as-judge, shadow-canary.",
      "consequences": {
        "benefits": [],
        "liabilities": [
          "Trust erosion.",
          "No SLA defensibility.",
          "Quality stagnates without measurement."
        ]
      },
      "constrains": "By definition, this anti-pattern imposes no useful constraint; the missing constraint is the failure mode.",
      "known_uses": [
        {
          "system": "Many AI products since 2023",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "eval-harness",
          "relation": "alternative-to"
        },
        {
          "pattern": "shadow-canary",
          "relation": "alternative-to"
        },
        {
          "pattern": "eval-as-contract",
          "relation": "conflicts-with"
        }
      ],
      "references": [
        {
          "type": "repo",
          "title": "ai-standards/ai-design-patterns (Perma-Beta)",
          "url": "https://github.com/ai-standards/ai-design-patterns"
        }
      ],
      "status_in_practice": "deprecated",
      "tags": [
        "anti-pattern",
        "release",
        "beta"
      ]
    },
    {
      "id": "prompt-bloat",
      "name": "Prompt Bloat",
      "aliases": [
        "Prompt Accretion",
        "Eternal System Prompt"
      ],
      "category": "anti-patterns",
      "intent": "Anti-pattern: every bug fix adds a sentence to the system prompt; nothing is ever removed.",
      "context": "Production agents whose system prompt grows monotonically over months; no eviction policy; no review for relevance.",
      "problem": "Past a few thousand tokens, the prompt squeezes retrieval, forces cache misses, and yields diminishing-returns instruction following. Distinct from hero-agent (which is about scope) — this is about prompt accretion as a process.",
      "forces": [
        "Adding a sentence feels free; removing one feels risky.",
        "No clear owner of the prompt's overall design.",
        "Eval coverage rarely catches bloat-driven regressions."
      ],
      "solution": "Don't. Treat the prompt as code: PR review, eval gate on length, quarterly pruning sprints. Lift recurring procedures into agent-skills. Move stable rules into a constitutional charter.",
      "consequences": {
        "benefits": [],
        "liabilities": [
          "Token cost per turn rises monotonically.",
          "Cache misses on every prompt edit.",
          "Conflicting instructions accumulate; the model picks one at random."
        ]
      },
      "constrains": "By definition, this anti-pattern imposes no useful constraint; the missing eviction policy is the failure.",
      "known_uses": [
        {
          "system": "Common at six-month-old agent products",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "agent-skills",
          "relation": "alternative-to"
        },
        {
          "pattern": "constitutional-charter",
          "relation": "alternative-to"
        },
        {
          "pattern": "hero-agent",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "Eugene Yan: Prompt engineering as a craft",
          "url": "https://eugeneyan.com/writing/llm-patterns/"
        },
        {
          "type": "blog",
          "title": "Hamel Husain: Improving the operations of agents",
          "url": "https://hamel.dev"
        }
      ],
      "status_in_practice": "deprecated",
      "tags": [
        "anti-pattern",
        "prompt"
      ]
    },
    {
      "id": "same-model-self-critique",
      "name": "Same-Model Self-Critique",
      "aliases": [
        "Echo-Chamber Reflection",
        "Single-Model Reflexion"
      ],
      "category": "anti-patterns",
      "intent": "Anti-pattern: have the same model both produce an answer and critique it, expecting independence.",
      "context": "Reflexion or evaluator-optimizer loops where producer and critic share a model and a prompt family.",
      "problem": "The critic shares the producer's blind spots. Wrong answers are reinforced as confident. 2025 replication studies confirm the effect.",
      "forces": [
        "Two models cost twice.",
        "Cross-model judges have their own biases.",
        "Self-critique feels free."
      ],
      "solution": "Don't pretend it is independent. Either accept that self-critique catches surface errors only, or use a different model family for the critic. See reflection, evaluator-optimizer, llm-as-judge.",
      "consequences": {
        "benefits": [],
        "liabilities": [
          "False confidence in flawed answers.",
          "Self-reinforced misconceptions across iterations."
        ]
      },
      "constrains": "By definition, this anti-pattern imposes no useful constraint; the missing constraint is the failure mode.",
      "known_uses": [
        {
          "system": "Naive Reflexion implementations",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "reflection",
          "relation": "alternative-to",
          "note": "Same-model-self-critique is the misuse mode of reflection; well-engineered reflection (frozen-rubric or self-refine) avoids the failure."
        },
        {
          "pattern": "evaluator-optimizer",
          "relation": "conflicts-with"
        },
        {
          "pattern": "self-refine",
          "relation": "conflicts-with"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "Theaiengineer.substack: ReAct vs Plan-and-Execute vs ReWOO vs Reflexion",
          "year": 2025,
          "url": "https://theaiengineer.substack.com/"
        }
      ],
      "status_in_practice": "deprecated",
      "tags": [
        "anti-pattern",
        "reflection"
      ]
    },
    {
      "id": "schema-free-output",
      "name": "Schema-Free Output",
      "aliases": [
        "Free-Form Tool Call",
        "String-Parsing the Model"
      ],
      "category": "anti-patterns",
      "intent": "Anti-pattern: parse free-form model output for downstream code instead of using structured output.",
      "context": "An agent emits text that downstream code parses with regex, string splits, or 'is the word yes in there?'.",
      "problem": "The model invents punctuation, formatting, and field names. Parsers fail in non-obvious ways. Errors are mis-attributed to the model when they are parser bugs.",
      "forces": [
        "Structured output adds setup cost and provider lock-in.",
        "Some providers offered structured output later than tool use.",
        "Free-form feels flexible until it breaks."
      ],
      "solution": "Don't. Use structured-output (JSON Schema, Pydantic, function calling). See structured-output, tool-use.",
      "consequences": {
        "benefits": [],
        "liabilities": [
          "Brittle parsing.",
          "Silent corruption of downstream state.",
          "Debugging blames the model when the parser is at fault."
        ]
      },
      "constrains": "By definition, this anti-pattern imposes no useful constraint; the missing constraint is the failure mode.",
      "known_uses": [
        {
          "system": "Pre-2024 LangChain agents",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "structured-output",
          "relation": "alternative-to"
        },
        {
          "pattern": "tool-use",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Tool use with Claude",
          "year": 2025,
          "url": "https://docs.claude.com/en/docs/agents-and-tools/tool-use/overview"
        },
        {
          "type": "blog",
          "title": "Building Effective Agents",
          "authors": "Anthropic",
          "year": 2024,
          "url": "https://www.anthropic.com/engineering/building-effective-agents"
        }
      ],
      "status_in_practice": "deprecated",
      "tags": [
        "anti-pattern",
        "parsing",
        "schema"
      ]
    },
    {
      "id": "tool-explosion",
      "name": "Tool Explosion",
      "aliases": [
        "Bloated Tool Registry",
        "100-Tool Agent",
        "Too Many Tools",
        "Tool Registry Bloat",
        "Function-Calling Accuracy Collapse"
      ],
      "category": "anti-patterns",
      "intent": "Anti-pattern: expose every available tool in every request and watch function-calling accuracy collapse.",
      "context": "MCP and plugin ecosystems make hundreds of tools easy to register; teams expose them all.",
      "problem": "Past about 20 tools, model selection accuracy drops sharply; the agent picks wrong tools or invents wrong arguments.",
      "forces": [
        "Adding tools feels free; selecting subsets feels like extra engineering.",
        "Discovery is push-style; filter is pull-style.",
        "Frontier models tolerate larger palettes; the threshold drifts."
      ],
      "solution": "Don't. Use tool-loadout to select per-task subsets. Cap exposed tools at a tested threshold. Measure function-calling accuracy as a release gate.",
      "consequences": {
        "benefits": [],
        "liabilities": [
          "Selection accuracy degradation.",
          "Token cost from large tool definitions in every prompt.",
          "Latency from prompt-caching cache-misses on tool changes."
        ]
      },
      "constrains": "By definition, this anti-pattern imposes no useful constraint; the missing constraint is the failure mode.",
      "known_uses": [
        {
          "system": "Common in early MCP integrations 2025",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "tool-loadout",
          "relation": "conflicts-with"
        },
        {
          "pattern": "hero-agent",
          "relation": "complements",
          "note": "Two flavours of the same problem: too much in one prompt."
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Gorilla: Large Language Model Connected with Massive APIs (Berkeley Function-Calling Leaderboard)",
          "authors": "Patil, Zhang, Wang, Gonzalez",
          "year": 2023,
          "url": "https://arxiv.org/abs/2305.15334"
        },
        {
          "type": "blog",
          "title": "Drew Breunig: How Long Contexts Fail / How to Fix Your Context",
          "year": 2025,
          "url": "https://www.dbreunig.com"
        }
      ],
      "status_in_practice": "deprecated",
      "tags": [
        "anti-pattern",
        "tool-use"
      ]
    },
    {
      "id": "tool-output-trusted-verbatim",
      "name": "Tool Output Trusted Verbatim",
      "aliases": [
        "Untyped Tool Returns",
        "No Tool Output Validation"
      ],
      "category": "anti-patterns",
      "intent": "Anti-pattern: trust whatever tools return without validation, schema enforcement, or trust labels.",
      "context": "Agents accept tool output as ground truth; assume the tool is honest, returns valid JSON, and stays within content limits.",
      "problem": "Real tools return errors as 200 OK with `{error: ...}`, multi-MB responses that blow context, HTML with embedded scripts, or text with embedded prompt-injection payloads.",
      "forces": [
        "Validation feels like duplicate work when typed function calls exist.",
        "Schema enforcement requires per-tool work.",
        "Size limits are tool-specific."
      ],
      "solution": "Don't. Validate every tool result against a schema. Cap response size. Sanitise HTML. Apply tool-output-poisoning defenses. See tool-output-poisoning, structured-output, input-output-guardrails.",
      "consequences": {
        "benefits": [],
        "liabilities": [
          "Silent corruption of agent context.",
          "Indirect prompt injection succeeds.",
          "Context overflow from oversized responses."
        ]
      },
      "constrains": "By definition, this anti-pattern imposes no useful constraint; the missing validation is the failure.",
      "known_uses": [
        {
          "system": "Common in pre-2025 MCP integrations",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "tool-output-poisoning",
          "relation": "alternative-to"
        },
        {
          "pattern": "structured-output",
          "relation": "alternative-to"
        },
        {
          "pattern": "input-output-guardrails",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "spec",
          "title": "OWASP LLM01: Prompt Injection",
          "year": 2025,
          "url": "https://genai.owasp.org/llmrisk/llm01-prompt-injection/"
        }
      ],
      "status_in_practice": "deprecated",
      "tags": [
        "anti-pattern",
        "tool-trust"
      ]
    },
    {
      "id": "unbounded-loop",
      "name": "Unbounded Loop",
      "aliases": [
        "No Step Cap",
        "Open-Ended Agent",
        "Agent Stuck",
        "Loops Forever"
      ],
      "category": "anti-patterns",
      "intent": "Anti-pattern: run the agent loop without a step budget and let model self-termination decide.",
      "context": "An agent is implemented as 'while not done' where 'done' is whatever the model says. The model rarely says done.",
      "problem": "The agent wanders, retries, or loops on errors. Cost is unbounded. User waits.",
      "forces": [
        "Caps cut off legitimate work.",
        "Choosing the cap is empirical.",
        "Model self-termination feels natural until it fails."
      ],
      "solution": "Don't. Set max_steps. Add a stop hook. See step-budget, the-stop-hook.",
      "consequences": {
        "benefits": [],
        "liabilities": [
          "Cost blow-up.",
          "Silent quality regressions when models drift."
        ]
      },
      "constrains": "By definition, this anti-pattern imposes no useful constraint; the missing constraint is the failure mode.",
      "known_uses": [
        {
          "system": "Early autonomous-agent demos (AutoGPT, BabyAGI initial versions)",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "step-budget",
          "relation": "alternative-to"
        },
        {
          "pattern": "stop-hook",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "Building Effective Agents",
          "authors": "Anthropic",
          "year": 2024,
          "url": "https://www.anthropic.com/engineering/building-effective-agents"
        }
      ],
      "status_in_practice": "deprecated",
      "tags": [
        "anti-pattern",
        "loop",
        "budget"
      ]
    },
    {
      "id": "unbounded-subagent-spawn",
      "name": "Unbounded Subagent Spawn",
      "aliases": [
        "Recursive Spawn",
        "Subagent Fan-Out Bomb"
      ],
      "category": "anti-patterns",
      "intent": "Anti-pattern: a supervisor or orchestrator spawns sub-agents that can themselves spawn sub-agents without a global cap.",
      "context": "Multi-agent systems with `supervisor` / `orchestrator-workers` / `lead-researcher` patterns where each level decomposes the task and spawns more agents.",
      "problem": "step-budget caps a single agent's loop; cost-gating caps a single action's cost. Neither caps total system spend through fan-out. A buggy decomposition can recursively explode the agent tree.",
      "forces": [
        "Per-agent caps look like sufficient governance until fan-out is observed.",
        "Detecting recursive spawn requires global agent tree state.",
        "Killing a single instance does not kill its descendants."
      ],
      "solution": "Don't. Maintain a global step budget across all descendants of a root request. Cap fan-out per supervisor (typically 5-10 children). Track parent_run_id in lineage so the agent tree is inspectable. Pair with kill-switch for emergency descent halt.",
      "consequences": {
        "benefits": [],
        "liabilities": [
          "Catastrophic cost spikes from runaway decomposition.",
          "Untracked descendants survive a top-level halt.",
          "Provider rate-limits cascade through the tree."
        ]
      },
      "constrains": "By definition, this anti-pattern imposes no useful constraint; the missing global fan-out cap is the failure.",
      "known_uses": [
        {
          "system": "Observed in early multi-agent demos (AutoGPT-style 2023)",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "step-budget",
          "relation": "alternative-to"
        },
        {
          "pattern": "cost-gating",
          "relation": "alternative-to"
        },
        {
          "pattern": "kill-switch",
          "relation": "alternative-to"
        },
        {
          "pattern": "subagent-isolation",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "Building Effective Agents",
          "authors": "Anthropic",
          "year": 2024,
          "url": "https://www.anthropic.com/engineering/building-effective-agents"
        }
      ],
      "status_in_practice": "deprecated",
      "tags": [
        "anti-pattern",
        "multi-agent",
        "fan-out"
      ]
    },
    {
      "id": "agent-as-judge",
      "name": "Agent-as-a-Judge",
      "aliases": [
        "Trajectory Evaluator",
        "Judge Agent"
      ],
      "category": "governance-observability",
      "intent": "Evaluate an agent's full trajectory (steps, tool calls, intermediate states) by another agent rather than scoring only the final output.",
      "context": "Agent evaluations (SWE-Bench, agentic benchmarks) where the final output is one signal among many and the trajectory contains evaluable structure.",
      "problem": "LLM-as-judge evaluates only the final answer; agentic tasks succeed or fail along the trajectory in ways the final answer hides.",
      "forces": [
        "Trajectory evaluation is more expensive than answer-only judging.",
        "Judge agents have their own biases and failure modes.",
        "Trajectory schemas vary per agent framework."
      ],
      "solution": "A judge agent receives the candidate agent's full trajectory: thoughts, tool calls, observations, intermediate state, and final answer. It evaluates against a rubric covering correctness, efficiency, and process quality. Outputs a structured verdict with rationale.",
      "consequences": {
        "benefits": [
          "Catches process-level failures that hide behind right answers.",
          "Inspectable judge rationales."
        ],
        "liabilities": [
          "Cost: trajectory evaluation is expensive.",
          "Judge calibration on trajectory rubrics is its own dataset effort."
        ]
      },
      "constrains": "The judge sees the full trajectory, not just the final output; answer-only evaluation is not used in this pattern.",
      "known_uses": [
        {
          "system": "MetaGPT Agent-as-a-Judge",
          "status": "available",
          "url": "https://github.com/metauto-ai/agent-as-a-judge"
        },
        {
          "system": "SWE-Bench-style agentic benchmarks",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "llm-as-judge",
          "relation": "specialises"
        },
        {
          "pattern": "eval-harness",
          "relation": "uses"
        },
        {
          "pattern": "decision-log",
          "relation": "uses"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Agent-as-a-Judge: Evaluate Agents with Agents",
          "authors": "Zhuge, Zhao, Ashley, Wang, Khizbullin, Xiong, Liu, Chang, Zhang, Yang, Liu, Huang, Schmidhuber",
          "year": 2024,
          "url": "https://arxiv.org/abs/2410.10934"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "eval",
        "judge",
        "trajectory"
      ]
    },
    {
      "id": "agent-resumption",
      "name": "Agent Resumption",
      "aliases": [
        "Durable Execution",
        "Pause-and-Resume",
        "Long-Running Agent State"
      ],
      "category": "governance-observability",
      "intent": "Persist agent execution state so a long-running run survives restarts, deploys, or user disconnects.",
      "context": "Production agents that run for minutes to hours; users need the work to survive disconnects and ops events.",
      "problem": "Agents that lose state on restart lose hours of work; users distrust long-running agents.",
      "forces": [
        "Checkpoint frequency vs cost.",
        "What to persist; what to recompute.",
        "Resumability requires deterministic enough replay or full state capture."
      ],
      "solution": "Two production approaches. (a) Deterministic replay of recorded effects (Temporal/Inngest pattern): state = inputs + log of side-effects; on resume, the engine re-executes the workflow code, skipping side-effects that already have logged results. (b) Checkpoint snapshots of agent state (LangGraph Cloud pattern): periodically serialise plan, working memory, partial outputs, pending tool calls; restore on restart. Both approaches require deterministic idempotency keys passed to side-effect targets so a replayed-but-unlogged call is deduplicated downstream. Without this, crash-between-effect-and-log produces duplicates.",
      "consequences": {
        "benefits": [
          "Reliability for long-running agents.",
          "Operations confidence: deploys do not lose user work."
        ],
        "liabilities": [
          "Checkpoint storage cost.",
          "Resumed runs may see drifted external state.",
          "Deterministic-replay requires the workflow code to be deterministic; non-deterministic code in the agent path corrupts on resume.",
          "Tools that don't accept an idempotency key cannot be safely resumed."
        ]
      },
      "constrains": "Agent state must be serialisable; non-serialisable in-memory references are forbidden in long-running paths.",
      "known_uses": [
        {
          "system": "Devin sessions",
          "status": "available",
          "url": "https://devin.ai/"
        },
        {
          "system": "Manus tasks",
          "status": "available",
          "url": "https://manus.im/"
        },
        {
          "system": "OpenAI Agents SDK durable execution",
          "status": "available",
          "url": "https://openai.github.io/openai-agents-python/"
        },
        {
          "system": "Temporal-backed agents",
          "status": "available",
          "url": "https://temporal.io/"
        },
        {
          "system": "Inngest agents",
          "status": "available",
          "url": "https://www.inngest.com/docs"
        },
        {
          "system": "LangGraph Cloud checkpointing",
          "status": "available",
          "url": "https://langchain-ai.github.io/langgraph/concepts/persistence/"
        }
      ],
      "related": [
        {
          "pattern": "scheduled-agent",
          "relation": "complements"
        },
        {
          "pattern": "event-driven-agent",
          "relation": "complements"
        },
        {
          "pattern": "short-term-memory",
          "relation": "uses"
        },
        {
          "pattern": "todo-list-driven-agent",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Temporal: Durable execution",
          "url": "https://docs.temporal.io"
        },
        {
          "type": "doc",
          "title": "Inngest: AgentKit durable agents",
          "url": "https://www.inngest.com/docs"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "durability",
        "long-running",
        "state"
      ]
    },
    {
      "id": "attention-manipulation-explainability",
      "name": "Attention-Manipulation Explainability",
      "aliases": [
        "AtMan",
        "Attention Perturbation Attribution",
        "Token-Influence Map"
      ],
      "category": "governance-observability",
      "intent": "Surface which input tokens caused a given output by perturbing attention across all transformer layers and measuring the resulting change in output probability, producing a per-token relevance map alongside the model's response.",
      "context": "Regulated or high-stakes deployments where the operator must explain to an auditor or end-user *why* the model produced a specific output; pure post-hoc rationalisations are considered insufficient.",
      "problem": "Free-text follow-up prompts asking the model to justify itself produce plausible but unfaithful explanations; the model confabulates a reason that does not match its actual decision.",
      "forces": [
        "Auditors want input-grounded explanations, not generated rationales.",
        "Per-token attribution must be cheap enough to run in production, not only offline.",
        "Faithfulness of the explanation matters more than its readability.",
        "Vendor-side method may be incompatible with hosted black-box APIs."
      ],
      "solution": "Run a structured perturbation pass over the model's attention: for each input token (or chunk), suppress its attention contribution and measure the change in the output token probabilities. Tokens whose suppression most reduces the output probability are the most relevant. Surface this as a heat-map alongside the answer. Keep the attribution method on the inference side; avoid asking the model to self-explain in prose.",
      "structure": "Input -> Model -> Output. Parallel: Input -> perturb(token_i) -> ΔP(output) -> relevance_map.",
      "consequences": {
        "benefits": [
          "Faithful (mechanistic) attribution rather than confabulated rationale.",
          "Compatible with audit and right-to-explanation requirements.",
          "User-visible heat-maps build calibrated trust."
        ],
        "liabilities": [
          "Requires white-box access to attention; not available for hosted black-box APIs.",
          "Compute overhead per request (one forward pass per token group).",
          "Token-level attribution can mislead when reasoning spans many tokens."
        ]
      },
      "constrains": "The agent may not present generated text as the explanation of its own output when an attribution-based explanation is feasible; self-explanations have to be marked as such.",
      "known_uses": [
        {
          "system": "Aleph Alpha AtMan",
          "note": "Original attention-perturbation explainability method shipped in PhariaAI.",
          "status": "available",
          "url": "https://aleph-alpha.com/"
        }
      ],
      "related": [
        {
          "pattern": "decision-log",
          "relation": "complements"
        },
        {
          "pattern": "confidence-reporting",
          "relation": "complements"
        },
        {
          "pattern": "lineage-tracking",
          "relation": "complements"
        },
        {
          "pattern": "citation-streaming",
          "relation": "alternative-to",
          "note": "Citations attribute to retrieved docs; AtMan attributes to input tokens."
        },
        {
          "pattern": "model-card",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "AtMan: Understanding Transformer Predictions Through Memory Efficient Attention Manipulation",
          "year": 2023,
          "url": "https://arxiv.org/abs/2301.08110"
        }
      ],
      "status_in_practice": "experimental",
      "tags": [
        "governance",
        "explainability",
        "germany-origin",
        "aleph-alpha"
      ]
    },
    {
      "id": "cost-observability",
      "name": "Cost Observability",
      "aliases": [
        "Token Telemetry",
        "Cost Dashboard"
      ],
      "category": "governance-observability",
      "intent": "Surface per-request, per-user, and per-feature cost and token consumption to operators in near-real-time.",
      "context": "Production agent products where cost surprises (a runaway feature, a bad routing decision) blow up bills before anyone notices.",
      "problem": "Without per-feature cost visibility, expensive failures hide in aggregate metrics until billing arrives.",
      "forces": [
        "Telemetry schema must capture which feature, which model, which user.",
        "Real-time vs daily aggregation.",
        "Privacy on per-user attribution."
      ],
      "solution": "Tag every model and tool call with feature, route, user (anonymised), and model id. Stream to a telemetry store. Build dashboards by feature, by model, by tier, by hour. Set alerts on anomalies. Pair with cost-gating for prevention.",
      "consequences": {
        "benefits": [
          "Fast detection of cost regressions.",
          "Inputs for capacity planning and pricing."
        ],
        "liabilities": [
          "Telemetry overhead.",
          "Per-user attribution has privacy implications."
        ]
      },
      "constrains": "Calls without telemetry tags fall into an 'unattributed' bucket; some internal gateways enforce tag-or-reject.",
      "known_uses": [
        {
          "system": "Langfuse",
          "status": "available"
        },
        {
          "system": "Helicone",
          "status": "available"
        },
        {
          "system": "OpenAI usage dashboard",
          "status": "available"
        },
        {
          "system": "Anthropic Console usage",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "cost-gating",
          "relation": "complements"
        },
        {
          "pattern": "lineage-tracking",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Langfuse",
          "url": "https://langfuse.com/docs"
        },
        {
          "type": "doc",
          "title": "Helicone",
          "url": "https://docs.helicone.ai"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "observability",
        "cost",
        "telemetry"
      ]
    },
    {
      "id": "decision-log",
      "name": "Decision Log",
      "aliases": [
        "Reasoning Trace",
        "Thought Trace"
      ],
      "category": "governance-observability",
      "intent": "Persist the agent's reasoning trace alongside its actions so post-hoc review can explain why.",
      "context": "Audit and debugging both need access to what the agent considered, not just what it did.",
      "problem": "Action-only logs answer 'what' but not 'why'; debugging a wrong action requires the reasoning chain.",
      "forces": [
        "Reasoning traces are large.",
        "Sensitive content in reasoning may need redaction.",
        "Trace fidelity vs cost: full chain-of-thought, key decisions, summary?"
      ],
      "solution": "Persist reasoning at a chosen granularity (full trace, key decisions, or summary). Link each action in the provenance ledger to its trace. Indexed by request id and time for retrieval.",
      "consequences": {
        "benefits": [
          "Debugging speed jumps; you see the why immediately.",
          "User-facing explanations become possible."
        ],
        "liabilities": [
          "Storage and privacy implications.",
          "Trace tampering (the agent rewriting its trace) defeats the purpose; append-only is needed."
        ]
      },
      "constrains": "Action records cannot be written without a corresponding decision-log entry.",
      "known_uses": [
        {
          "system": "Sparrot",
          "note": "Thought stream + ledger linkage.",
          "status": "available"
        },
        {
          "system": "Langfuse / LangSmith trace stores",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "provenance-ledger",
          "relation": "generalises"
        },
        {
          "pattern": "append-only-thought-stream",
          "relation": "uses"
        },
        {
          "pattern": "black-box-opaqueness",
          "relation": "alternative-to"
        },
        {
          "pattern": "replay-time-travel",
          "relation": "used-by"
        },
        {
          "pattern": "agent-as-judge",
          "relation": "used-by"
        },
        {
          "pattern": "attention-manipulation-explainability",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Langfuse docs",
          "url": "https://langfuse.com/docs"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "observability",
        "trace",
        "debug"
      ],
      "example_scenario": "A trading agent decided to short a position at 14:32. At 16:00, the trade lost money. The decision log shows: at 14:32 the agent considered three signals (RSI was low, volume spiked, news sentiment was negative), weighted them, and chose short. The human reviewer can now ask 'was the weighting wrong?' instead of 'what was the agent thinking?'",
      "variants": [
        {
          "name": "Append-only event log",
          "summary": "Every decision is appended to an immutable log. Operators cannot edit prior entries; corrections become new entries.",
          "distinguishing_factor": "immutable append",
          "when_to_use": "Default. Audit and provenance require it.",
          "see_also": "append-only-thought-stream"
        },
        {
          "name": "Structured trace (OpenTelemetry)",
          "summary": "Decisions are emitted as OTel spans alongside tool calls and LLM calls; one trace per agent run.",
          "distinguishing_factor": "ops-tooling integration",
          "when_to_use": "The team already runs an OTel-based observability stack (Honeycomb, Datadog, Tempo).",
          "see_also": "lineage-tracking"
        },
        {
          "name": "Reasoning-trace export",
          "summary": "The model's thinking content is captured per turn and exported alongside actions, so reviewers see why each tool call was made.",
          "distinguishing_factor": "captures model thoughts",
          "when_to_use": "Reasoning models (o1/Claude extended thinking) are in use and the trace is worth keeping."
        }
      ]
    },
    {
      "id": "eval-as-contract",
      "name": "Eval as Contract",
      "aliases": [
        "Test-Driven Agent",
        "Eval-Gated Release"
      ],
      "category": "governance-observability",
      "intent": "Treat the eval suite as the contract the agent must satisfy; releases ship only if evals pass.",
      "context": "Production agents where 'works on my prompt' is insufficient and stakeholders need a quality bar to depend on.",
      "problem": "Without a release gate tied to evals, quality regressions slip through; the eval suite becomes documentation rather than enforcement.",
      "forces": [
        "Contract authoring is up-front work.",
        "Eval-suite drift if not maintained.",
        "Calibration: which evals are blocking, which are advisory."
      ],
      "solution": "Define a tiered eval suite: blocking evals (must pass for release), advisory evals (tracked but not blocking). Wire blocking evals into CI. Block PRs and releases when blocking evals fail. Treat eval changes as architectural changes (review, signoff).",
      "consequences": {
        "benefits": [
          "Quality bar is enforced, not aspirational.",
          "Eval suite earns its seat by being load-bearing."
        ],
        "liabilities": [
          "Bad evals block legitimate releases.",
          "Calibration is empirical."
        ]
      },
      "constrains": "Releases are forbidden when blocking evals fail; bypassing requires explicit operator override.",
      "known_uses": [
        {
          "system": "AI-Standards Eval as Contract pattern",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "eval-harness",
          "relation": "specialises"
        },
        {
          "pattern": "shadow-canary",
          "relation": "complements"
        },
        {
          "pattern": "perma-beta",
          "relation": "conflicts-with"
        },
        {
          "pattern": "prompt-versioning",
          "relation": "used-by"
        },
        {
          "pattern": "automatic-workflow-search",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "repo",
          "title": "ai-standards/ai-design-patterns (Eval as Contract)",
          "url": "https://github.com/ai-standards/ai-design-patterns"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "eval",
        "release",
        "contract"
      ]
    },
    {
      "id": "eval-harness",
      "name": "Eval Harness",
      "aliases": [
        "Golden Dataset Suite",
        "Champion-Challenger",
        "Regression Suite"
      ],
      "category": "governance-observability",
      "intent": "Run a held-out dataset against agent versions to detect regressions and measure improvement.",
      "context": "Agents are non-deterministic and prompt-sensitive; without an eval harness, every change is a guess.",
      "problem": "A change that 'feels better' often isn't; without measurement, the system regresses silently.",
      "forces": [
        "Dataset construction is expensive and ages.",
        "Judging open-ended outputs needs a metric or judge.",
        "Champion-challenger is fairer but doubles cost."
      ],
      "solution": "Build a golden dataset of (input, expected output) pairs. Run candidate versions against the dataset; score each. Compare champion (current) against challenger (proposed). Promote on quality lift, blocked on regression. Re-run on every meaningful change.",
      "consequences": {
        "benefits": [
          "Quality becomes measurable, comparable, and trendable.",
          "Releases gain a quantitative gate."
        ],
        "liabilities": [
          "Dataset bias means high scores can hide real-world failures.",
          "LLM-as-judge has its own calibration cost."
        ]
      },
      "constrains": "Releases are blocked if the harness flags a regression beyond tolerance.",
      "known_uses": [
        {
          "system": "Bobbin (Stash2Go)",
          "note": "Eval harness flagged as the explicit next step; in beta because of this gap.",
          "status": "planned"
        },
        {
          "system": "Sparrot",
          "status": "planned"
        },
        {
          "system": "Ragas, DeepEval, Langfuse Evals",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "llm-as-judge",
          "relation": "uses"
        },
        {
          "pattern": "eval-as-contract",
          "relation": "generalises"
        },
        {
          "pattern": "shadow-canary",
          "relation": "complements"
        },
        {
          "pattern": "perma-beta",
          "relation": "alternative-to"
        },
        {
          "pattern": "dspy-signatures",
          "relation": "used-by"
        },
        {
          "pattern": "model-card",
          "relation": "complements"
        },
        {
          "pattern": "agent-as-judge",
          "relation": "used-by"
        },
        {
          "pattern": "automatic-workflow-search",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "repo",
          "title": "explodinggradients/ragas",
          "url": "https://github.com/explodinggradients/ragas"
        },
        {
          "type": "doc",
          "title": "Anthropic: Building Effective Agents (eval section)",
          "year": 2024,
          "url": "https://www.anthropic.com/engineering/building-effective-agents"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "eval",
        "regression",
        "harness"
      ]
    },
    {
      "id": "incident-response-runbook",
      "name": "Incident Response Runbook",
      "aliases": [
        "IR Runbook",
        "Agent Failure Playbook",
        "Agent Incident Procedure"
      ],
      "category": "governance-observability",
      "intent": "Maintain pre-written response procedures for agent failures (PII leak, tool exploit, mass false action) so detected incidents trigger known steps.",
      "context": "Production agents at organisations with safety controls (kill-switch, sandbox-escape-monitoring, provenance-ledger) that detect events nobody knows how to respond to.",
      "problem": "Without a runbook, detection produces alerts that wake the on-call but do not lead to coordinated containment, compensation, communication, or post-mortem.",
      "forces": [
        "Severity classification must be agreed upfront.",
        "Containment vs forensic preservation tension.",
        "Communication clocks for regulators (GDPR 72h, EU AI Act serious incident) constrain runbook latency."
      ],
      "solution": "Maintain a runbook covering: severity levels, on-call paths, containment steps (kill-switch invocation, traffic rerouting), forensic preservation (pin traces beyond normal retention), compensating actions, customer communication templates, regulator notification procedures, and post-mortem template. Tie alerts from kill-switch/sandbox-escape-monitoring/cost-observability to runbook entries.",
      "consequences": {
        "benefits": [
          "Detection produces coordinated response, not panic.",
          "Regulator timelines are met."
        ],
        "liabilities": [
          "Runbook drift: scenarios evolve faster than the doc.",
          "Runbook fatigue if drilled too rarely or too often."
        ]
      },
      "constrains": "Detected incidents must trigger a documented runbook step; ad-hoc response without runbook is a process failure to be flagged in post-mortem.",
      "known_uses": [
        {
          "system": "Standard SRE practice transferred to agent platforms",
          "status": "available"
        },
        {
          "system": "Frontier-lab safety teams",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "kill-switch",
          "relation": "complements"
        },
        {
          "pattern": "sandbox-escape-monitoring",
          "relation": "complements"
        },
        {
          "pattern": "provenance-ledger",
          "relation": "uses"
        },
        {
          "pattern": "compensating-action",
          "relation": "uses"
        }
      ],
      "references": [
        {
          "type": "book",
          "title": "Site Reliability Engineering (Google, ch. 14 Managing Incidents)",
          "year": 2016,
          "url": "https://sre.google/sre-book/managing-incidents/"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "governance",
        "incident-response",
        "safety"
      ]
    },
    {
      "id": "lineage-tracking",
      "name": "Lineage Tracking",
      "aliases": [
        "Data Lineage",
        "Prompt Versioning",
        "Artefact Provenance"
      ],
      "category": "governance-observability",
      "intent": "Track which prompt version, model version, and data sources produced each agent output.",
      "context": "Long-lived production agents where outputs are referenced months later and stakeholders ask 'which version produced this?'.",
      "problem": "Without lineage, output disputes are unanswerable; rolling back to a known-good state is guesswork.",
      "forces": [
        "Lineage metadata adds storage.",
        "Schema evolution of lineage is itself a problem.",
        "PII in lineage records (prompts contain user data)."
      ],
      "solution": "Tag every agent output with: prompt template hash, model id and version, tool versions, retrieved-document ids, decision-log id. Store in a queryable lineage store. Make lineage joinable to the output store.",
      "consequences": {
        "benefits": [
          "Output disputes are answerable.",
          "Targeted rollback becomes possible."
        ],
        "liabilities": [
          "Storage growth.",
          "Lineage schema must evolve carefully."
        ]
      },
      "constrains": "Outputs without lineage tags are not promoted to production storage.",
      "known_uses": [
        {
          "system": "Langfuse, LangSmith, Weights & Biases Prompts",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "provenance-ledger",
          "relation": "complements"
        },
        {
          "pattern": "model-card",
          "relation": "complements"
        },
        {
          "pattern": "cost-observability",
          "relation": "complements"
        },
        {
          "pattern": "replay-time-travel",
          "relation": "complements"
        },
        {
          "pattern": "black-box-opaqueness",
          "relation": "alternative-to"
        },
        {
          "pattern": "hidden-mode-switching",
          "relation": "alternative-to"
        },
        {
          "pattern": "prompt-versioning",
          "relation": "composes-with"
        },
        {
          "pattern": "sovereign-inference-stack",
          "relation": "used-by"
        },
        {
          "pattern": "attention-manipulation-explainability",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "spec",
          "title": "NIST AI Risk Management Framework",
          "year": 2023,
          "url": "https://www.nist.gov/itl/ai-risk-management-framework"
        }
      ],
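      "example": {
        "language": "python",
        "note": "Minimal sketch of the lineage tag attached to each output; the field names mirror the solution text, and the function name is a placeholder for whatever writes to the lineage store.",
        "code": [
          "import hashlib, time",
          "",
          "def lineage_record(output_id, prompt_template, model_id, tool_versions, doc_ids, decision_log_id):",
          "    # Hash the prompt template so the exact version is identifiable later.",
          "    prompt_hash = hashlib.sha256(prompt_template.encode()).hexdigest()[:16]",
          "    return {",
          "        'output_id': output_id,",
          "        'ts': time.time(),",
          "        'prompt_template_hash': prompt_hash,",
          "        'model_id': model_id,",
          "        'tool_versions': tool_versions,",
          "        'retrieved_doc_ids': doc_ids,",
          "        'decision_log_id': decision_log_id,",
          "    }"
        ]
      },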
      "status_in_practice": "mature",
      "tags": [
        "governance",
        "lineage",
        "versioning"
      ]
    },
    {
      "id": "llm-as-judge",
      "name": "LLM-as-Judge",
      "aliases": [
        "Model Grading",
        "Auto-Evaluator"
      ],
      "category": "governance-observability",
      "intent": "Use an LLM to score open-ended outputs against rubric criteria when no exact-match metric applies.",
      "context": "Free-form outputs (summaries, code, prose) defy exact-match scoring; human grading is too slow for CI.",
      "problem": "Without an automated grader, regression detection on free-form outputs requires human eyes on every run.",
      "forces": [
        "Judges have biases (length, position, model-family preference).",
        "Calibration against human judgement is its own dataset.",
        "Same-model judging is suspect when the candidate is from the same family."
      ],
      "solution": "Define a rubric. Prompt a judge model with the input, candidate output, and rubric. Receive a structured score plus rationale. Calibrate periodically against human-graded samples. Use a different model family for judge vs candidate where possible.",
      "consequences": {
        "benefits": [
          "Scales free-form evaluation.",
          "Rationales are debugging breadcrumbs."
        ],
        "liabilities": [
          "Judge biases skew scores in subtle ways.",
          "Cost: every eval is now N x judge calls."
        ]
      },
      "constrains": "Scores are advisory unless calibrated against human judgement at known intervals.",
      "known_uses": [
        {
          "system": "MT-Bench / AlpacaEval",
          "status": "available"
        },
        {
          "system": "Ragas / DeepEval / Langfuse",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "eval-harness",
          "relation": "used-by"
        },
        {
          "pattern": "evaluator-optimizer",
          "relation": "used-by"
        },
        {
          "pattern": "agent-as-judge",
          "relation": "generalises"
        },
        {
          "pattern": "shadow-canary",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
          "authors": "Zheng et al.",
          "year": 2023,
          "url": "https://arxiv.org/abs/2306.05685"
        }
      ],
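      "example": {
        "language": "python",
        "note": "Minimal sketch, assuming a call_model callable that returns the judge model's text; the rubric wording and JSON reply shape are illustrative.",
        "code": [
          "import json",
          "",
          "RUBRIC = ('Score 1-5 for factual accuracy and completeness. '",
          "          'Reply as JSON with keys score (int) and rationale (str).')",
          "",
          "def judge(call_model, task_input, candidate_output):",
          "    # Prefer a judge from a different model family than the candidate.",
          "    prompt = f'{RUBRIC}\\n\\nInput:\\n{task_input}\\n\\nCandidate output:\\n{candidate_output}'",
          "    verdict = json.loads(call_model(prompt))  # structured score plus rationale",
          "    return verdict['score'], verdict['rationale']"
        ]
      },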
      "status_in_practice": "mature",
      "tags": [
        "eval",
        "judge",
        "scoring"
      ]
    },
    {
      "id": "model-card",
      "name": "Model Card",
      "aliases": [
        "System Card"
      ],
      "category": "governance-observability",
      "intent": "Maintain a structured document describing the model/agent's intended use, limitations, evaluation results, and risks.",
      "context": "Regulated, high-stakes, or public-facing agents where stakeholders (legal, compliance, users) need to understand what the system does and where it fails.",
      "problem": "Agent capabilities and limitations are tribal knowledge; without a card, every stakeholder reinvents understanding.",
      "forces": [
        "Cards age fast in fast-moving systems.",
        "Truthful disclosure conflicts with marketing pressure.",
        "Card depth vs maintainability."
      ],
      "solution": "Maintain a markdown document at a known location with sections: intended use, out-of-scope use, training/data lineage, evaluation results, limitations, risks, contact. Versioned alongside the agent.",
      "consequences": {
        "benefits": [
          "Stakeholder alignment.",
          "Regulatory and audit defensibility."
        ],
        "liabilities": [
          "Maintenance burden.",
          "Card drift when not enforced in PRs."
        ]
      },
      "constrains": "Production releases require a current card; missing or stale cards block deployment.",
      "known_uses": [
        {
          "system": "Anthropic system cards",
          "status": "available"
        },
        {
          "system": "Google Model Cards",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "lineage-tracking",
          "relation": "complements"
        },
        {
          "pattern": "eval-harness",
          "relation": "complements"
        },
        {
          "pattern": "provenance-ledger",
          "relation": "complements"
        },
        {
          "pattern": "awareness",
          "relation": "generalises"
        },
        {
          "pattern": "hidden-mode-switching",
          "relation": "alternative-to"
        },
        {
          "pattern": "sovereign-inference-stack",
          "relation": "complements"
        },
        {
          "pattern": "attention-manipulation-explainability",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Model Cards for Model Reporting",
          "authors": "Mitchell, Wu, Zaldivar, Barnes, Vasserman, Hutchinson, Spitzer, Raji, Gebru",
          "year": 2018,
          "url": "https://arxiv.org/abs/1810.03993"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "governance",
        "documentation"
      ]
    },
    {
      "id": "prompt-versioning",
      "name": "Prompt Versioning",
      "aliases": [
        "Prompt-as-Artifact",
        "Prompt Registry",
        "Versioned Prompts"
      ],
      "category": "governance-observability",
      "intent": "Treat prompts as immutable, hashed, semver'd artefacts in a registry; deploy and roll back like code.",
      "context": "Production agents where prompt changes drive quality changes; ad-hoc prompt edits introduce silent regressions.",
      "problem": "Prompts edited inline in code are hard to audit; rolling back a prompt means rolling back a deployment; comparison across versions is bespoke.",
      "forces": [
        "Registry adds infrastructure.",
        "Prompt versioning must integrate with eval harness.",
        "Signed prompts vs editable prompts."
      ],
      "solution": "Prompts live in a registry as immutable, hashed, version-tagged artefacts. Code references prompts by name + version (semver). Deployments pin specific versions; rollback by version. Eval harness ties metric outcomes to prompt versions. Optionally signed for provenance.",
      "consequences": {
        "benefits": [
          "Prompt rollback without redeploy.",
          "Eval results map to specific prompts."
        ],
        "liabilities": [
          "Registry infrastructure.",
          "Version-pinning means prompts stop tracking model upgrades automatically."
        ]
      },
      "constrains": "Production calls reference pinned prompt versions only; ad-hoc inline prompts are forbidden.",
      "known_uses": [
        {
          "system": "LangSmith Prompts",
          "status": "available"
        },
        {
          "system": "PromptLayer",
          "status": "available"
        },
        {
          "system": "Humanloop",
          "status": "available"
        },
        {
          "system": "Vellum",
          "status": "available"
        },
        {
          "system": "Helicone Prompts",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "lineage-tracking",
          "relation": "composes-with"
        },
        {
          "pattern": "eval-as-contract",
          "relation": "uses"
        },
        {
          "pattern": "shadow-canary",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "LangSmith Prompts",
          "url": "https://docs.smith.langchain.com/prompt_engineering/concepts"
        },
        {
          "type": "doc",
          "title": "PromptLayer",
          "url": "https://docs.promptlayer.com"
        },
        {
          "type": "doc",
          "title": "Humanloop",
          "url": "https://humanloop.com"
        }
      ],
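      "example": {
        "language": "python",
        "note": "Minimal in-memory sketch of the registry contract (publish once, fetch by pinned name + semver); the class and prompt names are illustrative, not the LangSmith or PromptLayer API.",
        "code": [
          "import hashlib",
          "",
          "class PromptRegistry:",
          "    def __init__(self):",
          "        self._store = {}",
          "",
          "    def publish(self, name, version, text):",
          "        # Versions are immutable; changing a prompt means bumping the semver.",
          "        if (name, version) in self._store:",
          "            raise ValueError('version already published; bump the semver')",
          "        digest = hashlib.sha256(text.encode()).hexdigest()",
          "        self._store[(name, version)] = {'text': text, 'hash': digest}",
          "",
          "    def get(self, name, version):",
          "        # Deployments pin name + exact version; rollback is a version change.",
          "        return self._store[(name, version)]"
        ]
      },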
      "status_in_practice": "mature",
      "tags": [
        "governance",
        "prompt",
        "versioning"
      ]
    },
    {
      "id": "provenance-ledger",
      "name": "Provenance Ledger",
      "aliases": [
        "Audit Trail",
        "Action Log"
      ],
      "category": "governance-observability",
      "intent": "Log every agent decision and state change with enough metadata to explain or reverse it later.",
      "context": "Long-running or regulated agents need to answer 'why did the agent do X on day Y?' months later.",
      "problem": "Without provenance, agent behaviour is post-hoc inscrutable; audit and rollback become impossible.",
      "forces": [
        "Auditability vs storage cost of every event.",
        "Schema rigidity vs evolvability over the agent's lifetime.",
        "PII in events: redaction at write time vs read time."
      ],
      "solution": "Append events to an immutable log with: timestamp, actor, action, target, justification (link to thought or decision), diff hash. Enable rollback by id. Reject events that lack the required fields.",
      "consequences": {
        "benefits": [
          "Audit and rollback become tractable.",
          "Pattern of failures becomes visible across time."
        ],
        "liabilities": [
          "Log volume can dominate other storage.",
          "Justification fields require the agent to write them; lazy agents skip."
        ]
      },
      "constrains": "Self-edits and other recorded actions are rejected if they lack a valid justification reference.",
      "known_uses": [
        {
          "system": "Sparrot",
          "note": "ledger.jsonl with {ts, file, diff_hash, thought_id, justification}; orphan writes rejected.",
          "status": "available"
        },
        {
          "system": "Langfuse traces",
          "status": "available"
        },
        {
          "system": "OpenTelemetry GenAI semantic conventions",
          "status": "available"
        },
        {
          "system": "Datadog LLM Observability",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "append-only-thought-stream",
          "relation": "composes-with"
        },
        {
          "pattern": "decision-log",
          "relation": "specialises"
        },
        {
          "pattern": "compensating-action",
          "relation": "used-by"
        },
        {
          "pattern": "lineage-tracking",
          "relation": "complements"
        },
        {
          "pattern": "model-card",
          "relation": "complements"
        },
        {
          "pattern": "black-box-opaqueness",
          "relation": "alternative-to"
        },
        {
          "pattern": "sandbox-escape-monitoring",
          "relation": "used-by"
        },
        {
          "pattern": "incident-response-runbook",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "OpenTelemetry GenAI semantic conventions",
          "year": 2024,
          "url": "https://opentelemetry.io/docs/specs/semconv/gen-ai/"
        }
      ],
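      "example": {
        "language": "python",
        "note": "Minimal append-only JSONL sketch mirroring the event fields in the solution; the REQUIRED set and append_event name are illustrative, not the Sparrot implementation.",
        "code": [
          "import json, time",
          "",
          "REQUIRED = {'actor', 'action', 'target', 'justification', 'diff_hash'}",
          "",
          "def append_event(path, event):",
          "    # Reject events missing required fields; never rewrite past lines.",
          "    missing = REQUIRED - event.keys()",
          "    if missing:",
          "        raise ValueError(f'rejected event, missing fields: {sorted(missing)}')",
          "    event = {'ts': time.time(), **event}",
          "    with open(path, 'a', encoding='utf-8') as fh:  # append-only",
          "        fh.write(json.dumps(event) + '\\n')"
        ]
      },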
      "status_in_practice": "mature",
      "tags": [
        "audit",
        "provenance",
        "rollback"
      ]
    },
    {
      "id": "replay-time-travel",
      "name": "Replay / Time-Travel",
      "aliases": [
        "Trace Replay",
        "Run Branching",
        "Fork from Step N"
      ],
      "category": "governance-observability",
      "intent": "Re-run a past agent trace from any step with modified inputs/prompts/tools to debug or branch.",
      "context": "Agent debugging and experimentation; production incidents that need reproduction.",
      "problem": "Agent runs are non-deterministic and state-laden; without replay, debugging is reproduction-by-prayer.",
      "forces": [
        "Captured state must be complete enough to re-run.",
        "Storage of full traces is expensive.",
        "Modified replays diverge from original; comparison logic is non-trivial."
      ],
      "solution": "Capture per-step inputs, outputs, prompts, model id, tool calls. Provide a replay tool that loads a trace at step N and re-runs forward with optional modifications (different model, different prompt, different tool result). Store branches for comparison.",
      "consequences": {
        "benefits": [
          "Debugging cycle drops from hours to minutes.",
          "A/B comparison of fixes becomes trivial."
        ],
        "liabilities": [
          "Trace storage overhead.",
          "Non-deterministic external dependencies (network) limit fidelity."
        ]
      },
      "constrains": "Replay reads from captured state; live model and tool calls happen only for the modified branch from step N forward.",
      "known_uses": [
        {
          "system": "LangSmith replay",
          "status": "available"
        },
        {
          "system": "Langfuse playground replay",
          "status": "available"
        },
        {
          "system": "Inspect AI",
          "status": "available"
        },
        {
          "system": "Claude Code conversation rewind",
          "status": "available",
          "note": "Transcript-level rewind, not full trace replay."
        },
        {
          "system": "Braintrust playground",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "decision-log",
          "relation": "uses"
        },
        {
          "pattern": "lineage-tracking",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "LangSmith: Replay",
          "url": "https://docs.smith.langchain.com/observability/how_to_guides/replay"
        }
      ],
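      "example": {
        "language": "python",
        "note": "Minimal sketch of forking a captured trace at step N: earlier steps are read back from the capture, later steps are re-run live with optional overrides. The step shape and the run_step callable are assumptions, not any vendor's replay API.",
        "code": [
          "def replay(trace, from_step, run_step, overrides=None):",
          "    # trace: list of captured steps (inputs, prompt, model_id, tool calls, output).",
          "    overrides = overrides or {}",
          "    branch = list(trace[:from_step])          # replayed verbatim from capture",
          "    state = dict(trace[from_step]['inputs'])",
          "    state.update(overrides)                   # e.g. a different model or prompt",
          "    for i in range(from_step, len(trace)):",
          "        step = run_step(state, trace[i])      # live call for the modified branch",
          "        branch.append(step)",
          "        state = step['inputs_for_next']",
          "    return branch                             # store for comparison with the original"
        ]
      },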
      "status_in_practice": "mature",
      "tags": [
        "debug",
        "replay",
        "observability"
      ]
    },
    {
      "id": "sandbox-escape-monitoring",
      "name": "Sandbox Escape Monitoring",
      "aliases": [
        "Sandbox Telemetry",
        "Boundary Violation Alerts"
      ],
      "category": "governance-observability",
      "intent": "Treat sandbox boundary violations as telemetry; alert on syscalls, network egress, or filesystem writes outside expected scope.",
      "context": "Agents executing code or operating filesystems where sandbox-isolation is in place; sandboxes have known escape vulnerabilities.",
      "problem": "Sandbox-isolation is preventive only; without monitoring, a successful escape (or even an attempt) is invisible until damage is done.",
      "forces": [
        "Telemetry granularity vs cost.",
        "False positives on legitimate boundary-pushing operations.",
        "Egress patterns evolve faster than allowlists."
      ],
      "solution": "Instrument the sandbox: log every syscall outside the allowed set, every network egress not on the allowlist, every filesystem write outside the working directory. Stream to safety telemetry. Alert on threshold breaches. Pair with kill-switch for automatic halt on confirmed escape.",
      "consequences": {
        "benefits": [
          "Detection of escape attempts and successes.",
          "Forensic trail when incidents occur."
        ],
        "liabilities": [
          "Telemetry volume.",
          "Alert fatigue if thresholds are mis-tuned."
        ]
      },
      "constrains": "Sandbox events outside the allowed set must be logged and inspectable; silent boundary violations are forbidden.",
      "known_uses": [
        {
          "system": "Production code-execution platforms (E2B, Modal sandbox monitoring)",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "sandbox-isolation",
          "relation": "complements"
        },
        {
          "pattern": "kill-switch",
          "relation": "composes-with"
        },
        {
          "pattern": "provenance-ledger",
          "relation": "uses"
        },
        {
          "pattern": "incident-response-runbook",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "spec",
          "title": "OWASP Top 10 for LLM Applications",
          "year": 2025,
          "url": "https://owasp.org/www-project-top-10-for-large-language-model-applications/"
        }
      ],
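      "example": {
        "language": "python",
        "note": "Application-level sketch of the boundary checks only; real deployments instrument at the syscall and network layer. The working directory, allowlist, and alert callable are placeholders.",
        "code": [
          "import pathlib",
          "",
          "WORKDIR = pathlib.Path('/sandbox/work').resolve()",
          "EGRESS_ALLOWLIST = {'api.example.com'}",
          "",
          "def check_write(path, alert):",
          "    # Any write that resolves outside the working directory is telemetry.",
          "    p = pathlib.Path(path).resolve()",
          "    if WORKDIR not in p.parents and p != WORKDIR:",
          "        alert({'kind': 'fs_write_outside_workdir', 'path': str(p)})",
          "",
          "def check_egress(host, alert):",
          "    if host not in EGRESS_ALLOWLIST:",
          "        alert({'kind': 'egress_not_on_allowlist', 'host': host})"
        ]
      },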
      "status_in_practice": "emerging",
      "tags": [
        "safety",
        "sandbox",
        "monitoring"
      ]
    },
    {
      "id": "shadow-canary",
      "name": "Shadow Canary",
      "aliases": [
        "Shadow Agent",
        "Canary Deployment"
      ],
      "category": "governance-observability",
      "intent": "Run a candidate agent version in shadow alongside the champion, comparing outputs without affecting users.",
      "context": "Agent changes are non-deterministic and prompt-sensitive; CI tests cover correctness, not field behaviour.",
      "problem": "Releases without field comparison miss regressions visible only on real traffic.",
      "forces": [
        "Shadow runs cost money for output never shown.",
        "Comparison logic for free-form outputs is non-trivial.",
        "Shadow latency must not affect the user-visible path."
      ],
      "solution": "Route a fraction of real traffic through both champion and challenger. Champion's output reaches the user. Challenger's output is logged. Diff the outputs on agreed metrics (judge model, exact match on tool calls, latency, cost). Promote on lift; revert on regression.",
      "consequences": {
        "benefits": [
          "Field-quality regression detection.",
          "Confidence to roll out non-deterministic changes."
        ],
        "liabilities": [
          "2x cost during shadow window.",
          "Diff-noise on free-form outputs is hard to attribute."
        ]
      },
      "constrains": "Challenger output is not user-visible during shadow; only logging.",
      "known_uses": [
        {
          "system": "Standard practice in ML/agent platforms",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "eval-harness",
          "relation": "complements"
        },
        {
          "pattern": "llm-as-judge",
          "relation": "uses"
        },
        {
          "pattern": "perma-beta",
          "relation": "alternative-to"
        },
        {
          "pattern": "eval-as-contract",
          "relation": "complements"
        },
        {
          "pattern": "prompt-versioning",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "book",
          "title": "Site Reliability Engineering: Release Engineering",
          "authors": "Google SRE",
          "year": 2016,
          "url": "https://sre.google/sre-book/release-engineering/"
        }
      ],
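      "example": {
        "language": "python",
        "note": "Minimal sketch of the shadow split: the champion's output is returned to the user, the challenger's output is only logged for offline comparison. All callables are placeholders; in production the challenger runs asynchronously so it cannot add user-visible latency.",
        "code": [
          "import random",
          "",
          "def handle(request, champion, challenger, log_shadow, shadow_rate=0.1):",
          "    answer = champion(request)                 # user-visible path",
          "    if random.random() < shadow_rate:",
          "        shadow = challenger(request)           # never shown to the user",
          "        log_shadow({'request': request, 'champion': answer, 'challenger': shadow})",
          "    return answer"
        ]
      },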
      "status_in_practice": "mature",
      "tags": [
        "governance",
        "shadow",
        "release"
      ]
    },
    {
      "id": "append-only-thought-stream",
      "name": "Append-Only Thought Stream",
      "aliases": [
        "Event-Sourced Memory",
        "Immutable Journal"
      ],
      "category": "memory",
      "intent": "Make the agent's thought log append-only so the agent cannot rewrite its own history.",
      "context": "Self-modifying or long-running agents could in principle revise their own past; doing so undermines audit and learning.",
      "problem": "If the agent can edit its own history, every later inference is conditioned on a possibly-rewritten past.",
      "forces": [
        "Append-only stores grow without bound.",
        "Strict immutability conflicts with redaction (PII, mistakes).",
        "Compaction must respect append-only at the underlying log layer."
      ],
      "solution": "Thoughts and journal entries are written to files or a log the agent has no permission to delete or modify. Compaction creates new summary files at higher tiers without touching originals. Redaction goes through an explicit operator path, not the agent.",
      "consequences": {
        "benefits": [
          "Provenance and audit are tractable.",
          "Reasoning over the past is deterministic across runs."
        ],
        "liabilities": [
          "Storage growth.",
          "Operator burden when redactions are needed."
        ]
      },
      "constrains": "The agent has read access only to thoughts/ and journal/; writes go through an append-only API enforced at the tool layer.",
      "known_uses": [
        {
          "system": "Sparrot",
          "note": "thoughts/ and journal/ are read-only at the agent's tool layer.",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "provenance-ledger",
          "relation": "composes-with"
        },
        {
          "pattern": "five-tier-memory-cascade",
          "relation": "composes-with"
        },
        {
          "pattern": "decision-log",
          "relation": "used-by"
        },
        {
          "pattern": "blackboard",
          "relation": "complements"
        },
        {
          "pattern": "todo-list-driven-agent",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "book",
          "title": "Designing Data-Intensive Applications (event sourcing)",
          "authors": "Martin Kleppmann",
          "year": 2017,
          "url": "https://dataintensive.net/"
        }
      ],
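      "example": {
        "language": "python",
        "note": "Minimal sketch of an append-only journal enforced at the tool layer: the agent only sees append and read, never delete or rewrite. Paths and function names are illustrative, not the Sparrot layout.",
        "code": [
          "import pathlib, time",
          "",
          "JOURNAL = pathlib.Path('journal')",
          "",
          "def append_thought(text):",
          "    JOURNAL.mkdir(exist_ok=True)",
          "    entry = JOURNAL / f'{int(time.time() * 1000)}.md'",
          "    entry.write_text(text)            # one new file per entry; never overwritten",
          "",
          "def read_thoughts():",
          "    return [p.read_text() for p in sorted(JOURNAL.glob('*.md'))]"
        ]
      },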
      "status_in_practice": "emerging",
      "tags": [
        "memory",
        "append-only",
        "provenance"
      ]
    },
    {
      "id": "awareness",
      "name": "Awareness",
      "aliases": [
        "Situational Awareness",
        "Capability Self-Knowledge"
      ],
      "category": "memory",
      "intent": "Maintain the agent's explicit knowledge of its own tools, capabilities, environment, and current context as queryable state.",
      "context": "Agents that operate over time and across capabilities need to know what they can do and where they are; without explicit awareness, capability is implicit in prompts.",
      "problem": "Agents that do not know their own capabilities either over-promise (hallucinate tools) or under-deliver (forget tools they have).",
      "forces": [
        "Awareness state grows with capability.",
        "Stale awareness misleads.",
        "Self-description is itself a prompt-engineering effort."
      ],
      "solution": "Persist explicit state about: available tools (with descriptions), the environment (what host, what user, what permissions), the current task, and the agent's own identity. Refresh on capability changes. Inject relevant slices of awareness into each turn's context.",
      "consequences": {
        "benefits": [
          "Reduces hallucinated tool calls.",
          "Grounds the agent in its own context."
        ],
        "liabilities": [
          "Awareness state is a maintenance burden.",
          "Excess awareness wastes context tokens."
        ]
      },
      "constrains": "Tool calls and self-references must match the awareness state; mismatches are flagged.",
      "known_uses": [
        {
          "system": "Sparrot world.md + personality.md",
          "status": "available"
        },
        {
          "system": "Avramovic Awareness pattern",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "tool-use",
          "relation": "complements"
        },
        {
          "pattern": "model-card",
          "relation": "specialises"
        },
        {
          "pattern": "tool-discovery",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "repo",
          "title": "zeljkoavramovic/agentic-design-patterns",
          "url": "https://github.com/zeljkoavramovic/agentic-design-patterns"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "awareness",
        "state",
        "self-model"
      ]
    },
    {
      "id": "context-window-packing",
      "name": "Context Window Packing",
      "aliases": [
        "Context Compression",
        "Token Budget Management",
        "Fit in Context",
        "Token Cost Reduction"
      ],
      "category": "memory",
      "intent": "Choose what fits in the context window each turn given a fixed token budget.",
      "context": "Agents whose available context (system prompt + history + retrieved chunks + tools + state) exceeds the model's window.",
      "problem": "Naive concatenation overflows; naive truncation loses critical state.",
      "forces": [
        "What to drop is task-dependent.",
        "Compression has its own LLM cost.",
        "Reserved budget for the response itself."
      ],
      "solution": "Define a packing policy. Reserve N tokens for system + tools + response. Allocate the rest across history (compressed), retrieved chunks (top-k after rerank), and current state. Use eviction (drop oldest), summarisation (compress), or selection (relevance-rank) policies. Audit token counts before each call.",
      "consequences": {
        "benefits": [
          "Predictable behaviour at the window edge.",
          "Inspectable trade-offs."
        ],
        "liabilities": [
          "Complexity of the packing logic.",
          "Compression artefacts."
        ]
      },
      "constrains": "Total tokens passed to the model must not exceed the window minus the reserved response budget.",
      "known_uses": [
        {
          "system": "LangChain ConversationSummaryBufferMemory",
          "status": "available",
          "url": "https://python.langchain.com/api_reference/langchain/memory/langchain.memory.summary_buffer.ConversationSummaryBufferMemory.html"
        },
        {
          "system": "Most production agent frameworks",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "episodic-summaries",
          "relation": "uses"
        },
        {
          "pattern": "memgpt-paging",
          "relation": "alternative-to"
        },
        {
          "pattern": "dynamic-scaffolding",
          "relation": "complements"
        },
        {
          "pattern": "todo-list-driven-agent",
          "relation": "used-by"
        },
        {
          "pattern": "reasoning-trace-carry-forward",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Lost in the Middle: How Language Models Use Long Contexts",
          "authors": "Liu, Lin, Hewitt, Paranjape, Bevilacqua, Petroni, Liang",
          "year": 2023,
          "url": "https://arxiv.org/abs/2307.03172"
        }
      ],
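      "example": {
        "language": "python",
        "note": "Minimal eviction-style packing sketch; the window and reserve numbers are arbitrary and the count stand-in (character length here) would be a real tokenizer in practice.",
        "code": [
          "def pack(system, tools, history, chunks, window=128000, reserve=4096, count=len):",
          "    budget = window - reserve - count(system) - count(tools)",
          "    packed_history = []",
          "    for msg in reversed(history):      # keep the most recent turns",
          "        if count(msg) > budget:",
          "            break",
          "        packed_history.insert(0, msg)",
          "        budget -= count(msg)",
          "    packed_chunks = []",
          "    for chunk in chunks:               # assumes chunks are already relevance-ranked",
          "        if count(chunk) > budget:",
          "            break",
          "        packed_chunks.append(chunk)",
          "        budget -= count(chunk)",
          "    return packed_history, packed_chunks"
        ]
      },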
      "status_in_practice": "mature",
      "tags": [
        "context",
        "tokens",
        "budget"
      ]
    },
    {
      "id": "cross-session-memory",
      "name": "Cross-Session Memory",
      "aliases": [
        "Persistent User Memory",
        "Long-Lived User Profile",
        "Beat Agent Amnesia",
        "No-Forget Memory",
        "Agent Forgets Between Sessions",
        "Session-to-Session Memory"
      ],
      "category": "memory",
      "intent": "Persist user-specific facts, preferences, and prior context across all sessions, threads, and devices.",
      "context": "User-facing assistants where users expect continuity beyond a single conversation.",
      "problem": "Per-thread memory loses everything between sessions; users repeat themselves; the assistant feels amnesic.",
      "forces": [
        "What to remember vs forget; user agency.",
        "Privacy, deletion, portability requirements.",
        "Cost of always-on memory loading per turn."
      ],
      "solution": "Maintain a per-user store of distilled facts (preferences, prior context, names, projects). Load relevant slices into each session's context. Provide explicit add/forget tools. Audit and surface memory entries to the user. Deletion controls and a user-visible memory inspector (delete / disable / export) satisfy regulatory and trust requirements.",
      "consequences": {
        "benefits": [
          "Continuity across sessions and devices.",
          "Compounding usefulness over time."
        ],
        "liabilities": [
          "Privacy obligations.",
          "Memory hallucinations are stickier than chat hallucinations."
        ]
      },
      "constrains": "Memory entries must be added through declared tools; the model cannot silently mutate persistent user state.",
      "known_uses": [
        {
          "system": "ChatGPT Memory",
          "status": "available"
        },
        {
          "system": "Claude Projects + memory",
          "status": "available"
        },
        {
          "system": "Letta",
          "status": "available",
          "url": "https://docs.letta.com/"
        },
        {
          "system": "Lindy memory",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "short-term-memory",
          "relation": "complements"
        },
        {
          "pattern": "memgpt-paging",
          "relation": "alternative-to"
        },
        {
          "pattern": "session-isolation",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "OpenAI: Memory and new controls for ChatGPT",
          "year": 2024,
          "url": "https://openai.com/index/memory-and-new-controls-for-chatgpt/"
        }
      ],
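      "example": {
        "language": "python",
        "note": "Minimal sketch of the profile-facts variant: one JSON file per user with explicit remember/forget tools so the model never mutates the store directly; paths and names are illustrative.",
        "code": [
          "import json, pathlib",
          "",
          "def _path(user_id):",
          "    return pathlib.Path('memory') / f'{user_id}.json'",
          "",
          "def remember(user_id, key, value):",
          "    path = _path(user_id)",
          "    facts = json.loads(path.read_text()) if path.exists() else {}",
          "    facts[key] = value",
          "    path.parent.mkdir(exist_ok=True)",
          "    path.write_text(json.dumps(facts, indent=2))",
          "",
          "def forget(user_id, key):",
          "    path = _path(user_id)",
          "    if path.exists():",
          "        facts = json.loads(path.read_text())",
          "        facts.pop(key, None)",
          "        path.write_text(json.dumps(facts, indent=2))"
        ]
      },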
      "status_in_practice": "mature",
      "tags": [
        "memory",
        "persistence",
        "user"
      ],
      "variants": [
        {
          "name": "Profile facts (key-value)",
          "summary": "Distil each session into a small set of stable user facts (preferences, goals, constraints). Inject the facts into every new session's system prompt.",
          "distinguishing_factor": "structured key-value distillation",
          "when_to_use": "User-facing assistants where personalisation is the point and facts are short and stable."
        },
        {
          "name": "Per-session narrative summary",
          "summary": "Each session is summarised into a paragraph; recent summaries are retrieved and prepended to the new session's context.",
          "distinguishing_factor": "narrative summaries",
          "when_to_use": "Conversational continuity matters more than fact extraction; users expect the agent to 'remember the conversation'.",
          "see_also": "episodic-summaries"
        },
        {
          "name": "Embedded session retrieval",
          "summary": "Past sessions are embedded and stored in a vector index; each new turn retrieves relevant past content by similarity.",
          "distinguishing_factor": "retrieval-time relevance",
          "when_to_use": "Memory is large and only a small fraction is relevant to any one new turn."
        },
        {
          "name": "Tiered cascade",
          "summary": "Combines profile facts, episodic summaries, and vector memory at different tiers; each tier retrieves at its own rate.",
          "distinguishing_factor": "multi-tier composition",
          "when_to_use": "Long-running personalised agents where no single memory shape is enough.",
          "see_also": "five-tier-memory-cascade"
        }
      ]
    },
    {
      "id": "episodic-summaries",
      "name": "Episodic Summaries",
      "aliases": [
        "Compaction",
        "Conversation Summarisation",
        "Chunk Summaries",
        "Reduce Token Cost",
        "Shrink Context",
        "Cuts Token Use",
        "Too Many Tokens Reduction"
      ],
      "category": "memory",
      "intent": "Compress past episodes into summaries that preserve gist while shedding token cost.",
      "context": "A long-running agent has more history than fits the context window; raw replay is impractical.",
      "problem": "Without compaction, either the context grows unboundedly or important facts fall off the back of a sliding window.",
      "forces": [
        "Token savings vs summary fidelity loss.",
        "Compaction LLM cost vs context-window relief.",
        "Single source of truth vs raw-archive availability."
      ],
      "solution": "On a schedule (or at thresholds), summarise blocks of recent thoughts/conversation into compact representations. Store summaries in a higher tier; archive originals. Reads consult summaries first, originals on demand.",
      "consequences": {
        "benefits": [
          "Bounded effective context size despite unbounded history.",
          "Summaries are easier to embed and search."
        ],
        "liabilities": [
          "Summary errors are sticky; the agent reasons over the summary, not the original.",
          "Compaction policy is its own configuration burden."
        ]
      },
      "constrains": "Past events older than the compaction horizon are accessible only via summary, not raw.",
      "known_uses": [
        {
          "system": "Sparrot",
          "note": "Hourly chunk summarisation; daily insight extraction.",
          "status": "available"
        },
        {
          "system": "Generative Agents (Park et al. 2023)",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "five-tier-memory-cascade",
          "relation": "used-by"
        },
        {
          "pattern": "reflexion",
          "relation": "complements"
        },
        {
          "pattern": "context-window-packing",
          "relation": "used-by"
        },
        {
          "pattern": "short-term-memory",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Generative Agents: Interactive Simulacra of Human Behavior",
          "authors": "Park, O'Brien, Cai, Morris, Liang, Bernstein",
          "year": 2023,
          "url": "https://arxiv.org/abs/2304.03442"
        }
      ],
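      "example": {
        "language": "python",
        "note": "Minimal compaction sketch, assuming a summarise callable (an LLM call) and a count stand-in for token counting; the threshold is arbitrary.",
        "code": [
          "def compact(summarise, recent_entries, max_tokens=2000, count=len):",
          "    block, size = [], 0",
          "    for entry in recent_entries:",
          "        block.append(entry)",
          "        size += count(entry)",
          "        if size >= max_tokens:",
          "            break",
          "    summary = summarise('\\n'.join(block))",
          "    return summary, block   # store the summary one tier up; archive the originals"
        ]
      },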
      "status_in_practice": "mature",
      "tags": [
        "memory",
        "summarisation",
        "compaction"
      ]
    },
    {
      "id": "five-tier-memory-cascade",
      "name": "Five-Tier Memory Cascade",
      "aliases": [
        "Multi-Tier Memory",
        "Cognitive Memory Hierarchy"
      ],
      "category": "memory",
      "intent": "Stage agent memory across sensory, working, short-term, episodic, and long-term tiers with explicit promotion and decay between them.",
      "context": "A long-running agent accumulates information of different timescales (one-tick observation, one-day pattern, one-month rule); flat memory cannot represent this.",
      "problem": "Flat append-only logs collapse signal across timescales; pure long-term memory cannot capture momentary salience.",
      "forces": [
        "Promotion criteria from one tier to the next must be defined and audited.",
        "Storage cost grows with tier count.",
        "Reads must consult the right tier; cross-tier conflicts must be resolved."
      ],
      "solution": "Five tiers. Sensory: raw input per tick. Working: top-N items in active focus (Global Workspace Theory, ≤7 items). Short-term: recent verbatim (1-7 days). Episodic: compressed summaries (5-10x). Long-term: distilled rules and insights. Compaction promotes upward on a schedule; decay archives downward; rehearsal lifts archived items back when re-attended.",
      "consequences": {
        "benefits": [
          "Each tier optimises for its timescale.",
          "Inspectable memory hierarchy maps to cognitive science vocabulary."
        ],
        "liabilities": [
          "Architecturally heavy; only earns its seat in long-running agents.",
          "Tuning the promotion thresholds is empirical work."
        ]
      },
      "constrains": "Reads at each tier may only return items at that tier's compaction level; cross-tier joins go through promotion or rehearsal.",
      "known_uses": [
        {
          "system": "Sparrot",
          "note": "workspace/focus.md (working), thoughts/ (short), chunks/ (episodic), insights+rules+motivations (long).",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "episodic-summaries",
          "relation": "uses"
        },
        {
          "pattern": "hippocampal-rehearsal",
          "relation": "uses"
        },
        {
          "pattern": "append-only-thought-stream",
          "relation": "composes-with"
        },
        {
          "pattern": "memgpt-paging",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Generative Agents (memory stream + reflection)",
          "authors": "Park et al.",
          "year": 2023,
          "url": "https://arxiv.org/abs/2304.03442"
        },
        {
          "type": "book",
          "title": "A Cognitive Theory of Consciousness (Global Workspace Theory)",
          "authors": "Bernard Baars",
          "year": 1988,
          "url": "https://www.amazon.com/Cognitive-Theory-Consciousness-Bernard-Baars/dp/0521427436"
        }
      ],
      "status_in_practice": "experimental",
      "tags": [
        "memory",
        "cognitive-architecture"
      ]
    },
    {
      "id": "hippocampal-rehearsal",
      "name": "Hippocampal Rehearsal",
      "aliases": [
        "Memory Reactivation",
        "Lift-from-Archive"
      ],
      "category": "memory",
      "intent": "Lift archived memory items back into short-term tiers when something re-attends to them.",
      "context": "A long-running agent has archived something that becomes relevant again; cold-storage retrieval is slow and out-of-band.",
      "problem": "Archived items might as well not exist if the agent never thinks about them again, even when current context makes them relevant.",
      "forces": [
        "Re-attention triggers must be cheap to evaluate.",
        "Lifting too aggressively floods the working tier.",
        "The lifted item is now a duplicate of the archive copy."
      ],
      "solution": "When salience scoring matches against archived items (embedding similarity, keyword match, explicit reference), the matched item is reactivated into short-term memory for one or more cycles. The original archive copy stays untouched.",
      "consequences": {
        "benefits": [
          "Long-tail relevance does not require the agent to remember to remember.",
          "Mimics the rehearsal step of biological memory consolidation."
        ],
        "liabilities": [
          "False rehearsals waste working-memory slots.",
          "Operationally complex; requires content-addressable storage."
        ]
      },
      "constrains": "Archived items become readable only after rehearsal lifts them; direct cold reads are not part of the agent's primary path.",
      "known_uses": [
        {
          "system": "Sparrot",
          "note": "Stage P5+ rehearsal step.",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "five-tier-memory-cascade",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Memory consolidation through hippocampal-cortical replay (review)",
          "year": 2017,
          "url": "https://www.cell.com/current-biology/fulltext/S0960-9822(20)31397-3"
        }
      ],
      "status_in_practice": "experimental",
      "tags": [
        "memory",
        "rehearsal"
      ]
    },
    {
      "id": "knowledge-graph-memory",
      "name": "Knowledge Graph Memory",
      "aliases": [
        "Triple Store Memory",
        "Symbolic Memory"
      ],
      "category": "memory",
      "intent": "Persist agent memory as entities and relations in a structured graph so symbolic queries (path, neighbour, type) become possible.",
      "context": "Tasks where structured queries over relationships beat semantic similarity (organisation charts, code call graphs, family trees, knowledge bases).",
      "problem": "Vector memory cannot answer 'who reports to whom' or 'what depends on X' queries; symbolic structure is lost.",
      "forces": [
        "Entity and relation extraction is itself a model task with errors.",
        "Schema design for the graph is a separate engineering effort.",
        "Updates and deletions need referential integrity."
      ],
      "solution": "Extract entities and relations from observations into a graph store (Neo4j, RDF, simple JSON). Queries traverse the graph (Cypher/SPARQL or programmatic). Combine with vector memory for hybrid retrieval (vector finds entry points; graph traverses).",
      "consequences": {
        "benefits": [
          "Structured queries over relationships.",
          "Inspectable, editable, debuggable knowledge."
        ],
        "liabilities": [
          "Extraction quality bounds graph quality.",
          "Schema rigidity vs flexibility tension."
        ]
      },
      "constrains": "Memory queries that require traversal must use graph operations; ad-hoc text matching over the graph is not the supported access path.",
      "known_uses": [
        {
          "system": "Microsoft GraphRAG (graph as memory + retrieval)",
          "status": "available"
        },
        {
          "system": "Zep memory (hybrid)",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "vector-memory",
          "relation": "alternative-to"
        },
        {
          "pattern": "graphrag",
          "relation": "composes-with"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "From Local to Global: A Graph RAG Approach to Query-Focused Summarization",
          "authors": "Edge et al.",
          "year": 2024,
          "url": "https://arxiv.org/abs/2404.16130"
        },
        {
          "type": "repo",
          "title": "microsoft/graphrag",
          "url": "https://github.com/microsoft/graphrag"
        }
      ],
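      "example": {
        "language": "python",
        "note": "Minimal in-memory triple-store sketch showing the kind of traversal query vector memory cannot answer; a real deployment would use Neo4j, RDF, or GraphRAG as the solution describes.",
        "code": [
          "from collections import defaultdict",
          "",
          "triples = defaultdict(list)   # (subject, relation) -> [objects]",
          "",
          "def add(subject, relation, obj):",
          "    triples[(subject, relation)].append(obj)",
          "",
          "def neighbours(subject, relation):",
          "    return triples[(subject, relation)]",
          "",
          "add('alice', 'reports_to', 'bob')",
          "add('carol', 'reports_to', 'bob')",
          "print(neighbours('alice', 'reports_to'))   # ['bob']"
        ]
      },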
      "status_in_practice": "emerging",
      "tags": [
        "memory",
        "graph",
        "knowledge"
      ]
    },
    {
      "id": "memgpt-paging",
      "name": "MemGPT-Style Paging",
      "aliases": [
        "Virtual Context",
        "Memory Paging",
        "OS-Style Memory"
      ],
      "category": "memory",
      "intent": "Treat the LLM context window as RAM and external storage as disk, with the model issuing tool calls to page memory in and out.",
      "context": "Long-running agents whose conversation or document state exceeds the context window; naive truncation loses state unpredictably.",
      "problem": "Fixed context windows force a choice between losing state and stuffing irrelevant content; both degrade quality.",
      "forces": [
        "Paging tools compete for context space themselves.",
        "Eviction policy (LRU? LFU? salience?) affects quality.",
        "Tool latency on page faults adds to user-visible time."
      ],
      "solution": "Two memory tiers. Main context: system prompt, working set, recent messages. External context: recall (raw history) and archival (vector store). The model has tool calls for read_recall, write_archival, search_archival. Paging happens at the agent's discretion; the model treats main context as RAM and external as disk.",
      "consequences": {
        "benefits": [
          "Conversation continuity beyond the context window.",
          "Inspectable memory tiers; archival is queryable independently."
        ],
        "liabilities": [
          "Tool definitions consume context budget.",
          "Page-fault tool calls add latency."
        ]
      },
      "constrains": "Memory beyond the working set is accessible only via paging tool calls; the agent cannot directly read external state.",
      "known_uses": [
        {
          "system": "Letta (formerly MemGPT)",
          "status": "available",
          "url": "https://github.com/letta-ai/letta"
        }
      ],
      "related": [
        {
          "pattern": "vector-memory",
          "relation": "uses"
        },
        {
          "pattern": "five-tier-memory-cascade",
          "relation": "alternative-to"
        },
        {
          "pattern": "tool-use",
          "relation": "uses",
          "note": "Paging operations are tool calls."
        },
        {
          "pattern": "cross-session-memory",
          "relation": "alternative-to"
        },
        {
          "pattern": "context-window-packing",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "MemGPT: Towards LLMs as Operating Systems",
          "authors": "Packer, Wooders, Lin, Fang, Patil, Stoica, Gonzalez",
          "year": 2023,
          "url": "https://arxiv.org/abs/2310.08560"
        }
      ],
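      "example": {
        "language": "python",
        "note": "Minimal sketch of the three paging operations exposed to the model as tools; the vector_store and recall_log interfaces are assumptions, not the Letta API.",
        "code": [
          "def read_recall(recall_log, start, end):",
          "    # Raw conversation-history slice, paged in on demand.",
          "    return recall_log[start:end]",
          "",
          "def search_archival(vector_store, query, k=5):",
          "    # Semantic lookup over archival memory; results are paged into main context.",
          "    return vector_store.search(query, k=k)",
          "",
          "def write_archival(vector_store, text):",
          "    # Page content out of main context into archival storage.",
          "    vector_store.add(text)"
        ]
      },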
      "status_in_practice": "emerging",
      "tags": [
        "memory",
        "paging",
        "os"
      ]
    },
    {
      "id": "reasoning-trace-carry-forward",
      "name": "Reasoning Trace Carry-Forward",
      "aliases": [
        "Reasoning Content Episode",
        "CoT Carry Across Tool Calls",
        "Episode-Bound Reasoning"
      ],
      "category": "memory",
      "intent": "For reasoning models that emit a separate reasoning trace, preserve that trace in context across the same logical task episode (across tool-call/result turns) but drop it at user-turn boundaries.",
      "context": "The agent uses a reasoning model that exposes a separate reasoning_content field (the model's chain of thought, distinct from the user-visible content), inside a tool-use loop with multi-turn history.",
      "problem": "Two failure modes pull in opposite directions. (1) If the reasoning trace is dropped between a tool call and its result, the model loses the context of why it called the tool. (2) If the reasoning trace is preserved across user-turn boundaries, conversation history bloats with stale reasoning and the next user task inherits irrelevant prior thinking.",
      "forces": [
        "Reasoning trace is the bridge between tool-call intent and post-tool-result interpretation.",
        "Reasoning trace is private intermediate state, not conversational record.",
        "Tokens are expensive; preserving traces forever costs money.",
        "Stale reasoning leaks bias into the next task."
      ],
      "solution": "Define an episode as: from one user turn to the next user turn (inclusive of all intervening tool calls and tool results). Within an episode, preserve assistant reasoning_content as part of the context concatenation across all turns. At the next user turn boundary, drop reasoning_content from prior episodes (the API silently ignores it when passed across boundaries). The user-visible content remains in history; only the reasoning trace is episode-scoped.",
      "structure": "User -> [reasoning + tool_call] -> tool_result -> [reasoning + tool_call] -> tool_result -> [reasoning + final_content] -> User. Within episode: preserve reasoning. Across episodes: drop reasoning, keep content.",
      "consequences": {
        "benefits": [
          "Tool-using episodes get the benefit of CoT continuity.",
          "Multi-turn dialogues do not accumulate stale reasoning.",
          "Cheaper than naive reasoning-trace preservation forever."
        ],
        "liabilities": [
          "Episode boundary detection has to be encoded in the agent loop, not the model.",
          "If the model expects its own past reasoning at a later turn, dropping it breaks that.",
          "Provider-specific (DeepSeek-style reasoning_content); needs adaptation per API."
        ]
      },
      "constrains": "Internal reasoning content may not cross user-task boundaries; only user-visible content persists in conversation history.",
      "known_uses": [
        {
          "system": "DeepSeek API (thinking mode)",
          "note": "Documented behaviour: pass reasoning_content back across tool-call turns; drop it across user turns.",
          "status": "available",
          "url": "https://api-docs.deepseek.com/guides/thinking_mode"
        }
      ],
      "related": [
        {
          "pattern": "extended-thinking",
          "relation": "complements"
        },
        {
          "pattern": "context-window-packing",
          "relation": "uses"
        },
        {
          "pattern": "short-term-memory",
          "relation": "specialises"
        },
        {
          "pattern": "prompt-caching",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "DeepSeek API: Thinking Mode",
          "url": "https://api-docs.deepseek.com/guides/thinking_mode"
        },
        {
          "type": "paper",
          "title": "DeepSeek-V3 Technical Report",
          "authors": "DeepSeek-AI",
          "year": 2024,
          "url": "https://arxiv.org/abs/2412.19437"
        }
      ],
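      "example": {
        "language": "python",
        "note": "Minimal sketch of episode-scoped trimming over an OpenAI-style message list with a DeepSeek-style reasoning_content field; the helper name is illustrative.",
        "code": [
          "def build_context(history):",
          "    # Keep reasoning_content only inside the current episode (after the last",
          "    # user turn); drop it from earlier episodes, keeping user-visible content.",
          "    last_user = max((i for i, m in enumerate(history) if m['role'] == 'user'), default=-1)",
          "    context = []",
          "    for i, m in enumerate(history):",
          "        m = dict(m)",
          "        if m['role'] == 'assistant' and i < last_user:",
          "            m.pop('reasoning_content', None)",
          "        context.append(m)",
          "    return context"
        ]
      },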
      "status_in_practice": "emerging",
      "tags": [
        "memory",
        "reasoning",
        "china-origin",
        "deepseek"
      ]
    },
    {
      "id": "scratchpad",
      "name": "Scratchpad",
      "aliases": [
        "Working Notes",
        "Thinking Tool",
        "Notepad"
      ],
      "category": "memory",
      "intent": "Give the agent a writable scratch space for intermediate notes that informs later turns but does not pollute the response.",
      "context": "Long tasks where the agent benefits from writing things down (computations, plans, lists of unresolved questions) without showing the user.",
      "problem": "Without a scratchpad, intermediate work pollutes the response or is lost between turns.",
      "forces": [
        "Scratchpad content adds tokens to subsequent turns.",
        "What stays in the scratchpad vs the response is a UX choice.",
        "Scratchpad content can leak via traces."
      ],
      "solution": "Provide a tool or convention for writing to a scratchpad (a section of the prompt, a tool call, a file). The agent reads from and writes to it across turns. The user-visible response is separate. The scratchpad is purged at task completion or expires with the session.",
      "consequences": {
        "benefits": [
          "Intermediate work persists without cluttering output.",
          "Useful for chain-of-thought style reasoning that should not be visible."
        ],
        "liabilities": [
          "Token cost grows with scratchpad size.",
          "Scratchpad becomes shadow state if not purged."
        ]
      },
      "constrains": "Scratchpad contents are visible only to the agent loop; user-facing output draws from the response slot.",
      "known_uses": [
        {
          "system": "OpenAI o1-style internal reasoning",
          "status": "available"
        },
        {
          "system": "Anthropic <thinking> blocks",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "short-term-memory",
          "relation": "complements"
        },
        {
          "pattern": "chain-of-thought",
          "relation": "uses"
        },
        {
          "pattern": "extended-thinking",
          "relation": "complements"
        },
        {
          "pattern": "todo-list-driven-agent",
          "relation": "generalises"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Show Your Work: Scratchpads for Intermediate Computation with Language Models",
          "authors": "Nye et al.",
          "year": 2021,
          "url": "https://arxiv.org/abs/2112.00114"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "memory",
        "scratchpad",
        "thinking"
      ]
    },
    {
      "id": "session-isolation",
      "name": "Session Isolation",
      "aliases": [
        "Tenant Separation",
        "Per-User State"
      ],
      "category": "memory",
      "intent": "Keep one user's session state and memory unreachable from another user's agent.",
      "context": "Multi-user agent products where leaks across users are a privacy and security failure.",
      "problem": "Shared memory backends and shared model contexts can leak one user's data into another's response.",
      "forces": [
        "Cache hits across users are tempting for cost; they break isolation.",
        "Auth scope must travel with every read and write.",
        "Multi-tenant prompt injection becomes a real attack surface."
      ],
      "solution": "Session state is keyed by per-user identity (OAuth/JWT subject). Reads and writes carry that identity end-to-end. Caches are scoped per user. Prompts never include another user's content.",
      "consequences": {
        "benefits": [
          "Privacy and security boundary is explicit and testable.",
          "Multi-tenant compliance posture is simpler."
        ],
        "liabilities": [
          "Loss of cross-user cache benefits.",
          "Auth plumbing in every layer."
        ]
      },
      "constrains": "No code path may read or cache user A's state under user B's identity.",
      "known_uses": [
        {
          "system": "Bobbin (Stash2Go)",
          "note": "Per-user OAuth/JWT scope for tools and state.",
          "status": "available"
        },
        {
          "system": "Weft",
          "note": "Bearer-wrapped per-user OAuth 1.0a tokens.",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "short-term-memory",
          "relation": "complements"
        },
        {
          "pattern": "input-output-guardrails",
          "relation": "complements"
        },
        {
          "pattern": "cross-session-memory",
          "relation": "complements"
        },
        {
          "pattern": "tool-result-caching",
          "relation": "complements"
        },
        {
          "pattern": "prompt-injection-defense",
          "relation": "complements"
        },
        {
          "pattern": "pii-redaction",
          "relation": "complements"
        },
        {
          "pattern": "secrets-handling",
          "relation": "complements"
        },
        {
          "pattern": "sovereign-inference-stack",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Prompt caching",
          "year": 2025,
          "url": "https://docs.claude.com/en/docs/build-with-claude/prompt-caching"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "memory",
        "multi-tenant",
        "auth"
      ]
    },
    {
      "id": "short-term-memory",
      "name": "Short-Term Thread Memory",
      "aliases": [
        "Conversation State",
        "Per-Thread State",
        "Working Memory"
      ],
      "category": "memory",
      "intent": "Carry the relevant slice of conversation context across turns within a session.",
      "context": "A multi-turn agent needs continuity (the user's most recent screen, the active plan, prior tool results) but does not need it forever.",
      "problem": "Replaying the entire conversation every turn is expensive and pollutes context with stale facts.",
      "forces": [
        "TTL choice (minutes? hours? days?) trades freshness for cost.",
        "What to keep vs. summarise is a quality-vs-cost tension.",
        "Multi-device sessions complicate where state lives."
      ],
      "solution": "Define a typed state object per thread (messages, current screen, active plan, agent step). Persist with a TTL (commonly 24h). Reload on the next turn; expire and reset on TTL.",
      "consequences": {
        "benefits": [
          "Continuity without full-history replay.",
          "Bounded memory footprint per active user."
        ],
        "liabilities": [
          "TTL boundaries surprise users when state vanishes mid-task.",
          "Schema migrations are painful for live state."
        ]
      },
      "constrains": "The agent cannot rely on facts older than the TTL window without re-fetching them.",
      "known_uses": [
        {
          "system": "Bobbin (Stash2Go)",
          "note": "Per-thread state in chat/backend/state.py with 24h TTL.",
          "status": "available"
        },
        {
          "system": "LangGraph MemorySaver checkpoints",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "episodic-summaries",
          "relation": "complements"
        },
        {
          "pattern": "session-isolation",
          "relation": "complements"
        },
        {
          "pattern": "agent-resumption",
          "relation": "used-by"
        },
        {
          "pattern": "cross-session-memory",
          "relation": "complements"
        },
        {
          "pattern": "scratchpad",
          "relation": "complements"
        },
        {
          "pattern": "reasoning-trace-carry-forward",
          "relation": "generalises"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "LangGraph: Persistence",
          "url": "https://langchain-ai.github.io/langgraph/concepts/persistence/"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "memory",
        "state",
        "ttl"
      ]
    },
    {
      "id": "vector-memory",
      "name": "Vector Memory",
      "aliases": [
        "Semantic Memory",
        "Embedding-Indexed Memory"
      ],
      "category": "memory",
      "intent": "Store memories as embeddings in a vector index and retrieve the most semantically similar items at query time.",
      "context": "Long-running agents accumulate facts/observations whose relevance is best judged by similarity to the current context.",
      "problem": "Append-only logs grow unboundedly; without semantic retrieval the agent cannot find the relevant past.",
      "forces": [
        "Embedding choice constrains retrieval quality.",
        "Index updates have non-trivial latency.",
        "Forgetting is achieved by deletion or decay; both have failure modes."
      ],
      "solution": "Each memory item is embedded and indexed. At query time, embed the query (or a summary of current state), retrieve top-k most similar memories, prepend to context. Optional decay (boost recent, age old) and salience weighting.",
      "consequences": {
        "benefits": [
          "Semantically relevant past surfaces automatically.",
          "Scales to memory stores too large for context."
        ],
        "liabilities": [
          "Misses purely temporal queries ('what did I do yesterday?').",
          "Embedding drift on schema changes."
        ]
      },
      "constrains": "The agent reads memory only through the retriever; full-store scans are not part of the loop.",
      "known_uses": [
        {
          "system": "MemGPT / Letta archival memory",
          "status": "available",
          "url": "https://docs.letta.com/"
        },
        {
          "system": "Generative Agents memory stream (Park et al.)",
          "status": "available"
        },
        {
          "system": "LangChain VectorStoreRetrieverMemory",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "memgpt-paging",
          "relation": "used-by"
        },
        {
          "pattern": "naive-rag",
          "relation": "specialises",
          "note": "Vector Memory is RAG over the agent's own past."
        },
        {
          "pattern": "knowledge-graph-memory",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Generative Agents: Interactive Simulacra of Human Behavior",
          "authors": "Park et al.",
          "year": 2023,
          "url": "https://arxiv.org/abs/2304.03442"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "memory",
        "vector",
        "embedding"
      ]
    },
    {
      "id": "agent-as-tool-embedding",
      "name": "Agent-as-Tool Embedding",
      "aliases": [
        "Sub-Agent as Function",
        "Nested Agent",
        "Agent Wrapped in a Tool Signature"
      ],
      "category": "multi-agent",
      "intent": "Wrap a sub-agent (with its own loop, prompt, and tool palette) behind a single function-shaped tool signature, so the parent agent calls it like any other tool and never sees the sub-agent's internal turns.",
      "context": "A parent agent that benefits from delegating bounded sub-tasks (\"search the web for X and summarise\", \"plan a multi-day itinerary\") to a specialised loop without inheriting the sub-agent's turn-level context.",
      "problem": "Letting the parent observe the sub-agent's turns either bloats parent context or couples parent reasoning to sub-agent intermediate state. Hand-rolled multi-agent broadcast bus is over-engineering for this case.",
      "forces": [
        "Nested loops add abstraction; parent shouldn't care about how sub solves it.",
        "The function-shaped tool signature is already the agent's native composition unit.",
        "Sub-agent failure has to surface cleanly to the parent.",
        "Cost attribution across nesting depth is non-trivial."
      ],
      "solution": "Define the sub-agent as `def sub_agent(task: str, ...) -> Result`. The parent calls it like any other tool. Inside the function: a fresh agent loop with its own model, tool palette, and step budget runs to completion or failure, returning a structured result. Parent context records only the call and the return value. Step budget and timeout are enforced by the wrapper, not by the sub-agent's prompt.",
      "structure": "Parent agent -> tool_call(sub_agent, task) -> [hidden: sub-agent loop] -> Result -> Parent agent.",
      "consequences": {
        "benefits": [
          "Composition without ad-hoc multi-agent infrastructure.",
          "Parent context stays small and stable.",
          "Sub-agent can be replaced or upgraded behind the same signature."
        ],
        "liabilities": [
          "Hidden costs: sub-agent failures or timeouts surprise the parent.",
          "Debugging requires traceability across the boundary (parent sees only the return).",
          "Recursive nesting can spiral cost if the sub-agent itself spawns more."
        ]
      },
      "constrains": "The parent may not access the sub-agent's intermediate turns; only the return value crosses the boundary.",
      "known_uses": [
        {
          "system": "Hugging Face Transformers Agents (multi-agent)",
          "note": "ReactCodeAgent embeds sub-agents as callable Python functions.",
          "status": "available",
          "url": "https://huggingface.co/docs/transformers/v4.47.1/agents_advanced"
        },
        {
          "system": "smolagents",
          "note": "Same pattern; sub-agents exposed as ordinary tool functions to a CodeAgent.",
          "status": "available",
          "url": "https://huggingface.co/docs/smolagents/"
        },
        {
          "system": "OpenAI Agents SDK / handoffs",
          "note": "Adjacent pattern with explicit handoff semantics rather than function-call nesting.",
          "status": "available",
          "url": "https://openai.github.io/openai-agents-python/"
        }
      ],
      "related": [
        {
          "pattern": "orchestrator-workers",
          "relation": "specialises"
        },
        {
          "pattern": "subagent-isolation",
          "relation": "complements"
        },
        {
          "pattern": "hierarchical-agents",
          "relation": "specialises"
        },
        {
          "pattern": "tool-use",
          "relation": "uses"
        },
        {
          "pattern": "step-budget",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Hugging Face Transformers — Agents Advanced (Multi-Agents)",
          "url": "https://huggingface.co/docs/transformers/v4.47.1/agents_advanced"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "multi-agent",
        "composition",
        "france-origin",
        "huggingface",
        "smolagents"
      ]
    },
    {
      "id": "autogen-conversational",
      "name": "Conversational Multi-Agent",
      "aliases": [
        "AutoGen Conversation",
        "Two-Agent Conversation"
      ],
      "category": "multi-agent",
      "intent": "Have agents converse turn by turn until a completion criterion fires; agent roles drive the conversation forward.",
      "context": "Tasks that benefit from back-and-forth between specialists (a coder agent and a reviewer agent; a teacher agent and a student agent).",
      "problem": "Single-agent loops cannot represent dialogue-shaped collaboration; rigid orchestration patterns over-prescribe the flow.",
      "forces": [
        "Turn allocation across agents.",
        "Termination criterion definition.",
        "Conversation can drift without supervision."
      ],
      "solution": "Define agents with system prompts and allowed actions. Implement a conversation manager that selects which agent speaks next (round-robin, condition-based, model-decided). Each agent reads the conversation and emits a turn. Continue until termination criterion (task complete, max turns, explicit handoff to user).",
      "consequences": {
        "benefits": [
          "Natural way to model peer collaboration.",
          "Each agent has a clean role definition."
        ],
        "liabilities": [
          "Conversation drift is real.",
          "Hard to reason about correctness of the multi-agent flow."
        ]
      },
      "constrains": "Each agent's outputs must conform to its role's allowed action set; agents may not act outside their role's vocabulary.",
      "known_uses": [
        {
          "system": "Microsoft AutoGen",
          "status": "available",
          "url": "https://microsoft.github.io/autogen/"
        }
      ],
      "related": [
        {
          "pattern": "role-assignment",
          "relation": "complements"
        },
        {
          "pattern": "supervisor",
          "relation": "alternative-to"
        },
        {
          "pattern": "camel-role-playing",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
          "authors": "Wu, Bansal, Zhang, Wu, Zhang, Zhu, Li, Jiang, Zhang, Wang",
          "year": 2023,
          "url": "https://arxiv.org/abs/2308.08155"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "multi-agent",
        "conversation",
        "autogen"
      ]
    },
    {
      "id": "blackboard",
      "name": "Blackboard",
      "aliases": [
        "Shared Workspace",
        "Collaboration Whiteboard"
      ],
      "category": "multi-agent",
      "intent": "Give multiple agents a shared, queryable workspace they can read from and write to as they collaborate.",
      "context": "Coordinated multi-agent work where agents need to see each other's progress and contribute to a shared artefact.",
      "problem": "Agents working in isolation miss each other's progress; explicit messaging requires a protocol; shared mutable state without discipline races.",
      "forces": [
        "Concurrent writes need conflict resolution.",
        "Blackboard contents grow; pruning is needed.",
        "Read latency: pulling vs subscribing."
      ],
      "solution": "Establish a shared store (file, database, in-memory). Each agent reads the relevant slice and writes its contribution under structured keys. Optional event notification when keys change. Conflict resolution is policy-driven (last-write-wins, version-vector, append-only).",
      "consequences": {
        "benefits": [
          "Loose coupling: agents do not know about each other directly.",
          "Inspectable shared state."
        ],
        "liabilities": [
          "Race conditions under concurrent writes.",
          "Blackboard bloat without pruning."
        ]
      },
      "constrains": "Cross-agent communication happens only via the blackboard; out-of-band agent-to-agent calls are forbidden.",
      "known_uses": [
        {
          "system": "Classical AI blackboard architectures",
          "status": "available"
        },
        {
          "system": "Multi-agent code review with shared scratchpad",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "swarm",
          "relation": "complements"
        },
        {
          "pattern": "supervisor",
          "relation": "alternative-to"
        },
        {
          "pattern": "append-only-thought-stream",
          "relation": "complements"
        },
        {
          "pattern": "graph-of-thoughts",
          "relation": "composes-with"
        },
        {
          "pattern": "sop-encoded-multi-agent",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "book",
          "title": "Blackboard Systems (Engelmore, Morgan)",
          "year": 1988,
          "url": "https://www.amazon.com/Blackboard-Systems-Robert-Engelmore/dp/0201117398"
        }
      ],
      "status_in_practice": "experimental",
      "tags": [
        "multi-agent",
        "blackboard",
        "shared-state"
      ]
    },
    {
      "id": "camel-role-playing",
      "name": "CAMEL Role-Playing",
      "aliases": [
        "Inception Prompting",
        "AI-User AI-Assistant"
      ],
      "category": "multi-agent",
      "intent": "Have two agents role-play a user-assistant interaction to autonomously complete a task neither could solve alone.",
      "context": "Tasks where a user-style agent and an assistant-style agent productively elaborate a task through their interaction.",
      "problem": "Single agents miss the turn-taking dynamics that human collaboration relies on; pure debate is adversarial when collaboration is what is wanted.",
      "forces": [
        "Roles drift toward sameness without inception prompting.",
        "Conversation length must be bounded.",
        "Tasks need to be specified as something the role-play can converge on."
      ],
      "solution": "Use inception prompts to instantiate two agents (AI-User and AI-Assistant) with their roles fixed and the task specified. They converse until the task is completed or budget exhausted. The output is the final assistant message; the conversation log is debugging artefact.",
      "consequences": {
        "benefits": [
          "Synthetic task-solving without human-in-the-loop.",
          "Useful for generating training data."
        ],
        "liabilities": [
          "Cost: 2x inference per task.",
          "Role drift over long conversations."
        ]
      },
      "constrains": "The AI-User role may only ask, never answer; AI-Assistant may only answer, never ask user-style questions.",
      "known_uses": [
        {
          "system": "CAMEL framework",
          "status": "available",
          "url": "https://www.camel-ai.org/"
        }
      ],
      "related": [
        {
          "pattern": "autogen-conversational",
          "relation": "alternative-to"
        },
        {
          "pattern": "role-assignment",
          "relation": "specialises"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "CAMEL: Communicative Agents for \"Mind\" Exploration of Large Language Model Society",
          "authors": "Li, Hammoud, Itani, Khizbullin, Ghanem",
          "year": 2023,
          "url": "https://arxiv.org/abs/2303.17760"
        }
      ],
      "status_in_practice": "experimental",
      "tags": [
        "multi-agent",
        "role-play"
      ]
    },
    {
      "id": "chat-chain",
      "name": "Chat Chain",
      "aliases": [
        "Phased Multi-Agent Pipeline",
        "Sequential Role-Pair Chats",
        "Communicative Phase Chain"
      ],
      "category": "multi-agent",
      "intent": "Decompose a long, multi-disciplinary task into ordered phases; within each phase, run a paired-role chat between two agents until the phase artefact is signed off; pass the artefact to the next phase.",
      "context": "A task (build a small program, write a brief, prepare a report) requires several disciplines in sequence and is too long for a single agent loop or a flat multi-agent broadcast.",
      "problem": "A single agent loop loses focus; broadcast multi-agent chat produces tangled context; flat prompt-chaining cannot host the back-and-forth a discipline needs.",
      "forces": [
        "Each discipline benefits from focused two-agent dialogue.",
        "Context windows blow up if every agent sees every chat.",
        "Phase-to-phase hand-off needs a clean artefact contract.",
        "Termination of a phase has to be explicit, not vibes-based."
      ],
      "solution": "Define an ordered chain of phases. Each phase has (a) a defined input artefact, (b) two role-paired agents (e.g. designer + coder, coder + tester), (c) a phase-specific completion predicate, (d) a defined output artefact. Within a phase, the two agents converse multi-turn; the completion predicate ends the phase; the artefact moves to the next phase. The chain is the macro-control; the chat is the micro-control.",
      "structure": "Phase_1 (Role_A <-> Role_B) -> artefact_1 -> Phase_2 (Role_B <-> Role_C) -> artefact_2 -> ... -> final_artefact.",
      "consequences": {
        "benefits": [
          "Clear macro-progression with chat-level flexibility inside each phase.",
          "Keeps each phase's context tight; only the artefact crosses the boundary.",
          "Auditable artefact trail per phase."
        ],
        "liabilities": [
          "Designing the chain (phases + completion predicates) is the architecture problem.",
          "Sequential by construction; parallelism inside a phase requires extra design.",
          "Wrong phase decomposition forces agents into awkward role pairings."
        ]
      },
      "constrains": "Agents may not skip phases or address agents outside the current phase; phase output must satisfy the completion predicate before transition.",
      "known_uses": [
        {
          "system": "ChatDev",
          "note": "Software-development chain: design → coding → testing → documentation, each as a paired-role chat.",
          "status": "available",
          "url": "https://github.com/OpenBMB/ChatDev"
        }
      ],
      "related": [
        {
          "pattern": "prompt-chaining",
          "relation": "generalises",
          "note": "Prompt chaining is a single-agent special case."
        },
        {
          "pattern": "sop-encoded-multi-agent",
          "relation": "complements"
        },
        {
          "pattern": "supervisor",
          "relation": "alternative-to"
        },
        {
          "pattern": "pipes-and-filters",
          "relation": "uses"
        },
        {
          "pattern": "stop-hook",
          "relation": "uses",
          "note": "Phase completion predicate is a stop hook scoped to a phase."
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "ChatDev: Communicative Agents for Software Development",
          "authors": "Qian et al.",
          "year": 2023,
          "url": "https://arxiv.org/abs/2307.07924"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "multi-agent",
        "pipeline",
        "china-origin",
        "chatdev"
      ]
    },
    {
      "id": "communicative-dehallucination",
      "name": "Communicative Dehallucination",
      "aliases": [
        "Instructor-Reversal Clarification",
        "Inter-Agent Clarifying Question"
      ],
      "category": "multi-agent",
      "intent": "When an instructed agent would have to invent missing context to comply, have it reverse roles and ask the instructor for the missing detail before answering.",
      "context": "Two agents communicate (instructor → assistant) and the instruction is under-specified in a way that would force the assistant to fabricate (e.g. a missing class name, an unspecified API version, an ambiguous unit).",
      "problem": "Without a clarification channel between agents, the assistant fabricates the missing detail; the fabrication propagates and is hard to detect at the artefact boundary.",
      "forces": [
        "Speed of completion vs. fidelity of context.",
        "Adding a clarification round costs latency and tokens.",
        "Asking too eagerly degrades into chatter; not asking at all produces hallucinated outputs."
      ],
      "solution": "Define an explicit role-reversal protocol: when the assistant detects that the instruction is missing a deciding piece of context, it pivots and emits a focused question back to the instructor (\"the precise name of the dependency, please\"). The instructor answers, and only then does the assistant produce its conclusion. Bound the depth (one or two reversals) to prevent infinite ping-pong.",
      "structure": "Instructor -> instruction -> Assistant; if context_gap_detected: Assistant -> question -> Instructor -> answer -> Assistant -> conclusion.",
      "consequences": {
        "benefits": [
          "Targets the specific dehallucination point instead of after-the-fact verification.",
          "Cheaper than full multi-agent debate; the question is scoped.",
          "Produces a more faithful artefact at the next hand-off."
        ],
        "liabilities": [
          "Adds latency for every clarification round.",
          "Detecting the gap is itself a model judgement and can fail.",
          "Risk of infinite ping-pong without a depth bound."
        ]
      },
      "constrains": "The assistant may not produce a final answer when a designated context slot is unfilled; it must instead emit a clarifying question.",
      "known_uses": [
        {
          "system": "ChatDev",
          "note": "Original demonstration; assistant reverses to instructor role to request missing detail before delivering a conclusive response.",
          "status": "available",
          "url": "https://github.com/OpenBMB/ChatDev"
        }
      ],
      "related": [
        {
          "pattern": "disambiguation",
          "relation": "specialises",
          "note": "Same shape, but agent-to-agent rather than agent-to-user."
        },
        {
          "pattern": "human-in-the-loop",
          "relation": "alternative-to"
        },
        {
          "pattern": "debate",
          "relation": "alternative-to"
        },
        {
          "pattern": "infinite-debate",
          "relation": "conflicts-with",
          "note": "Requires a depth bound to avoid this anti-pattern."
        },
        {
          "pattern": "inter-agent-communication",
          "relation": "uses"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "ChatDev: Communicative Agents for Software Development",
          "authors": "Qian et al.",
          "year": 2023,
          "url": "https://arxiv.org/abs/2307.07924"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "multi-agent",
        "verification",
        "china-origin",
        "chatdev"
      ]
    },
    {
      "id": "cross-domain-agent-network",
      "name": "Cross-Domain Enterprise Agent Network",
      "aliases": [
        "Domain-Specialised Agent Mesh",
        "Joule-Style Agent Collaboration",
        "Per-Function Agent Network"
      ],
      "category": "multi-agent",
      "intent": "Decompose enterprise agency into domain-specialised agents (finance, supply chain, HR, service), each grounded in its own system of record, and route artefacts between them through a standardised inter-agent protocol.",
      "context": "Large enterprises whose business processes already live across many backing systems (ERP, CRM, HR, ticketing) and where end-to-end workflows (e.g. dispute → cash collection → finance close) cross domain boundaries.",
      "problem": "A single mega-agent grounded against every system has bad recall, no clear ownership, and no domain-specific guardrails; flat tool-use agents over a flat tool catalogue degrade as the catalogue grows.",
      "forces": [
        "Each domain has its own data model, vocabulary, and compliance rules.",
        "End-to-end workflows must cross domains.",
        "A single agent over all systems blows up the tool catalogue and the prompt.",
        "Domain teams want ownership and lifecycle of their own agents."
      ],
      "solution": "Build one specialised agent per business domain, each with its own grounded data, tool palette, and acceptance criteria. Define a standardised inter-agent protocol for handoffs (e.g. A2A, MCP). When a task crosses domains, the source agent routes to the target via the protocol, passing a typed artefact. An optional supervisor or role-based assistant fronts the user and dispatches to the right entry agent.",
      "structure": "User -> Role Assistant -> Domain Agent A (own data + tools) -- protocol message --> Domain Agent B -- ... --> outcome.",
      "consequences": {
        "benefits": [
          "Each domain agent stays small, grounded, and ownable.",
          "Cross-domain workflows are auditable per agent.",
          "Domain teams ship and update their agents independently."
        ],
        "liabilities": [
          "Protocol design is the core engineering problem; bad protocol fossilises mistakes.",
          "Routing decisions become a second-order problem (who does what).",
          "Failure attribution across the chain is harder than for a monolith."
        ]
      },
      "constrains": "An agent may only call across domains via the standardised protocol; ad-hoc backdoor integrations between domain agents are forbidden.",
      "known_uses": [
        {
          "system": "SAP Joule",
          "note": "Per-domain Joule Agents (finance, HR, supply chain, service) collaborating via SAP's collaborative agent architecture; A2A and MCP support announced 2025.",
          "status": "available",
          "url": "https://www.sap.com/products/artificial-intelligence/ai-agents.html"
        },
        {
          "system": "ServiceNow Now Assist",
          "note": "Comparable pattern in ITSM/HR/CSM domain agents.",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "supervisor",
          "relation": "uses"
        },
        {
          "pattern": "handoff",
          "relation": "uses"
        },
        {
          "pattern": "inter-agent-communication",
          "relation": "uses"
        },
        {
          "pattern": "mcp",
          "relation": "uses"
        },
        {
          "pattern": "role-assignment",
          "relation": "uses"
        },
        {
          "pattern": "hero-agent",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "Joule Agents: How SAP Uniquely Delivers AI Agents That Truly Mean Business",
          "url": "https://news.sap.com/2025/02/joule-sap-uniquely-delivers-ai-agents/"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "multi-agent",
        "enterprise",
        "germany-origin",
        "sap",
        "joule"
      ]
    },
    {
      "id": "debate",
      "name": "Debate",
      "aliases": [
        "Multi-Agent Debate",
        "Adversarial Debate"
      ],
      "category": "multi-agent",
      "intent": "Have multiple agents argue different positions on a question and converge through structured exchange.",
      "context": "The answer is contested or the user wants the strongest case for and against.",
      "problem": "Single-agent answers hide reasoning blind spots; the same model giving both answer and critique reinforces them.",
      "forces": [
        "Genuinely independent positions are hard to engineer with one model.",
        "Debate length must be bounded.",
        "A judge is needed to decide; the judge has its own biases."
      ],
      "solution": "Two or more agents are given different positions. They exchange arguments over N rounds. A judge agent (or a tie-break rule) selects the answer or synthesises a position from both.",
      "consequences": {
        "benefits": [
          "Surfaces counterarguments the user can read.",
          "Higher answer quality on contested questions in benchmarks."
        ],
        "liabilities": [
          "N-x cost over single-agent.",
          "Position assignment is itself a prompt-engineering problem."
        ]
      },
      "constrains": "Each debater may only argue its assigned position until the judge step.",
      "known_uses": [
        {
          "system": "Anthropic AI Safety via Debate research",
          "status": "available"
        },
        {
          "system": "MIT CSAIL multi-agent debate work",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "inner-committee",
          "relation": "alternative-to"
        },
        {
          "pattern": "self-consistency",
          "relation": "complements"
        },
        {
          "pattern": "swarm",
          "relation": "generalises"
        },
        {
          "pattern": "infinite-debate",
          "relation": "alternative-to"
        },
        {
          "pattern": "communicative-dehallucination",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Improving Factuality and Reasoning in Language Models through Multiagent Debate",
          "authors": "Du, Li, Torralba, Tenenbaum, Mordatch",
          "year": 2023,
          "url": "https://arxiv.org/abs/2305.14325"
        }
      ],
      "status_in_practice": "experimental",
      "tags": [
        "debate",
        "multi-agent"
      ]
    },
    {
      "id": "dynamic-expert-recruitment",
      "name": "Dynamic Expert Recruitment",
      "aliases": [
        "Recruiter Agent",
        "Run-Time Team Assembly",
        "Adaptive Role Generation"
      ],
      "category": "multi-agent",
      "intent": "Generate the agent team — role descriptions and instances — at run time based on the specific task, then adjust team composition between iterations based on evaluation feedback.",
      "context": "The task arrives without a known type; a fixed pool of pre-assigned roles either over-provisions (expensive, noisy) or under-covers (missing the right specialist).",
      "problem": "Hard-coded role lists are brittle: the right team for \"draft a regulatory filing\" is not the right team for \"refactor a Python module\"; both tasks share an entry point.",
      "forces": [
        "Pre-specified roles are stable but mis-fit;",
        "Run-time generation costs an extra LLM call before any work begins;",
        "Adaptive composition risks instability: the team that solves step 1 may not solve step 5."
      ],
      "solution": "Add a recruiter agent (or a meta-agent committee: planner + agent observer + plan observer). Stage 1 — Drafting: recruiter receives the goal, generates role descriptions matched to that goal, instantiates the team and an execution plan. Stage 2 — Execution: the team works. Stage 3 — Evaluation: a reviewer scores progress; if unsatisfactory, the recruiter adjusts the team (add, remove, replace roles) and the next iteration runs. The recruiter is the only meta-agent that mutates team composition.",
      "structure": "goal -> Recruiter -> [role descriptions] -> instantiated agents -> joint execution -> Evaluator -> feedback -> Recruiter (adjust team) -> ...",
      "consequences": {
        "benefits": [
          "Team matches the task instead of the task being squeezed into a fixed team.",
          "Adaptive composition closes the gap as the task evolves.",
          "Recruiter prompt is the only place the meta-policy lives."
        ],
        "liabilities": [
          "Recruiter quality is the bottleneck; a bad recruiter produces bad teams.",
          "Run-time team generation is non-deterministic; reproducibility suffers.",
          "Adjustment between iterations can churn (replace too aggressively)."
        ]
      },
      "constrains": "No role may be instantiated outside the recruiter; agents may not unilaterally co-opt or invent peers.",
      "known_uses": [
        {
          "system": "AgentVerse",
          "note": "Recruiter agent generates expert descriptions per goal; team composition adjusted across iterations.",
          "status": "available",
          "url": "https://github.com/OpenBMB/AgentVerse"
        },
        {
          "system": "AutoAgents",
          "note": "Drafting stage with three meta-agents (Planner, Agent Observer, Plan Observer) synthesises the team.",
          "status": "available",
          "url": "https://github.com/Link-AGI/AutoAgents"
        }
      ],
      "related": [
        {
          "pattern": "supervisor",
          "relation": "complements"
        },
        {
          "pattern": "role-assignment",
          "relation": "generalises",
          "note": "Role assignment is the design-time special case."
        },
        {
          "pattern": "mixture-of-experts-routing",
          "relation": "alternative-to",
          "note": "MoE routes to a fixed expert pool; this constructs the experts."
        },
        {
          "pattern": "orchestrator-workers",
          "relation": "complements"
        },
        {
          "pattern": "evaluator-optimizer",
          "relation": "uses",
          "note": "Evaluation step drives team adjustment."
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "AgentVerse: Facilitating Multi-Agent Collaboration and Exploring Emergent Behaviors",
          "authors": "Chen et al.",
          "year": 2023,
          "url": "https://arxiv.org/abs/2308.10848"
        },
        {
          "type": "paper",
          "title": "AutoAgents: A Framework for Automatic Agent Generation",
          "authors": "Chen et al.",
          "year": 2023,
          "url": "https://arxiv.org/abs/2309.17288"
        }
      ],
      "status_in_practice": "experimental",
      "tags": [
        "multi-agent",
        "dynamic",
        "china-origin",
        "agentverse",
        "autoagents"
      ]
    },
    {
      "id": "handoff",
      "name": "Handoff",
      "aliases": [
        "Agent Handoff",
        "Transfer",
        "Routine Switch"
      ],
      "category": "multi-agent",
      "intent": "Transfer the active conversation from one agent to another, carrying context across the switch.",
      "context": "A specialist agent realises it is the wrong agent for the current request and needs to pass control to another specialist.",
      "problem": "Without a handoff primitive, mid-conversation reroutes either restart context or stay stuck with the wrong specialist.",
      "forces": [
        "Context transfer is lossy; what travels?",
        "Handoff loops (A→B→A→B) are a real failure.",
        "User experience must signal the change without disorienting."
      ],
      "solution": "Define a handoff tool. The current agent invokes it with target agent and a context summary. The target agent receives the summary plus the original conversation and continues from there. Loop detection prevents thrash.",
      "consequences": {
        "benefits": [
          "Specialisation without supervisor overhead on every turn.",
          "User-visible continuity."
        ],
        "liabilities": [
          "Context summary fidelity bounds quality.",
          "Loop detection is its own code path."
        ]
      },
      "constrains": "Handoffs happen only via the registered tool; out-of-band agent switches are forbidden.",
      "known_uses": [
        {
          "system": "OpenAI Swarm primitives",
          "status": "available",
          "url": "https://github.com/openai/swarm"
        }
      ],
      "related": [
        {
          "pattern": "supervisor",
          "relation": "alternative-to"
        },
        {
          "pattern": "role-assignment",
          "relation": "complements"
        },
        {
          "pattern": "inter-agent-communication",
          "relation": "composes-with"
        },
        {
          "pattern": "conversation-handoff",
          "relation": "generalises"
        },
        {
          "pattern": "cross-domain-agent-network",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "repo",
          "title": "openai/swarm",
          "url": "https://github.com/openai/swarm"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "multi-agent",
        "handoff"
      ]
    },
    {
      "id": "hierarchical-agents",
      "name": "Hierarchical Agents",
      "aliases": [
        "Manager-Worker Tree",
        "Agent Hierarchy"
      ],
      "category": "multi-agent",
      "intent": "Organise agents in a tree where higher-level agents decompose tasks for lower-level agents, recursively.",
      "context": "Tasks decomposable across multiple levels of granularity (project → epic → ticket → step); a flat supervisor would have too many direct reports.",
      "problem": "Flat supervisor patterns scale poorly: one supervisor with N specialists has prompt complexity that grows with N.",
      "forces": [
        "Tree depth trades latency for clarity.",
        "Inter-level communication needs a contract.",
        "Failure recovery: which level retries?"
      ],
      "solution": "Each non-leaf agent receives a task, decomposes it, and dispatches sub-tasks to its children. Children may be specialists (leaves) or further managers. Results bubble up; each manager synthesises its children's outputs. Bounded depth and breadth prevent runaway hierarchies.",
      "consequences": {
        "benefits": [
          "Scales to deep decomposition.",
          "Each level has clear responsibility."
        ],
        "liabilities": [
          "Latency multiplies with depth.",
          "Coordination bugs become hard to localise."
        ]
      },
      "constrains": "An agent communicates only with its parent and children; cross-tree communication is forbidden.",
      "known_uses": [
        {
          "system": "AutoGen GroupChat with nested groups",
          "status": "available"
        },
        {
          "system": "CrewAI hierarchical processes",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "supervisor",
          "relation": "generalises"
        },
        {
          "pattern": "orchestrator-workers",
          "relation": "specialises"
        },
        {
          "pattern": "goal-decomposition",
          "relation": "complements"
        },
        {
          "pattern": "agent-as-tool-embedding",
          "relation": "generalises"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "AutoGen multi-agent docs",
          "url": "https://microsoft.github.io/autogen/"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "multi-agent",
        "hierarchy"
      ]
    },
    {
      "id": "inner-committee",
      "name": "Inner Committee",
      "aliases": [
        "Multi-Persona Single Model",
        "Self-as-Multiple-Roles"
      ],
      "category": "multi-agent",
      "intent": "Run one model under several distinct personas (executor, critic, planner) within a single agent loop.",
      "context": "The agent benefits from review or planning split off from execution, but spinning up genuine separate model instances is overkill.",
      "problem": "Mono-persona prompts conflate roles and produce muddled outputs that are neither plans nor critiques nor execution.",
      "forces": [
        "Persona switching costs a prompt and a context reset.",
        "The model has the same blind spots in each persona; true diversity is limited.",
        "Persona drift in long conversations dilutes the role separation."
      ],
      "solution": "Define explicit personas (system prompts) for each role: planner, executor, critic. The agent loop steps through personas at fixed points. Each persona sees only the inputs its role needs, not the full context of the others.",
      "consequences": {
        "benefits": [
          "Cheaper than running multiple model instances.",
          "Surprisingly effective for self-critique and self-modification gating."
        ],
        "liabilities": [
          "Same model means correlated errors; reflexion suffers from this.",
          "Persona prompts add up to a non-trivial token budget."
        ]
      },
      "constrains": "Each persona may only act within its declared role; cross-persona reasoning is forbidden in a single prompt.",
      "known_uses": [
        {
          "system": "Sparrot",
          "note": "Executor + critic + planner; one model worn three ways.",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "inner-critic",
          "relation": "specialises"
        },
        {
          "pattern": "debate",
          "relation": "alternative-to"
        },
        {
          "pattern": "role-assignment",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "Marco Nissen, Working with the models",
          "year": 2026,
          "url": "https://substack.com/@marconissen"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "multi-persona",
        "single-model"
      ]
    },
    {
      "id": "inter-agent-communication",
      "name": "Inter-Agent Communication",
      "aliases": [
        "A2A",
        "Agent-to-Agent Protocol"
      ],
      "category": "multi-agent",
      "intent": "Define a protocol for agents to exchange tasks, capabilities, and results across process or vendor boundaries.",
      "context": "Agents from different teams or vendors need to cooperate but speak different internal shapes.",
      "problem": "Bespoke point-to-point integrations do not scale; each new agent pair requires fresh glue.",
      "forces": [
        "Capability discovery: how does agent A know what agent B can do?",
        "Auth and trust across organisational boundaries.",
        "Versioning: protocols evolve faster than legacy agents."
      ],
      "solution": "Adopt a protocol (Google A2A, Anthropic MCP, in-house equivalents) that covers capability advertisement, task delegation, result return, and auth. Agents advertise capabilities; clients discover and invoke; results round-trip in typed envelopes.",
      "consequences": {
        "benefits": [
          "Cross-team and cross-vendor reuse.",
          "Capability inventory becomes inspectable."
        ],
        "liabilities": [
          "Protocol overhead.",
          "Schema versioning becomes everyone's problem."
        ]
      },
      "constrains": "Agents may only invoke each other through the advertised protocol; out-of-band calls are forbidden.",
      "known_uses": [
        {
          "system": "Google A2A Protocol",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "mcp",
          "relation": "complements"
        },
        {
          "pattern": "handoff",
          "relation": "composes-with"
        },
        {
          "pattern": "supervisor",
          "relation": "complements"
        },
        {
          "pattern": "orchestrator-workers",
          "relation": "complements"
        },
        {
          "pattern": "communicative-dehallucination",
          "relation": "used-by"
        },
        {
          "pattern": "cross-domain-agent-network",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "A2A Protocol",
          "year": 2025,
          "url": "https://a2a-protocol.org/"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "a2a",
        "protocol",
        "interop"
      ]
    },
    {
      "id": "lead-researcher",
      "name": "Lead Researcher",
      "aliases": [
        "Research Orchestrator",
        "Lead-and-Subagents"
      ],
      "category": "multi-agent",
      "intent": "A lead agent writes a research plan and dispatches parallel sub-agents that fan out for breadth-first information gathering, then merges results.",
      "context": "Open-ended research tasks where breadth-first exploration across many sources beats depth-first single-thread reasoning.",
      "problem": "Single-agent research is bottlenecked on serial token generation; generic orchestrator-workers under-specifies how to handle research-shaped tasks (parallelism, source diversity, synthesis).",
      "forces": [
        "Sub-agent count vs cost.",
        "Synthesis quality bounded by lead agent's reasoning over fragmented results.",
        "Information overlap across sub-agents is wasted compute."
      ],
      "solution": "Lead agent receives the user query, plans a set of parallel research questions, and dispatches each to a sub-agent. Each sub-agent searches independently and returns structured findings to the lead. The lead reads the returned findings and synthesises the answer; if synthesis reveals gaps, the lead spawns additional sub-agents.",
      "consequences": {
        "benefits": [
          "Breadth-first parallelism cuts wall-clock time.",
          "Inspectable scratchpad makes the research auditable."
        ],
        "liabilities": [
          "Sub-agent overlap and redundancy.",
          "Synthesis is the new bottleneck."
        ]
      },
      "constrains": "Sub-agents return findings only to the lead; peer-to-peer communication is forbidden.",
      "known_uses": [
        {
          "system": "Anthropic Multi-Agent Research",
          "status": "available",
          "url": "https://www.anthropic.com/engineering/multi-agent-research-system"
        },
        {
          "system": "OpenAI Deep Research",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "orchestrator-workers",
          "relation": "specialises"
        },
        {
          "pattern": "parallelization",
          "relation": "uses"
        },
        {
          "pattern": "supervisor",
          "relation": "specialises"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "How we built our multi-agent research system",
          "authors": "Anthropic",
          "year": 2025,
          "url": "https://www.anthropic.com/engineering/multi-agent-research-system"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "multi-agent",
        "research",
        "lead-subagent"
      ]
    },
    {
      "id": "orchestrator-workers",
      "name": "Orchestrator-Workers",
      "aliases": [
        "Dynamic Decomposition",
        "Orchestrator-Subagents"
      ],
      "category": "multi-agent",
      "intent": "An orchestrator dynamically breaks a task into subtasks at runtime and delegates each to a worker LLM, then synthesises results.",
      "context": "The subtasks needed are not known in advance and depend on the task; coding tasks where the number of files to change varies are the canonical example.",
      "problem": "Static decomposition (Plan-and-Execute, Prompt Chaining) cannot handle tasks whose shape is data-dependent.",
      "forces": [
        "The orchestrator must reason at a higher level than any worker.",
        "Workers should not have to know they are workers.",
        "Synthesis must reconcile conflicting worker outputs."
      ],
      "solution": "Orchestrator agent receives the task, decides at runtime what subtasks to spawn, hands each to a worker (often via tool call), collects results, and synthesises the final output. Worker count and roles can vary per task.",
      "consequences": {
        "benefits": [
          "Handles tasks with data-dependent decomposition.",
          "Workers stay simple; complexity lives in the orchestrator."
        ],
        "liabilities": [
          "Orchestrator failure is unrecoverable without retry logic.",
          "Token cost scales with worker count; budget awareness matters."
        ]
      },
      "constrains": "Workers see only their assigned subtask; only the orchestrator has the global view.",
      "known_uses": [
        {
          "system": "Anthropic Building Effective Agents (Workflow #4)",
          "status": "available"
        },
        {
          "system": "Claude Code subagents",
          "status": "available"
        },
        {
          "system": "Anthropic Multi-Agent Research",
          "status": "available"
        },
        {
          "system": "OpenAI Deep Research",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "supervisor",
          "relation": "alternative-to"
        },
        {
          "pattern": "plan-and-execute",
          "relation": "alternative-to"
        },
        {
          "pattern": "subagent-isolation",
          "relation": "generalises"
        },
        {
          "pattern": "lead-researcher",
          "relation": "generalises"
        },
        {
          "pattern": "inter-agent-communication",
          "relation": "complements"
        },
        {
          "pattern": "hierarchical-agents",
          "relation": "generalises"
        },
        {
          "pattern": "dynamic-expert-recruitment",
          "relation": "complements"
        },
        {
          "pattern": "agent-as-tool-embedding",
          "relation": "generalises"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "Anthropic: Building Effective Agents",
          "year": 2024,
          "url": "https://www.anthropic.com/research/building-effective-agents"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "multi-agent",
        "orchestrator"
      ]
    },
    {
      "id": "role-assignment",
      "name": "Role Assignment",
      "aliases": [
        "Persona Roles",
        "Agent Crew",
        "Specialist Roles"
      ],
      "category": "multi-agent",
      "intent": "Assign each agent a named role (researcher, writer, critic, planner) with a role-specific prompt, tool palette, and acceptance criteria.",
      "context": "Multi-agent systems where personas need to be distinct enough that the user (and the system) can reason about who did what.",
      "problem": "Generic agents drift toward similarity; without explicit roles, contributions blur and review becomes hard.",
      "forces": [
        "Role definitions can ossify into bureaucracy.",
        "Cross-role handoffs need typed contracts.",
        "Role count multiplies prompt-engineering effort."
      ],
      "solution": "Define each role with a system prompt naming its responsibility and constraints, a tool palette scoped to its role, and acceptance criteria for outputs it produces. Workflow assigns tasks to roles. Outputs are evaluated against the role's acceptance criteria.",
      "consequences": {
        "benefits": [
          "Outputs are attributable and reviewable per role.",
          "Specialisation improves quality on each role's task."
        ],
        "liabilities": [
          "Bureaucratic overhead.",
          "Role drift over long sessions."
        ]
      },
      "constrains": "An agent operates only within its role's constraints and tool palette; cross-role action is forbidden.",
      "known_uses": [
        {
          "system": "CrewAI",
          "status": "available",
          "url": "https://www.crewai.com/"
        },
        {
          "system": "AutoGen named agents",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "supervisor",
          "relation": "complements"
        },
        {
          "pattern": "inner-committee",
          "relation": "alternative-to"
        },
        {
          "pattern": "handoff",
          "relation": "complements"
        },
        {
          "pattern": "mixture-of-experts-routing",
          "relation": "complements"
        },
        {
          "pattern": "autogen-conversational",
          "relation": "complements"
        },
        {
          "pattern": "camel-role-playing",
          "relation": "generalises"
        },
        {
          "pattern": "sop-encoded-multi-agent",
          "relation": "used-by"
        },
        {
          "pattern": "dynamic-expert-recruitment",
          "relation": "specialises"
        },
        {
          "pattern": "cross-domain-agent-network",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "CrewAI docs",
          "url": "https://docs.crewai.com"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "multi-agent",
        "roles",
        "crew"
      ]
    },
    {
      "id": "sop-encoded-multi-agent",
      "name": "SOP-Encoded Multi-Agent Workflow",
      "aliases": [
        "Standard Operating Procedure Multi-Agent",
        "Assembly-Line Agents",
        "Software-Company Agents"
      ],
      "category": "multi-agent",
      "intent": "Encode a human Standard Operating Procedure (roles, ordered phases, standardised hand-off artefacts) into a multi-agent pipeline so that agents communicate through structured documents rather than free-form chat.",
      "context": "A complex, repeatable task (software development, document production, regulatory submission) already has a well-known human procedure with named roles and defined deliverables between them.",
      "problem": "Free-form multi-agent chat hallucinates context, drifts off-task, and produces no auditable trail; without a procedure, agents redo each other's work or skip steps.",
      "forces": [
        "The model is good at playing a role; it is bad at inventing the workflow that connects roles.",
        "Free chat between agents is cheap to write but expensive to debug.",
        "Defined artefacts (PRD, design doc, test plan) compress context across role hand-offs.",
        "Rigid SOPs lose the model's ability to adapt; the SOP has to leave room for the role to think."
      ],
      "solution": "Encode the SOP as: (a) a fixed set of named roles each with role-specific prompt and tool palette, (b) an ordered sequence of phases, (c) a typed artefact contract for each phase boundary (e.g. PRD → design doc → code → test plan → user manual). Agents communicate via the artefacts; a shared message pool plus a subscription filter routes only relevant context to each role.",
      "structure": "Role_A -- artefact_1 --> Role_B -- artefact_2 --> Role_C ... ; shared message pool; per-role subscription filter.",
      "consequences": {
        "benefits": [
          "Auditable trail of artefacts at every phase boundary.",
          "Specialised role prompts beat one mega-prompt on long tasks.",
          "Standardised artefact schemas catch ambiguity at the hand-off, not at the end."
        ],
        "liabilities": [
          "Designing the artefact contract is the real work; bad contracts propagate to every role.",
          "Procedure rigidity makes the system brittle when the task does not match the SOP.",
          "Token cost scales with the number of phases."
        ]
      },
      "constrains": "Agents may not communicate outside the artefact contract; a role's output that does not conform to the next role's expected schema is rejected at the phase boundary.",
      "known_uses": [
        {
          "system": "MetaGPT",
          "note": "Five roles (Product Manager, Architect, Project Manager, Engineer, QA) producing standardised artefacts in an assembly-line pipeline.",
          "status": "available",
          "url": "https://github.com/geekan/MetaGPT"
        },
        {
          "system": "ChatDev",
          "note": "CEO/CTO/Programmer/Reviewer/Tester roles in a phased pipeline with artefact hand-offs.",
          "status": "available",
          "url": "https://github.com/OpenBMB/ChatDev"
        }
      ],
      "related": [
        {
          "pattern": "role-assignment",
          "relation": "uses"
        },
        {
          "pattern": "supervisor",
          "relation": "complements"
        },
        {
          "pattern": "blackboard",
          "relation": "uses",
          "note": "Shared message pool plus subscription filter is a blackboard variant."
        },
        {
          "pattern": "spec-first-agent",
          "relation": "complements",
          "note": "The SOP is itself a spec for the multi-agent system."
        },
        {
          "pattern": "hero-agent",
          "relation": "alternative-to"
        },
        {
          "pattern": "structured-output",
          "relation": "uses"
        },
        {
          "pattern": "chat-chain",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework",
          "authors": "Hong et al.",
          "year": 2023,
          "url": "https://arxiv.org/abs/2308.00352"
        },
        {
          "type": "paper",
          "title": "ChatDev: Communicative Agents for Software Development",
          "authors": "Qian et al.",
          "year": 2023,
          "url": "https://arxiv.org/abs/2307.07924"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "multi-agent",
        "workflow",
        "china-origin",
        "metagpt",
        "chatdev"
      ]
    },
    {
      "id": "subagent-isolation",
      "name": "Subagent Isolation",
      "aliases": [
        "Worktree Subagent",
        "Parallel Subagent",
        "Isolated Worker"
      ],
      "category": "multi-agent",
      "intent": "Run subagents in isolated workspaces so their writes do not collide and parallelism is safe.",
      "context": "Coding agents that delegate to multiple subagents for parallel work; without isolation, subagents fight over the same files.",
      "problem": "Subagents writing to the same workspace race each other; one's edits are clobbered by another's.",
      "forces": [
        "Isolation has setup cost (new worktree, branch, container).",
        "Reconciling work back to the main workspace is its own problem.",
        "Excessive isolation prevents subagents from seeing each other's progress when that would help."
      ],
      "solution": "Each subagent runs in its own workspace (git worktree, container, branch, sandbox). The supervisor reconciles results back to the main workspace on completion (merge, cherry-pick, replay). Only one workspace can land changes at a time.",
      "consequences": {
        "benefits": [
          "True parallelism without write collisions.",
          "Failed subagents leave their workspace as evidence."
        ],
        "liabilities": [
          "Setup latency.",
          "Reconciliation conflicts."
        ]
      },
      "constrains": "Subagents may only write to their own isolated workspace; cross-workspace writes are forbidden.",
      "known_uses": [
        {
          "system": "Claude Code subagent + git worktree",
          "status": "available"
        },
        {
          "system": "Devin sessions",
          "status": "available",
          "url": "https://devin.ai/"
        },
        {
          "system": "Cursor parallel agents",
          "status": "available",
          "url": "https://cursor.com/"
        },
        {
          "system": "OpenHands",
          "status": "available",
          "url": "https://github.com/All-Hands-AI/OpenHands"
        }
      ],
      "related": [
        {
          "pattern": "orchestrator-workers",
          "relation": "specialises"
        },
        {
          "pattern": "sandbox-isolation",
          "relation": "composes-with"
        },
        {
          "pattern": "llm-compiler",
          "relation": "composes-with"
        },
        {
          "pattern": "agent-as-tool-embedding",
          "relation": "complements"
        },
        {
          "pattern": "unbounded-subagent-spawn",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Claude Code subagents",
          "year": 2025,
          "url": "https://docs.claude.com/en/docs/claude-code/sub-agents"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "multi-agent",
        "isolation",
        "parallel"
      ],
      "applicability": {
        "use_when": [
          "A bounded sub-task has its own tool palette, prompt, or model.",
          "The parent's context should not bloat with the sub-agent's intermediate turns.",
          "Sub-agents can run in parallel and their failures must be containable."
        ],
        "do_not_use_when": [
          "The parent must observe the sub's intermediate state for debugging.",
          "The sub is a single-shot operation; Tool Use suffices without an agent loop.",
          "Recursive nesting depth is unbounded; cost will spiral."
        ]
      },
      "diagram": {
        "type": "sequence",
        "mermaid": "sequenceDiagram\n  participant Parent\n  participant Sub\n  participant Tool\n  Parent->>Sub: task\n  Note right of Sub: own context, own tools, own step budget\n  loop sub-agent loop\n    Sub->>Tool: action\n    Tool-->>Sub: result\n  end\n  Sub-->>Parent: structured result only",
        "caption": "Subagent Isolation wraps a full agent loop behind a function-shaped boundary so the parent only sees the return value."
      },
      "example_scenario": "A research agent is asked to write a market report. Instead of doing every sub-task in its main loop, it spawns three sub-agents in parallel: one to research competitors, one to pull pricing data, one to summarise news. Each sub-agent has its own tool set and step budget. The main agent only sees the three structured results that come back, not the dozens of intermediate web searches each sub-agent ran.",
      "variants": [
        {
          "name": "Function-call wrapper",
          "summary": "The sub-agent is exposed as a function the parent calls like any other tool. The parent waits synchronously for the structured result.",
          "distinguishing_factor": "synchronous, in-process",
          "when_to_use": "Default. Simplest implementation when sub-agents are quick (seconds) and parent must wait for the result."
        },
        {
          "name": "Async / queue-backed",
          "summary": "The sub-agent runs out-of-band on a worker; the parent enqueues a task and polls or subscribes for the result.",
          "distinguishing_factor": "decoupled execution",
          "when_to_use": "Sub-agent runs are slow (minutes-to-hours) or the parent should make progress on other work in the meantime."
        },
        {
          "name": "Separate-process sandbox",
          "summary": "The sub-agent runs in a separate OS process or container with its own filesystem, network, and credentials.",
          "distinguishing_factor": "OS-level isolation",
          "when_to_use": "Sub-agent runs untrusted code or third-party tools that must not see the parent's secrets.",
          "see_also": "sandbox-isolation"
        }
      ]
    },
    {
      "id": "supervisor",
      "name": "Supervisor",
      "aliases": [
        "Multi-Agent Supervisor",
        "Lane Supervisor"
      ],
      "category": "multi-agent",
      "intent": "Place a coordinating agent above a set of specialised agents and route work to them.",
      "context": "Different request types benefit from different system prompts, tool palettes, and models; routing alone is too coarse because the lanes themselves want their own loop.",
      "problem": "A single agent that handles everything has either too few tools (limiting capability) or too many (confusing the model).",
      "forces": [
        "Adding a supervisor layer adds a model call.",
        "Inter-agent communication needs a protocol.",
        "Specialisation reduces transfer learning across requests."
      ],
      "solution": "A supervisor classifies requests and dispatches them to a specialised agent. Each specialist has its own prompt, tools, and possibly its own model. The supervisor may receive results back and decide whether to escalate or respond.",
      "consequences": {
        "benefits": [
          "Each lane can be tuned and tested in isolation.",
          "Capability grows by adding lanes, not by enlarging one prompt."
        ],
        "liabilities": [
          "Multi-agent before simpler patterns are running is decoration.",
          "Coordination failures are often invisible until production."
        ]
      },
      "constrains": "Specialists may only act within their declared scope; the supervisor owns dispatch and aggregation.",
      "known_uses": [
        {
          "system": "Bobbin (Stash2Go)",
          "note": "agent_v2.py + supervisor.py implement the lane-supervisor pattern.",
          "status": "available"
        },
        {
          "system": "LangGraph Supervisor",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "routing",
          "relation": "uses"
        },
        {
          "pattern": "orchestrator-workers",
          "relation": "alternative-to"
        },
        {
          "pattern": "hierarchical-agents",
          "relation": "specialises"
        },
        {
          "pattern": "blackboard",
          "relation": "alternative-to"
        },
        {
          "pattern": "lead-researcher",
          "relation": "generalises"
        },
        {
          "pattern": "inter-agent-communication",
          "relation": "complements"
        },
        {
          "pattern": "role-assignment",
          "relation": "complements"
        },
        {
          "pattern": "swarm",
          "relation": "alternative-to"
        },
        {
          "pattern": "hero-agent",
          "relation": "alternative-to"
        },
        {
          "pattern": "handoff",
          "relation": "alternative-to"
        },
        {
          "pattern": "mixture-of-experts-routing",
          "relation": "complements"
        },
        {
          "pattern": "autogen-conversational",
          "relation": "alternative-to"
        },
        {
          "pattern": "sop-encoded-multi-agent",
          "relation": "complements"
        },
        {
          "pattern": "chat-chain",
          "relation": "alternative-to"
        },
        {
          "pattern": "dynamic-expert-recruitment",
          "relation": "complements"
        },
        {
          "pattern": "outer-inner-agent-loop",
          "relation": "complements"
        },
        {
          "pattern": "cross-domain-agent-network",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "LangGraph Multi-Agent Supervisor",
          "url": "https://langchain-ai.github.io/langgraph/tutorials/multi_agent/agent_supervisor/"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "multi-agent",
        "supervisor"
      ],
      "example_scenario": "A customer-service platform routes incoming chats. A supervisor agent classifies each request: billing, technical, or sales. It dispatches each to the matching specialist agent, which has its own prompt, tool set, and ticket-system access. The supervisor doesn't try to be good at all three roles — it just routes and aggregates."
    },
    {
      "id": "swarm",
      "name": "Swarm",
      "aliases": [
        "Society of Mind",
        "Peer Agents",
        "Decentralised Multi-Agent"
      ],
      "category": "multi-agent",
      "intent": "Run many peer agents that interact directly without a central supervisor, achieving emergent coordination.",
      "context": "Tasks where centralised coordination is a bottleneck or where the problem benefits from many independent attempts (simulation, exploration).",
      "problem": "Centralised supervisors become bottlenecks at scale; some tasks (negotiation, simulation, exploration) need agent-to-agent dynamics.",
      "forces": [
        "Emergent behaviour can surprise designers; debugging is hard.",
        "Communication topology (broadcast? gossip? pub/sub?) is a design choice.",
        "Termination is non-trivial without a supervisor."
      ],
      "solution": "Agents interact via a shared message bus, chat, or environment. Each agent has its own goals and policies. No central coordinator; convergence is emergent. Termination conditions are environment-level (time budget, consensus threshold, external trigger).",
      "consequences": {
        "benefits": [
          "Scales horizontally.",
          "Suits negotiation, market simulation, exploration."
        ],
        "liabilities": [
          "Hard to debug; emergent failures are global.",
          "Cost can balloon without supervision."
        ]
      },
      "constrains": "Agents communicate only via the shared channel; out-of-band coordination is forbidden.",
      "known_uses": [
        {
          "system": "OpenAI Swarm (deprecated; succeeded by OpenAI Agents SDK)",
          "status": "deprecated",
          "url": "https://github.com/openai/swarm"
        },
        {
          "system": "Stanford Generative Agents simulation",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "debate",
          "relation": "specialises"
        },
        {
          "pattern": "supervisor",
          "relation": "alternative-to"
        },
        {
          "pattern": "blackboard",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "repo",
          "title": "openai/swarm",
          "url": "https://github.com/openai/swarm"
        }
      ],
      "status_in_practice": "experimental",
      "tags": [
        "multi-agent",
        "swarm",
        "emergent"
      ]
    },
    {
      "id": "disambiguation",
      "name": "Disambiguation",
      "aliases": [
        "Clarifying Questions",
        "Confirmation Loop",
        "Ask About Ambiguity"
      ],
      "category": "planning-control-flow",
      "intent": "Have the agent ask a clarifying question before acting on an ambiguous request.",
      "context": "User requests are often underspecified; acting on the wrong interpretation costs more than asking.",
      "problem": "Agents that always act produce confidently wrong results on ambiguous inputs.",
      "forces": [
        "Asking too often is annoying.",
        "Asking too rarely produces wrong work.",
        "The model must detect ambiguity, which is itself hard."
      ],
      "solution": "Detect ambiguity via low-confidence intent classification or explicit ambiguity rubric. When detected, ask one focused question and wait for the answer before acting. Phrase the question with the most-likely interpretation as a default.",
      "consequences": {
        "benefits": [
          "Quality improvement on ambiguous inputs.",
          "User feels in control."
        ],
        "liabilities": [
          "Latency penalty.",
          "Conversational drag if overused."
        ]
      },
      "constrains": "Below the confidence threshold the agent must ask; it is forbidden to guess.",
      "known_uses": [
        {
          "system": "Cursor / Claude Code clarifying questions",
          "status": "available",
          "url": "https://cursor.com/"
        },
        {
          "system": "Production support chatbots",
          "status": "available"
        },
        {
          "system": "ChatGPT clarifying questions",
          "status": "available",
          "url": "https://chat.openai.com/"
        },
        {
          "system": "Claude clarifying questions",
          "status": "available",
          "url": "https://claude.com/"
        }
      ],
      "related": [
        {
          "pattern": "routing",
          "relation": "uses"
        },
        {
          "pattern": "human-in-the-loop",
          "relation": "specialises"
        },
        {
          "pattern": "confidence-reporting",
          "relation": "complements"
        },
        {
          "pattern": "communicative-dehallucination",
          "relation": "generalises"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "ClariQ: Asking Clarification Questions in Conversational Information Seeking",
          "authors": "Aliannejadi, Zamani et al.",
          "year": 2020,
          "url": "https://arxiv.org/abs/2009.11352"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "ux",
        "clarification"
      ]
    },
    {
      "id": "event-driven-agent",
      "name": "Event-Driven Agent",
      "aliases": [
        "Event Subscriber",
        "Reactive Agent",
        "Webhook Agent"
      ],
      "category": "planning-control-flow",
      "intent": "Trigger the agent on external events (webhooks, message queues, file changes) instead of user requests or schedules.",
      "context": "Tasks that should happen in response to something happening (PR opened, message received, alert fired) — not on a clock and not on demand.",
      "problem": "Pulling for state on a schedule wastes effort; pushing on events is timely and efficient when an event source exists.",
      "forces": [
        "Event source reliability.",
        "Burst handling: event storms can overwhelm.",
        "Dedup of events that fire multiple times."
      ],
      "solution": "Subscribe to event source (webhook, queue, watcher). On event, validate, deduplicate, and invoke the agent with event payload as input. Apply rate limiting and idempotency. Acknowledge after successful processing.",
      "consequences": {
        "benefits": [
          "Timely action without polling cost.",
          "Composes with downstream automations naturally."
        ],
        "liabilities": [
          "Event-source failures stop the agent silently.",
          "Idempotency is its own engineering."
        ]
      },
      "constrains": "The agent runs only on validated events; spurious or duplicate events are filtered.",
      "known_uses": [
        {
          "system": "GitHub Actions agent triggers",
          "status": "available",
          "url": "https://docs.github.com/en/actions"
        },
        {
          "system": "Pub/Sub-driven agent platforms",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "scheduled-agent",
          "relation": "alternative-to"
        },
        {
          "pattern": "rate-limiting",
          "relation": "complements"
        },
        {
          "pattern": "agent-resumption",
          "relation": "complements"
        },
        {
          "pattern": "salience-triggered-output",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "AutoGen",
          "authors": "Microsoft",
          "year": 2025,
          "url": "https://microsoft.github.io/autogen/stable/"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "events",
        "reactive",
        "webhook"
      ]
    },
    {
      "id": "exploration-exploitation",
      "name": "Exploration vs Exploitation",
      "aliases": [
        "Exploration & Discovery",
        "Curiosity-Driven Action"
      ],
      "category": "planning-control-flow",
      "intent": "Balance taking the best-known action (exploit) with trying alternatives that might be better (explore).",
      "context": "Long-running agents that learn from outcomes (skill libraries, multi-armed-bandit-style choice across tools or strategies).",
      "problem": "Pure exploitation locks the agent into local optima; pure exploration wastes effort on unproven options.",
      "forces": [
        "Exploration costs (failed attempts) are real.",
        "Reward signals must exist to shape the trade-off.",
        "Schedule (epsilon-greedy, UCB, Thompson sampling) is its own design."
      ],
      "solution": "Pick a strategy: epsilon-greedy (exploit with probability 1-ε), upper-confidence-bound (favor under-explored options with bonus), Thompson sampling (sample from posterior). Apply across tools, strategies, prompts. Track outcomes and adjust.",
      "consequences": {
        "benefits": [
          "Avoids local optima.",
          "Improves with experience."
        ],
        "liabilities": [
          "Requires reward signal.",
          "Strategy choice is empirical."
        ]
      },
      "constrains": "The agent's action distribution must follow the chosen strategy; unconditional exploitation is forbidden.",
      "known_uses": [
        {
          "system": "Voyager (Minecraft skill discovery)",
          "status": "available",
          "url": "https://voyager.minedojo.org/"
        },
        {
          "system": "Sparrot drives (curiosity drive)",
          "status": "available"
        },
        {
          "system": "Gulli ch.21 Exploration & Discovery",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "lats",
          "relation": "complements"
        },
        {
          "pattern": "skill-library",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "book",
          "title": "Agentic Design Patterns (Gulli)",
          "year": 2025,
          "url": "https://www.amazon.com/Agentic-Design-Patterns-Antonio-Gulli/dp/B0FFTBPNM4"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "planning",
        "rl",
        "exploration"
      ]
    },
    {
      "id": "goal-decomposition",
      "name": "Goal Decomposition",
      "aliases": [
        "Hierarchical Task Network",
        "Goal Setting & Monitoring",
        "Task Tree"
      ],
      "category": "planning-control-flow",
      "intent": "Decompose a goal into sub-goals recursively until each leaf is directly actionable.",
      "context": "Long-horizon tasks where the top-level goal cannot be acted on directly; intermediate scaffolding is needed.",
      "problem": "Without explicit decomposition, the agent attacks the goal in one shot and produces shallow work.",
      "forces": [
        "Decomposition depth: too shallow loses scaffolding; too deep loses the forest.",
        "Sub-goal independence affects parallelisation.",
        "Goal-monitoring at each level adds overhead."
      ],
      "solution": "Build a tree of goals. The root is the user's goal. Each non-leaf goal decomposes into sub-goals. Leaves are directly actionable steps. Monitor progress at each level; surface stuck branches. Distinct from least-to-most (which is sequential) by allowing parallel sibling goals.",
      "consequences": {
        "benefits": [
          "Long-horizon tasks become tractable.",
          "Progress is visible at multiple granularities."
        ],
        "liabilities": [
          "Tree construction is itself work.",
          "Stuck branches at deep levels are easy to lose."
        ]
      },
      "constrains": "Action is taken only at leaf goals; non-leaf goals must decompose further before action.",
      "known_uses": [
        {
          "system": "Classical AI Hierarchical Task Networks",
          "status": "available"
        },
        {
          "system": "Gulli ch.20 Goal Setting & Monitoring",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "least-to-most",
          "relation": "alternative-to"
        },
        {
          "pattern": "hierarchical-agents",
          "relation": "complements"
        },
        {
          "pattern": "plan-and-execute",
          "relation": "specialises"
        }
      ],
      "references": [
        {
          "type": "book",
          "title": "Agentic Design Patterns (Gulli, ch. 20 Prioritization)",
          "year": 2025,
          "url": "https://www.amazon.com/Agentic-Design-Patterns-Antonio-Gulli/dp/B0FFTBPNM4"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "planning",
        "decomposition",
        "htn"
      ]
    },
    {
      "id": "lats",
      "name": "Language Agent Tree Search",
      "aliases": [
        "LATS",
        "MCTS for Agents",
        "Tree-Search Agent",
        "Backtracking Agent"
      ],
      "category": "planning-control-flow",
      "intent": "Lift the agent loop into a search tree with a learned value function and backtracking.",
      "context": "The task has multiple plausible reasoning paths and committing to the first one yields suboptimal answers.",
      "problem": "ReAct and Plan-and-Execute commit to a single chain; ambiguous problems benefit from exploring alternatives.",
      "forces": [
        "Search is expensive; the value function must be cheap.",
        "Branch ranking determines whether search beats greedy.",
        "Memory of failed branches must not leak into successful ones."
      ],
      "solution": "Apply Monte Carlo Tree Search (MCTS) to the agent loop. Each node is a partial trajectory. Expansion samples next thoughts/actions. Backpropagation updates a value estimate. Selection chooses the next node by UCT. The agent can backtrack from a failing branch instead of committing.",
      "consequences": {
        "benefits": [
          "Higher answer quality on hard / ambiguous tasks.",
          "Explicit exploration / exploitation trade-off."
        ],
        "liabilities": [
          "Token cost can be 5-10x ReAct.",
          "The value function is hard to train without supervision signals."
        ]
      },
      "constrains": "Each node may be expanded only by sampling actions consistent with the parent state.",
      "known_uses": [
        {
          "system": "Pure future for Stash2Go",
          "note": "Conversational answerer for ambiguous knitting questions ('can I substitute X for Y?').",
          "status": "pure-future"
        }
      ],
      "related": [
        {
          "pattern": "react",
          "relation": "uses"
        },
        {
          "pattern": "self-consistency",
          "relation": "complements"
        },
        {
          "pattern": "tree-of-thoughts",
          "relation": "specialises",
          "note": "LATS adds learned value function and MCTS-style search."
        },
        {
          "pattern": "exploration-exploitation",
          "relation": "complements"
        },
        {
          "pattern": "test-time-compute-scaling",
          "relation": "specialises"
        },
        {
          "pattern": "graph-of-thoughts",
          "relation": "complements"
        },
        {
          "pattern": "process-reward-model",
          "relation": "complements"
        },
        {
          "pattern": "automatic-workflow-search",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Language Agent Tree Search Unifies Reasoning, Acting, and Planning in Language Models",
          "authors": "Zhou, Yan, Shlapentokh-Rothman, Wang, Wang",
          "year": 2023,
          "url": "https://arxiv.org/abs/2310.04406"
        }
      ],
      "status_in_practice": "experimental",
      "tags": [
        "search",
        "mcts",
        "planning"
      ],
      "variants": [
        {
          "name": "MCTS with LLM rollout",
          "summary": "Monte Carlo Tree Search expands the reasoning tree; the LLM serves as the rollout policy for unexplored branches.",
          "distinguishing_factor": "MCTS dynamics",
          "when_to_use": "Default for the original LATS paper formulation."
        },
        {
          "name": "Beam-search variant",
          "summary": "Maintain top-k partial reasoning paths; expand each by one step; keep the top-k overall by score.",
          "distinguishing_factor": "beam discipline, not stochastic",
          "when_to_use": "Determinism matters or the search budget is small enough that exploration is expensive."
        },
        {
          "name": "Self-evaluated branch pruning",
          "summary": "After each expansion, the LLM scores the partial path; low-scoring branches are pruned aggressively.",
          "distinguishing_factor": "LLM is the value function",
          "when_to_use": "No external scorer is available; LLM self-evaluation is good enough on this task."
        }
      ]
    },
    {
      "id": "llm-compiler",
      "name": "LLMCompiler",
      "aliases": [
        "LLM Compiler",
        "Parallel ReWOO"
      ],
      "category": "planning-control-flow",
      "intent": "Take ReWOO's plan-as-DAG and run independent steps in parallel through a task-fetching dispatcher.",
      "context": "Sequential ReWOO leaves parallelism on the table when steps have no mutual dependency.",
      "problem": "Latency-sensitive agents waiting on tools sequentially are slower than they need to be.",
      "forces": [
        "Concurrency control: limits per provider, rate limits, fan-out costs.",
        "Failure isolation: one branch failing should not kill others.",
        "Joiner correctness: combining out-of-order results."
      ],
      "solution": "Three roles. Planner builds the dependency DAG. Task-Fetching Unit dispatches steps as their inputs become available, with bounded concurrency. Joiner assembles the final answer from the resolved DAG.",
      "consequences": {
        "benefits": [
          "End-to-end latency drops to the longest dependency chain.",
          "Cost remains roughly the same as ReWOO."
        ],
        "liabilities": [
          "Concurrency adds operational complexity.",
          "Planner mistakes are amplified by parallel execution."
        ]
      },
      "constrains": "Steps run only when all referenced upstream variables are resolved.",
      "known_uses": [
        {
          "system": "Pure future for Bobbin",
          "note": "Agent-lane plans with two unrelated tools could run concurrently.",
          "status": "pure-future"
        }
      ],
      "related": [
        {
          "pattern": "rewoo",
          "relation": "specialises"
        },
        {
          "pattern": "parallelization",
          "relation": "uses"
        },
        {
          "pattern": "parallel-tool-calls",
          "relation": "alternative-to"
        },
        {
          "pattern": "subagent-isolation",
          "relation": "composes-with"
        },
        {
          "pattern": "graph-of-thoughts",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "An LLM Compiler for Parallel Function Calling",
          "authors": "Kim, Moon, Tabrizi, Lee, Mahoney, Keutzer, Gholami",
          "year": 2023,
          "url": "https://arxiv.org/abs/2312.04511"
        }
      ],
      "status_in_practice": "experimental",
      "tags": [
        "planning",
        "parallel",
        "dag"
      ]
    },
    {
      "id": "map-reduce",
      "name": "MapReduce for Agents",
      "aliases": [
        "LLM×MapReduce",
        "Divide-and-Conquer"
      ],
      "category": "planning-control-flow",
      "intent": "Split an oversize task into independent chunks, process each in parallel, then aggregate.",
      "context": "The input does not fit the model's context window or the task naturally decomposes (per-row, per-document, per-section).",
      "problem": "Long-context models still degrade with size; chunked processing without coordination loses cross-chunk dependencies.",
      "forces": [
        "Naive chunking loses dependencies that span chunks.",
        "Conflicts between chunk answers need a resolver.",
        "Aggregation must not become its own context-window problem."
      ],
      "solution": "Map: split input into chunks; process each independently (per-chunk LLM call). Reduce: aggregate intermediate answers via a structured information protocol that surfaces dependencies, plus a confidence-calibration step to resolve conflicts.",
      "consequences": {
        "benefits": [
          "Scales to inputs orders of magnitude larger than the context window.",
          "Embarrassingly parallel; latency scales with chunk count, not input size."
        ],
        "liabilities": [
          "Cross-chunk dependencies must be modelled explicitly.",
          "Reduce stage can become the new bottleneck."
        ]
      },
      "constrains": "Each Map step sees only its chunk; cross-chunk reasoning is forbidden until the Reduce stage.",
      "known_uses": [
        {
          "system": "LLM×MapReduce paper implementation",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "parallelization",
          "relation": "specialises"
        },
        {
          "pattern": "self-consistency",
          "relation": "alternative-to",
          "note": "Both aggregate multiple LLM outputs but differ in whether inputs are the same."
        },
        {
          "pattern": "graphrag",
          "relation": "used-by"
        },
        {
          "pattern": "pipes-and-filters",
          "relation": "composes-with"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "LLM×MapReduce: Simplified Long-Sequence Processing using Large Language Models",
          "authors": "Zhou, Li, Chen, Wang et al.",
          "year": 2024,
          "url": "https://arxiv.org/abs/2410.09342"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "mapreduce",
        "long-context",
        "parallel"
      ]
    },
    {
      "id": "outer-inner-agent-loop",
      "name": "Outer-Inner Agent Loop",
      "aliases": [
        "Dual-Loop Agent",
        "Planner-Outside Executor-Inside",
        "Dispatch-and-Act Loop"
      ],
      "category": "planning-control-flow",
      "intent": "Run two nested loops: an outer planner agent decomposes the goal into subtasks; an inner executor runs a ReAct loop on each, and the outer can replan based on the inner's progress.",
      "context": "Long-horizon tasks where decomposition matters as much as execution, and where execution evidence may invalidate the plan partway through.",
      "problem": "A single agent loop conflates planning and acting: replanning interrupts execution, and execution drift is invisible to the planner until late.",
      "forces": [
        "Plans need a stable horizon; execution needs flexibility within steps.",
        "Replanning is expensive; doing it every turn is wasteful, doing it never is brittle.",
        "Inner-loop autonomy must not silently expand subtask scope."
      ],
      "solution": "Define two roles. Outer agent (Dispatcher + Planner): decomposes the goal into subtasks with milestones, dispatches each to the inner agent, and may interrupt to replan when milestones are missed or new evidence arrives. Inner agent (Actor): runs a tool-use loop on a single subtask, reports back a structured result. Outer holds the global state; inner holds the local state. The interruption channel is the only path the outer has into the inner's loop.",
      "structure": "Outer (plan, dispatch, monitor) <-- result/interrupt --> Inner (ReAct loop on subtask) <-- tool calls --> Tools.",
      "consequences": {
        "benefits": [
          "Planning and execution are separately legible and separately tunable.",
          "Outer can budget steps and cost per subtask.",
          "Inner failures are localised; outer can retry with a different plan."
        ],
        "liabilities": [
          "Two loops double the orchestration surface and the failure modes.",
          "Interrupt semantics are easy to get wrong (mid-step interrupts, partial state).",
          "Cost: outer's monitoring is itself an LLM call."
        ]
      },
      "constrains": "The inner agent may not change its subtask scope; scope changes must come back through the outer planner.",
      "known_uses": [
        {
          "system": "XAgent (OpenBMB)",
          "note": "Explicit Dispatcher + Planner outer loop and Actor inner loop.",
          "status": "available",
          "url": "https://github.com/OpenBMB/XAgent"
        },
        {
          "system": "Manus",
          "note": "Planner-Execution-Verification sub-agent split is a related shape with three roles instead of two.",
          "status": "available",
          "url": "https://manus.im/"
        }
      ],
      "related": [
        {
          "pattern": "planner-executor-observer",
          "relation": "specialises",
          "note": "Two-loop variant with explicit interrupt channel."
        },
        {
          "pattern": "plan-and-execute",
          "relation": "specialises"
        },
        {
          "pattern": "replan-on-failure",
          "relation": "uses"
        },
        {
          "pattern": "step-budget",
          "relation": "uses",
          "note": "Outer enforces step budget on inner."
        },
        {
          "pattern": "supervisor",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "repo",
          "title": "XAgent: An Autonomous LLM Agent for Complex Task Solving",
          "url": "https://github.com/OpenBMB/XAgent"
        }
      ],
      "status_in_practice": "experimental",
      "tags": [
        "planning",
        "multi-agent",
        "china-origin",
        "xagent"
      ]
    },
    {
      "id": "plan-and-execute",
      "name": "Plan-and-Execute",
      "aliases": [
        "Plan-Then-Execute",
        "Outline-Then-Run"
      ],
      "category": "planning-control-flow",
      "intent": "Plan all the steps once with a strong model, then execute each step with a cheaper model under the plan.",
      "context": "The task is multi-step, the steps are mostly known up front, and the cost of tokens varies widely between models.",
      "problem": "ReAct pays the strong model's price on every step including trivial executions.",
      "forces": [
        "Planning quality depends on context the planner has at planning time.",
        "Execution may discover the plan was wrong; replan-versus-fail is a real choice.",
        "Cheaper model may not faithfully execute the plan."
      ],
      "solution": "Two-stage loop. Planner: produce an ordered list of steps with explicit dependencies. Executor: run each step (often with tools) and accumulate results. On failure or surprise, replan with the new evidence in context.",
      "structure": "Planner -> [Step_1, Step_2, ..., Step_N] -> Executor -> Result. On failure, return to Planner.",
      "consequences": {
        "benefits": [
          "Plan is inspectable before execution starts.",
          "Cost shifts to the cheap model for routine steps."
        ],
        "liabilities": [
          "Plans can be brittle when the world differs from the planner's mental model.",
          "Replans add latency and complicate debugging."
        ]
      },
      "constrains": "The executor cannot deviate from the current plan without raising a replan request.",
      "known_uses": [
        {
          "system": "Bobbin (Stash2Go)",
          "note": "Explicit planner + screen_executor nodes in the agent lane.",
          "status": "available"
        },
        {
          "system": "LangChain Plan-and-Execute",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "react",
          "relation": "alternative-to"
        },
        {
          "pattern": "rewoo",
          "relation": "generalises"
        },
        {
          "pattern": "planner-executor-observer",
          "relation": "generalises"
        },
        {
          "pattern": "step-budget",
          "relation": "complements"
        },
        {
          "pattern": "structured-output",
          "relation": "complements"
        },
        {
          "pattern": "orchestrator-workers",
          "relation": "alternative-to"
        },
        {
          "pattern": "least-to-most",
          "relation": "complements"
        },
        {
          "pattern": "replan-on-failure",
          "relation": "complements"
        },
        {
          "pattern": "goal-decomposition",
          "relation": "generalises"
        },
        {
          "pattern": "outer-inner-agent-loop",
          "relation": "generalises"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models",
          "authors": "Wang, Xu, Lan, Hu, Lan, Lee, Lim",
          "year": 2023,
          "url": "https://arxiv.org/abs/2305.04091"
        },
        {
          "type": "blog",
          "title": "LangChain: Plan-and-Execute Agents",
          "year": 2023,
          "url": "https://blog.langchain.com/planning-agents/"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "planning",
        "two-stage"
      ],
      "applicability": {
        "use_when": [
          "The task decomposes cleanly into mostly-independent steps.",
          "The world is stable enough that a plan made once is still good to execute.",
          "Cost of replanning per step would dominate the run."
        ],
        "do_not_use_when": [
          "Each step's outcome materially changes what the next step should be — ReAct fits.",
          "The task is a single step; planning is overhead.",
          "Steps are tightly interdependent and a DAG with placeholders fits better — see ReWOO or LLMCompiler."
        ]
      },
      "diagram": {
        "type": "sequence",
        "mermaid": "sequenceDiagram\n  participant User\n  participant Planner\n  participant Executor\n  participant Tool\n  User->>Planner: goal\n  Planner->>Planner: decompose into ordered steps\n  Planner-->>Executor: plan\n  loop per step\n    Executor->>Tool: action\n    Tool-->>Executor: result\n  end\n  Executor-->>User: outcome",
        "caption": "Plan-and-Execute commits to a plan up front; the executor walks the steps and only loops back to the planner on failure."
      },
      "example_scenario": "An office-assistant agent is told, 'Book a team offsite in Barcelona for ten people next month, find a restaurant for dinner, and email everyone the schedule.' Up front it writes a five-step plan: search venues, pick one, search restaurants, pick one, send emails. The executor walks the plan in order. Because the venue list does not depend on what restaurants exist, planning once is cheaper than re-thinking every step.",
      "variants": [
        {
          "name": "Linear plan",
          "summary": "Planner emits an ordered list of steps; the executor walks them top-to-bottom.",
          "distinguishing_factor": "ordered list, no branching",
          "when_to_use": "Default. Each step's output feeds the next; no parallelism is available."
        },
        {
          "name": "Replan-on-failure",
          "summary": "If a step fails or returns unexpected results, control returns to the planner with the partial state. The planner emits a new plan from there.",
          "distinguishing_factor": "planner re-invoked on failure",
          "when_to_use": "World drift mid-execution is likely (external systems failing, data freshness changes)."
        },
        {
          "name": "DAG plan",
          "summary": "Planner emits a directed acyclic graph of steps with placeholder variables for outputs; independent steps run in parallel.",
          "distinguishing_factor": "graph, parallelisable",
          "when_to_use": "Many steps are independent and parallelism would shorten total wall-clock time.",
          "see_also": "rewoo"
        }
      ]
    },
    {
      "id": "planner-executor-observer",
      "name": "Planner-Executor-Observer",
      "aliases": [
        "Three-Role Loop",
        "POE"
      ],
      "category": "planning-control-flow",
      "intent": "Add an explicit Observer role between Planner and Executor so progress is checked against the plan instead of trusted blindly.",
      "context": "Plan-and-Execute systems either let the executor run blind (planner finds out at the end) or report at every step (rebuilding ReAct with extra coordination overhead).",
      "problem": "The missing third role is supervision of execution against intent; without it, plan quality is invisible until the run completes.",
      "forces": [
        "Observation must be cheap or it negates the plan-execute speedup.",
        "Triggering replans too eagerly thrashes; too lazily wastes effort.",
        "The Observer needs visibility into plan and tool results both."
      ],
      "solution": "Three roles: Planner produces a plan; Executor runs steps; Observer reads the cumulative result and decides loop / respond / replan. Each role has its own prompt and (optionally) its own model.",
      "consequences": {
        "benefits": [
          "Catches plan failure earlier than end-of-run.",
          "Cleaner separation of concerns than ReAct's monolithic step."
        ],
        "liabilities": [
          "Three coordinated prompts to maintain.",
          "Latency adds up if Observer runs every step."
        ]
      },
      "constrains": "The Executor cannot decide to stop or replan; only the Observer can.",
      "known_uses": [
        {
          "system": "Bobbin (Stash2Go)",
          "note": "planner / screen_executor / observe with route_after_observe edge.",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "plan-and-execute",
          "relation": "specialises"
        },
        {
          "pattern": "evaluator-optimizer",
          "relation": "composes-with"
        },
        {
          "pattern": "react",
          "relation": "alternative-to"
        },
        {
          "pattern": "replan-on-failure",
          "relation": "used-by"
        },
        {
          "pattern": "outer-inner-agent-loop",
          "relation": "generalises"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "Marco Nissen, Working with the models (Code Different #14)",
          "year": 2026,
          "url": "https://substack.com/@marconissen"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "planning",
        "three-role"
      ]
    },
    {
      "id": "react",
      "name": "ReAct",
      "aliases": [
        "Reason+Act",
        "Think-Act-Observe Loop"
      ],
      "category": "planning-control-flow",
      "intent": "Interleave a single thought, a single tool call, and a single observation per step so the agent reasons over fresh evidence.",
      "context": "The task requires looking things up or acting on the world; the answer cannot be produced by pure thinking.",
      "problem": "Pure chain-of-thought hallucinates facts; pure tool-blasting wastes calls on the wrong things.",
      "forces": [
        "Tool calls are expensive (latency, cost, side effects).",
        "Observations change the right next step.",
        "The loop must terminate."
      ],
      "solution": "On each step the agent emits Thought (private reasoning), Action (one tool call), Observation (the tool's result). Repeat until the agent decides to answer. A step budget bounds the loop.",
      "structure": "[Thought_i, Action_i, Observation_i] for i in 1..N, then Answer.",
      "consequences": {
        "benefits": [
          "Lowest-overhead path for simple lookups and single-field updates.",
          "Easy to inspect and debug step by step."
        ],
        "liabilities": [
          "Sequential by nature; long traces are slow and expensive.",
          "No global plan; the agent can wander."
        ]
      },
      "constrains": "Each step the model may call exactly one tool; reasoning between calls is not actuated.",
      "known_uses": [
        {
          "system": "Bobbin (Stash2Go)",
          "note": "observe node + route_after_observe edge in LangGraph.",
          "status": "available"
        },
        {
          "system": "LangChain AgentExecutor (default)",
          "status": "available",
          "url": "https://python.langchain.com/docs/how_to/agent_executor/"
        },
        {
          "system": "Claude Code",
          "status": "available",
          "url": "https://docs.claude.com/en/docs/claude-code/overview"
        },
        {
          "system": "Cursor",
          "status": "available",
          "url": "https://cursor.com/"
        },
        {
          "system": "GitHub Copilot agent",
          "status": "available"
        },
        {
          "system": "Devin",
          "status": "available",
          "url": "https://devin.ai/"
        }
      ],
      "related": [
        {
          "pattern": "plan-and-execute",
          "relation": "alternative-to"
        },
        {
          "pattern": "tool-use",
          "relation": "uses"
        },
        {
          "pattern": "agentic-rag",
          "relation": "used-by"
        },
        {
          "pattern": "planner-executor-observer",
          "relation": "alternative-to"
        },
        {
          "pattern": "lats",
          "relation": "used-by"
        },
        {
          "pattern": "computer-use",
          "relation": "used-by"
        },
        {
          "pattern": "self-ask",
          "relation": "specialises"
        },
        {
          "pattern": "code-execution",
          "relation": "composes-with"
        },
        {
          "pattern": "code-as-action",
          "relation": "generalises"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
          "authors": "Yao, Zhao, Yu, Du, Shafran, Narasimhan, Cao",
          "year": 2022,
          "url": "https://arxiv.org/abs/2210.03629"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "react",
        "loop",
        "tool-use"
      ],
      "applicability": {
        "use_when": [
          "The next action depends on what was learned from the previous action.",
          "The agent needs tool access during a multi-step task.",
          "Outputs from tools are short and inspectable so the model can react to them."
        ],
        "do_not_use_when": [
          "The full plan is known up front; Plan-and-Execute commits earlier.",
          "Latency is critical; each iteration adds a model round-trip.",
          "Tool results are large; they will dominate the context window."
        ]
      },
      "diagram": {
        "type": "sequence",
        "mermaid": "sequenceDiagram\n  participant Model\n  participant Tool\n  participant User\n  loop until answer or step budget exhausted\n    Model->>Model: Thought\n    Model->>Tool: Action\n    Tool-->>Model: Observation\n  end\n  Model->>User: Answer",
        "caption": "ReAct alternates Thought, Action, Observation until the model decides it has an answer or a budget halts the loop."
      },
      "example_scenario": "A travel-planning chatbot is asked, 'Find me a flight from Oslo to Tokyo next Tuesday under €800.' It thinks: 'I should check flights.' It calls a flight-search tool. The result shows three options. It thinks again: 'The cheapest is on a Tuesday morning — that fits.' Each step it sees what the previous tool call returned, then decides what to do next.",
      "variants": [
        {
          "name": "Zero-shot ReAct",
          "summary": "Triggers the think-act-observe loop with an instruction only — 'Let's solve this step by step using the available tools.' No examples in the prompt.",
          "distinguishing_factor": "no in-context examples",
          "when_to_use": "The model is large enough to follow the loop format from instruction alone, and tokens spent on examples are not justified."
        },
        {
          "name": "Few-shot ReAct",
          "summary": "Includes 2-3 worked examples in the prompt showing the Thought / Action / Observation format the model should emit.",
          "distinguishing_factor": "in-context examples carry the format",
          "when_to_use": "Smaller or non-instruction-tuned models that need format demonstration to stay in-shape across turns."
        },
        {
          "name": "Trained ReAct",
          "summary": "The base model has been fine-tuned on ReAct trajectories so the loop format is its native output. No examples or trigger phrase needed.",
          "distinguishing_factor": "fine-tuned weights",
          "when_to_use": "High-volume production where prompt overhead matters and you control the model.",
          "see_also": "rest-em"
        }
      ]
    },
    {
      "id": "replan-on-failure",
      "name": "Replan on Failure",
      "aliases": [
        "Adaptive Replanning",
        "Plan Revision"
      ],
      "category": "planning-control-flow",
      "intent": "Trigger a fresh planning step when execution evidence contradicts the current plan.",
      "context": "Plan-and-Execute systems where the world differs from the planner's mental model and the executor surfaces evidence the plan must respond to.",
      "problem": "Plans are made under incomplete information; without replanning, the executor either grinds through a wrong plan or fails silently.",
      "forces": [
        "Replanning resets cost; thrashing is real.",
        "When to trigger replanning is itself a judgment.",
        "Stale context: the new plan must include lessons from the failed run."
      ],
      "solution": "Define replan triggers (tool error, unexpected observation, observer dissent). When triggered, the executor pauses and the planner runs again with the failure context. The new plan replaces the old one; partial progress is preserved if compatible.",
      "consequences": {
        "benefits": [
          "Recovers from plan failures gracefully.",
          "The planner gets feedback; future plans improve."
        ],
        "liabilities": [
          "Replanning thrash if triggers are too sensitive.",
          "Compatibility logic between old and new plans is non-trivial."
        ]
      },
      "constrains": "The executor cannot deviate from the current plan without raising a replan request.",
      "known_uses": [
        {
          "system": "LangGraph plan-and-execute templates",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "plan-and-execute",
          "relation": "complements"
        },
        {
          "pattern": "planner-executor-observer",
          "relation": "uses"
        },
        {
          "pattern": "exception-recovery",
          "relation": "complements"
        },
        {
          "pattern": "outer-inner-agent-loop",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "LangGraph: Plan-and-Execute",
          "url": "https://langchain-ai.github.io/langgraph/tutorials/plan-and-execute/plan-and-execute/"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "planning",
        "replan"
      ]
    },
    {
      "id": "rewoo",
      "name": "ReWOO",
      "aliases": [
        "Reasoning Without Observation",
        "Plan-as-DAG",
        "Placeholder-Variable Plan"
      ],
      "category": "planning-control-flow",
      "intent": "Plan a complete dependency DAG with placeholder variables before any tool runs, then execute and substitute observations into the plan.",
      "context": "Many agent tasks have planning that does not actually depend on early observations; ReAct re-prompts the big model after each tool call wastefully.",
      "problem": "Token cost in ReAct grows linearly with steps because each observation re-enters the prompt for the next reasoning turn.",
      "forces": [
        "Pre-planning fails when dependencies are truly observation-dependent.",
        "Placeholder substitution requires a typed variable convention.",
        "Plan correctness must be high; mid-run replans defeat the saving."
      ],
      "solution": "Three roles. Planner emits a DAG with steps `t1 = ToolA(x); t2 = ToolB(#t1)` using variable references. Worker executes each tool in dependency order. Solver reads the resolved trace and produces the final answer. The planner never sees observations.",
      "structure": "Planner(query) -> DAG(steps with #refs) -> Worker(steps) -> resolved_trace -> Solver(query, trace) -> answer.",
      "consequences": {
        "benefits": [
          "Up to 5x fewer tokens than ReAct on the original benchmarks.",
          "Plan is fully inspectable before any tool fires."
        ],
        "liabilities": [
          "Bad plans are paid for in full.",
          "Not a fit for tasks where observation truly redirects planning."
        ]
      },
      "constrains": "The Planner cannot see tool outputs; substitution happens only at the Worker stage.",
      "known_uses": [
        {
          "system": "Pure future for Stash2Go",
          "note": "Periodic offline normalisation of yarn catalogue when an upstream weight definition changes.",
          "status": "pure-future"
        },
        {
          "system": "agent-patterns library",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "plan-and-execute",
          "relation": "specialises"
        },
        {
          "pattern": "llm-compiler",
          "relation": "generalises"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "ReWOO: Decoupling Reasoning from Observations for Efficient Augmented Language Models",
          "authors": "Xu, Peng, Liang, Lei, Mukherjee, Liu, Xu",
          "year": 2023,
          "url": "https://arxiv.org/abs/2305.18323"
        }
      ],
      "status_in_practice": "experimental",
      "tags": [
        "planning",
        "dag",
        "cost"
      ]
    },
    {
      "id": "scheduled-agent",
      "name": "Scheduled Agent",
      "aliases": [
        "Cron Agent",
        "Time-Triggered Agent",
        "Periodic Agent"
      ],
      "category": "planning-control-flow",
      "intent": "Run the agent on a fixed schedule independent of user requests.",
      "context": "Tasks that should happen periodically (overnight summaries, weekly triage, hourly health checks) regardless of user prompting.",
      "problem": "Request-driven agents only act when called; many useful tasks need to act on time, not on demand.",
      "forces": [
        "Schedule density trades cost for freshness.",
        "Failure modes when the agent's run is missed.",
        "Drift if the schedule is not authoritative."
      ],
      "solution": "Schedule the agent run at fixed cadence (cron, scheduler service). The agent reads its current state, executes its task, writes results, and exits. State persists across runs in durable storage.",
      "consequences": {
        "benefits": [
          "Time-bounded tasks happen reliably.",
          "Idempotent runs make retries safe."
        ],
        "liabilities": [
          "Cost per run regardless of need.",
          "Skew between expected and actual cadence."
        ]
      },
      "constrains": "The agent is not invoked by user requests; only the scheduler triggers runs.",
      "known_uses": [
        {
          "system": "Sparrot tick (every 60 seconds)",
          "status": "available"
        },
        {
          "system": "Claude Code scheduled agents",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "event-driven-agent",
          "relation": "alternative-to"
        },
        {
          "pattern": "spec-driven-loop",
          "relation": "alternative-to"
        },
        {
          "pattern": "agent-resumption",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Message Batches",
          "year": 2025,
          "url": "https://docs.claude.com/en/docs/build-with-claude/batch-processing"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "schedule",
        "cron",
        "periodic"
      ]
    },
    {
      "id": "spec-driven-loop",
      "name": "Spec-Driven Loop",
      "aliases": [
        "Naive Iterative Loop",
        "Ralph Wiggum Loop",
        "Ralph Loop"
      ],
      "category": "planning-control-flow",
      "intent": "Run the same prompt against a fixed spec in a deterministic outer loop until the spec is satisfied.",
      "context": "A greenfield task with a clear (or improvable) specification where each iteration can improve the codebase incrementally.",
      "problem": "Agents that try to plan an entire feature in one go are brittle; agents that wander without a spec drift.",
      "forces": [
        "The spec must be good or the loop polishes the wrong artefact.",
        "Tests gate progress; without them the loop has no error signal.",
        "Cost per iteration must be tolerable for hundreds of runs."
      ],
      "solution": "An outer shell loop (`while :; do cat PROMPT.md | claude-code ; done`) runs the same prompt repeatedly. The prompt encodes one task at a time, references a fix_plan.md that the agent itself updates, and ends with a test invocation that gates the next iteration. Subagents are used for parallel reads; build/test stays serial.",
      "consequences": {
        "benefits": [
          "Brutally simple. No orchestration framework required.",
          "Self-improving in practice: the agent updates the spec as it learns."
        ],
        "liabilities": [
          "Easy to burn tokens on the wrong shape.",
          "Hard to share state between iterations beyond what the agent writes to disk."
        ]
      },
      "constrains": "Each loop iteration is constrained by the spec and the test gate; the agent cannot expand scope without editing the spec first.",
      "known_uses": [
        {
          "system": "Geoffrey Huntley's Ralph",
          "note": "The canonical write-up.",
          "status": "available",
          "url": "https://ghuntley.com/ralph/"
        }
      ],
      "related": [
        {
          "pattern": "spec-first-agent",
          "relation": "uses"
        },
        {
          "pattern": "step-budget",
          "relation": "complements"
        },
        {
          "pattern": "scheduled-agent",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "Ralph Wiggum as a 'software engineer'",
          "authors": "Geoffrey Huntley",
          "year": 2025,
          "url": "https://ghuntley.com/ralph/"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "loop",
        "spec",
        "iterative"
      ]
    },
    {
      "id": "spec-first-agent",
      "name": "Spec-First Agent",
      "aliases": [
        "Specification-Driven Agent",
        "Plan-as-Document"
      ],
      "category": "planning-control-flow",
      "intent": "Drive the agent loop from a human-authored specification document rather than free-form prompts.",
      "context": "The task is well-defined enough to write down; the agent benefits from a stable, inspectable target.",
      "problem": "Free-form prompts drift; the spec is in the head of one engineer and not auditable.",
      "forces": [
        "Spec authoring is up-front work.",
        "The agent must update the spec when learnings invalidate it; uncontrolled spec mutation is dangerous.",
        "Spec format must be both human- and agent-readable."
      ],
      "solution": "Write the specification as a markdown file (PROMPT.md, fix_plan.md, or similar). The agent reads the spec at each iteration, executes against it, and may update it under controlled conditions. The spec is the single source of truth for what 'done' means.",
      "consequences": {
        "benefits": [
          "Inspectable target; reviewable diffs over time.",
          "Pairs naturally with iterative loops (Ralph)."
        ],
        "liabilities": [
          "Spec quality bounds agent quality.",
          "Spec mutation introduces drift if uncontrolled."
        ]
      },
      "constrains": "The agent acts only against goals named in the spec; out-of-scope work must be added to the spec first.",
      "known_uses": [
        {
          "system": "Ralph Wiggum loop",
          "note": "PROMPT.md + fix_plan.md drive the loop.",
          "status": "available"
        },
        {
          "system": "Spec-driven Claude Code workflows",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "spec-driven-loop",
          "relation": "used-by"
        },
        {
          "pattern": "agent-skills",
          "relation": "complements"
        },
        {
          "pattern": "sop-encoded-multi-agent",
          "relation": "complements"
        },
        {
          "pattern": "todo-list-driven-agent",
          "relation": "alternative-to"
        },
        {
          "pattern": "automatic-workflow-search",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "Geoffrey Huntley, Ralph",
          "year": 2025,
          "url": "https://ghuntley.com/ralph/"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "spec",
        "documentation"
      ]
    },
    {
      "id": "todo-list-driven-agent",
      "name": "Todo-List-Driven Autonomous Agent",
      "aliases": [
        "todo.md Agent",
        "Persistent Markdown Plan",
        "Externalised Plan File"
      ],
      "category": "planning-control-flow",
      "intent": "Have the agent author a plan file (e.g. todo.md) early in the run, tick items as it completes them, and re-inject the remaining plan into context; the file is durable plan and working memory.",
      "context": "A long-horizon autonomous task in a sandboxed VM with file-system access; the run may span hundreds of tool calls and exceed any usable in-context plan.",
      "problem": "In-context plans drift to the middle of the window where the model attends least; without a durable plan artefact, paused or context-truncated runs cannot recover; the agent forgets which sub-tasks are done.",
      "forces": [
        "Models attend most strongly to the end (and start) of the context window.",
        "File-system memory is durable; in-context memory is volatile.",
        "Re-injecting the full plan every turn is repetitive but combats attention drift.",
        "Markdown is human- and model-readable, supports easy ticking."
      ],
      "solution": "Early in the run, the agent writes its plan as a checklist file (todo.md) in its sandbox. Each turn: read the file, work the next unticked item, update the file (tick the item, add follow-ups, drop dead-ends). Re-inject the unticked tail of the file into the prompt before the model's next turn. The file outlives any single context window. Paired with a sandboxed VM that gives the agent persistent storage and basic tools (browser, shell, file editor).",
      "structure": "Sandbox VM (browser, shell, files) + agent loop: read(todo.md) -> select next item -> act -> update(todo.md) -> repeat.",
      "consequences": {
        "benefits": [
          "Plan survives context truncation and pause/resume.",
          "Re-injecting unticked items keeps the model focused on what's left.",
          "Human-readable trail for debugging and review."
        ],
        "liabilities": [
          "Re-injection costs tokens every turn.",
          "The agent may rewrite the file capriciously; needs guardrails on plan mutations.",
          "Sandboxed VM cost (one VM per task) is non-trivial."
        ]
      },
      "constrains": "The agent may not advance past an unticked item without recording the action in the plan file; arbitrary in-context-only plans are forbidden.",
      "known_uses": [
        {
          "system": "Manus (Monica.im)",
          "note": "Plan persisted as todo.md inside a per-task sandbox VM; remaining items re-injected each turn.",
          "status": "available",
          "url": "https://manus.im"
        },
        {
          "system": "OpenManus",
          "note": "Open-source replica of Manus's loop.",
          "status": "available",
          "url": "https://github.com/FoundationAgents/OpenManus"
        }
      ],
      "related": [
        {
          "pattern": "scratchpad",
          "relation": "specialises",
          "note": "Scratchpad for the plan specifically."
        },
        {
          "pattern": "spec-first-agent",
          "relation": "alternative-to",
          "note": "Spec-first uses a human-authored spec; this is agent-authored."
        },
        {
          "pattern": "agent-resumption",
          "relation": "complements"
        },
        {
          "pattern": "context-window-packing",
          "relation": "uses"
        },
        {
          "pattern": "sandbox-isolation",
          "relation": "uses"
        },
        {
          "pattern": "append-only-thought-stream",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "From Mind to Machine: The Rise of Manus AI as a Fully Autonomous Digital Agent",
          "year": 2025,
          "url": "https://arxiv.org/abs/2505.02024"
        },
        {
          "type": "blog",
          "title": "How Manus Uses E2B to Provide Agents With Virtual Computers",
          "url": "https://e2b.dev/blog/how-manus-uses-e2b-to-provide-agents-with-virtual-computers"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "planning",
        "memory",
        "china-origin",
        "manus"
      ]
    },
    {
      "id": "chain-of-thought",
      "name": "Chain of Thought",
      "aliases": [
        "CoT",
        "Step-by-Step Prompting"
      ],
      "category": "reasoning",
      "intent": "Elicit multi-step reasoning by prompting the model to produce intermediate steps before its final answer.",
      "context": "The task is compositional, arithmetic, or otherwise requires working through several inferences a human would write down.",
      "problem": "LLMs given only (input, output) exemplars fail at problems whose answers depend on a sequence of intermediate inferences.",
      "forces": [
        "Longer outputs cost more.",
        "Wrong reasoning chains can produce confidently wrong answers.",
        "Few-shot exemplars are dataset-specific; zero-shot triggers generalise but lose accuracy."
      ],
      "solution": "Prompt the model with exemplars showing intermediate reasoning, or use a zero-shot trigger ('Let's think step by step') before answering. The reasoning trace is visible and parseable.",
      "consequences": {
        "benefits": [
          "Substantial accuracy gains on reasoning benchmarks.",
          "Reasoning trace is inspectable for debugging."
        ],
        "liabilities": [
          "Single linear trace; no branching or self-correction.",
          "Cost scales with trace length."
        ]
      },
      "constrains": "The model is required to emit reasoning before the final answer; one-shot answer-only generation is forbidden by prompt design.",
      "known_uses": [
        {
          "system": "OpenAI Reasoning prompts",
          "status": "available",
          "url": "https://platform.openai.com/docs/guides/reasoning"
        },
        {
          "system": "Most production agents (CoT inside system prompts)",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "self-consistency",
          "relation": "complements"
        },
        {
          "pattern": "tree-of-thoughts",
          "relation": "generalises"
        },
        {
          "pattern": "least-to-most",
          "relation": "alternative-to"
        },
        {
          "pattern": "extended-thinking",
          "relation": "complements"
        },
        {
          "pattern": "zero-shot-cot",
          "relation": "generalises"
        },
        {
          "pattern": "scratchpad",
          "relation": "used-by"
        },
        {
          "pattern": "star-bootstrapping",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
          "authors": "Wei, Wang, Schuurmans, Bosma, Ichter, Xia, Chi, Le, Zhou",
          "year": 2022,
          "url": "https://arxiv.org/abs/2201.11903"
        },
        {
          "type": "paper",
          "title": "Large Language Models are Zero-Shot Reasoners",
          "authors": "Kojima, Gu, Reid, Matsuo, Iwasawa",
          "year": 2022,
          "url": "https://arxiv.org/abs/2205.11916"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "reasoning",
        "cot",
        "prompting"
      ]
    },
    {
      "id": "chain-of-verification",
      "name": "Chain of Verification",
      "aliases": [
        "CoVe",
        "Factored Verification",
        "Verify Before Answering"
      ],
      "category": "reasoning",
      "intent": "Reduce hallucination by drafting an answer, generating independent verification questions, answering them in isolation, and revising.",
      "context": "Long-form factual generation (biographies, summaries, recommendations) where plausible-sounding errors creep into the draft.",
      "problem": "When the model verifies its own draft in the same context, the draft biases follow-up checks; errors persist.",
      "forces": [
        "Verification questions must be independently answerable.",
        "Joint verification (all questions in one prompt) underperforms factored.",
        "Verification cost scales with question count."
      ],
      "solution": "Four-step pipeline. Draft: produce initial answer. Plan: generate verification questions covering claims in the draft. Execute: answer each question in isolation, without seeing the original draft. Revise: rewrite the draft using the verification answers.",
      "consequences": {
        "benefits": [
          "Substantial hallucination reduction without retrieval.",
          "Composes with retrieval naturally (retrieve evidence per question)."
        ],
        "liabilities": [
          "4x baseline cost.",
          "Verification quality depends on question coverage."
        ]
      },
      "constrains": "Verification answers are produced without the draft in context; coupled verification is not permitted.",
      "known_uses": [
        {
          "system": "Meta AI implementation",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "reflection",
          "relation": "specialises"
        },
        {
          "pattern": "self-consistency",
          "relation": "complements"
        },
        {
          "pattern": "naive-rag",
          "relation": "composes-with"
        },
        {
          "pattern": "critic",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Chain-of-Verification Reduces Hallucination in Large Language Models",
          "authors": "Dhuliawala, Komeili, Xu, Raileanu, Li, Celikyilmaz, Weston",
          "year": 2023,
          "url": "https://arxiv.org/abs/2309.11495"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "reasoning",
        "verification",
        "hallucination"
      ]
    },
    {
      "id": "extended-thinking",
      "name": "Extended Thinking",
      "aliases": [
        "Reasoning Tokens",
        "Reasoning Budget"
      ],
      "category": "reasoning",
      "intent": "Spend a configurable budget of internal reasoning tokens before producing a user-visible answer.",
      "context": "Hard reasoning tasks where extra inference-time compute reliably improves answer quality.",
      "problem": "Static prompt-based CoT mixes reasoning into the response; reasoning models offer a separate budget meter and opaque internal reasoning the user does not see.",
      "forces": [
        "Reasoning tokens cost more than standard tokens on most providers.",
        "User-visible latency rises with thinking budget.",
        "Opaque reasoning blocks: harder to inspect and debug."
      ],
      "solution": "Use the provider's reasoning-mode API (OpenAI o-series reasoning effort, Anthropic Claude extended thinking budget_tokens, Gemini thinking budget). Set budget per request based on task difficulty (cheap for routing, expensive for hard reasoning). Monitor reasoning-token consumption.",
      "consequences": {
        "benefits": [
          "Quality lift on hard reasoning without prompt rewrites.",
          "Budget meter is a clean control."
        ],
        "liabilities": [
          "Cost spikes with budget.",
          "Opaque reasoning blocks are harder to debug than visible CoT."
        ]
      },
      "constrains": "Reasoning happens within the declared token budget; exceeding it terminates reasoning and forces an answer.",
      "known_uses": [
        {
          "system": "Anthropic Claude extended thinking (budget_tokens)",
          "status": "available"
        },
        {
          "system": "Gemini 2.5 thinking budget",
          "status": "available"
        },
        {
          "system": "DeepSeek-R1",
          "status": "available"
        },
        {
          "system": "OpenAI reasoning effort (o1, o3, o4-mini)",
          "status": "available",
          "note": "Qualitative low/medium/high control."
        }
      ],
      "related": [
        {
          "pattern": "chain-of-thought",
          "relation": "complements"
        },
        {
          "pattern": "scratchpad",
          "relation": "complements"
        },
        {
          "pattern": "cost-gating",
          "relation": "complements"
        },
        {
          "pattern": "test-time-compute-scaling",
          "relation": "specialises"
        },
        {
          "pattern": "reasoning-trace-carry-forward",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Anthropic: Extended thinking",
          "url": "https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking"
        },
        {
          "type": "doc",
          "title": "OpenAI: Reasoning models",
          "url": "https://platform.openai.com/docs/guides/reasoning"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "reasoning",
        "budget",
        "tokens"
      ]
    },
    {
      "id": "graph-of-thoughts",
      "name": "Graph of Thoughts",
      "aliases": [
        "GoT",
        "DAG Reasoning"
      ],
      "category": "reasoning",
      "intent": "Model reasoning as an arbitrary DAG so thoughts can be merged, refined, and aggregated across branches.",
      "context": "Tasks with subproblems whose results combine non-trivially (sorting partial lists, set operations, summarisation merge); tree-shaped reasoning loses these aggregation opportunities.",
      "problem": "Tree of Thoughts cannot combine partial solutions or reuse intermediate results across sibling branches.",
      "forces": [
        "Richer reasoning topology vs orchestration complexity.",
        "Cross-branch reuse vs aggregation prompt cost.",
        "DAG expressiveness vs cycle-safety enforcement."
      ],
      "solution": "Reasoning state is a DAG of thoughts. Operations include generate (CoT-style), aggregate (merge multiple thoughts), refine (improve one thought), and score. The orchestrator chains operations to produce a final thought; the agent can reuse intermediate nodes across branches.",
      "consequences": {
        "benefits": [
          "Strict superset of CoT and ToT.",
          "Most useful when subproblems have non-tree dependencies."
        ],
        "liabilities": [
          "Orchestration overhead.",
          "Hard to debug when the DAG grows."
        ]
      },
      "constrains": "Thought operations must be composed via the named operators; ad-hoc reasoning outside the operator vocabulary is forbidden.",
      "known_uses": [
        {
          "system": "GoT paper benchmarks (sorting, set intersection, document merge)",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "tree-of-thoughts",
          "relation": "generalises"
        },
        {
          "pattern": "lats",
          "relation": "complements"
        },
        {
          "pattern": "blackboard",
          "relation": "composes-with"
        },
        {
          "pattern": "llm-compiler",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Graph of Thoughts: Solving Elaborate Problems with Large Language Models",
          "authors": "Besta, Blach, Kubicek, Gerstenberger, Podstawski, Gianinazzi, Gajda, Lehmann, Niewiadomski, Nyczyk, Hoefler",
          "year": 2023,
          "url": "https://arxiv.org/abs/2308.09687"
        }
      ],
      "status_in_practice": "experimental",
      "tags": [
        "reasoning",
        "graph",
        "dag"
      ]
    },
    {
      "id": "least-to-most",
      "name": "Least-to-Most Prompting",
      "aliases": [
        "L2M",
        "Easy-First Decomposition"
      ],
      "category": "reasoning",
      "intent": "Decompose a hard problem into an ordered list of easier subproblems, then solve them sequentially with each answer feeding the next.",
      "context": "Tasks with poor length or complexity generalisation: the model handles the easy training-style cases but fails on harder, longer instances.",
      "problem": "CoT generalises poorly out of distribution; the model needs explicit scaffolding for harder instances.",
      "forces": [
        "Decomposition prompts are themselves a design problem.",
        "Two stages double minimum cost.",
        "Errors in the decomposition cascade."
      ],
      "solution": "Two-stage prompt. Stage 1 (decomposition): prompt the model to list subproblems from easiest to hardest. Stage 2 (sequential solve): for each subproblem in order, prompt the model with the original question, prior subproblem answers, and the current subproblem.",
      "consequences": {
        "benefits": [
          "Strong length and complexity generalisation.",
          "Subproblem answers are inspectable."
        ],
        "liabilities": [
          "Decomposition prompt design is task-specific.",
          "Two-stage pipeline; ambiguity in stage 1 propagates."
        ]
      },
      "constrains": "Subproblems must be solved in the listed order; out-of-order solving is forbidden.",
      "known_uses": [
        {
          "system": "L2M paper benchmarks (last letter, SCAN, math)",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "chain-of-thought",
          "relation": "alternative-to"
        },
        {
          "pattern": "self-ask",
          "relation": "complements"
        },
        {
          "pattern": "plan-and-execute",
          "relation": "complements"
        },
        {
          "pattern": "goal-decomposition",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Least-to-Most Prompting Enables Complex Reasoning in Large Language Models",
          "authors": "Zhou, Schärli, Hou, Wei, Scales, Wang, Schuurmans, Cui, Bousquet, Le, Chi",
          "year": 2022,
          "url": "https://arxiv.org/abs/2205.10625"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "reasoning",
        "decomposition"
      ]
    },
    {
      "id": "rest-em",
      "name": "ReST-EM",
      "aliases": [
        "Reinforced Self-Training",
        "Self-Training Loop"
      ],
      "category": "reasoning",
      "intent": "Iterate generate → reward-filter → fine-tune to bootstrap reasoning capabilities without human-labelled data.",
      "context": "Reasoning tasks where the model is partially competent and a reward signal (correct answer, executable test, formal verifier) exists.",
      "problem": "Pure prompting plateaus; full RL with PPO is unstable and expensive; supervised data is unavailable.",
      "forces": [
        "Reward filter quality bounds learning quality.",
        "Iteration count vs cost.",
        "Distribution drift across iterations."
      ],
      "solution": "EM-style loop. (E-step) Generate many responses per problem. Filter by reward (correctness against ground truth or executable test). (M-step) Fine-tune on the filtered set. Iterate. Variants: ReST (DeepMind, RL-shaped), ReST-EM (Singh et al., expectation-maximisation framing).",
      "consequences": {
        "benefits": [
          "Strong gains without human-labelled rationales.",
          "Stable; converges in a few iterations."
        ],
        "liabilities": [
          "Compute-heavy.",
          "Reward gaming possible."
        ]
      },
      "constrains": "Training data is restricted to filter-passing samples; ungrounded samples are not reinforced.",
      "known_uses": [
        {
          "system": "DeepMind ReST",
          "status": "available"
        },
        {
          "system": "Singh et al. ReST-EM",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "star-bootstrapping",
          "relation": "generalises"
        },
        {
          "pattern": "best-of-n",
          "relation": "uses"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Reinforced Self-Training (ReST) for Language Modeling",
          "authors": "Gulcehre et al.",
          "year": 2023,
          "url": "https://arxiv.org/abs/2308.08998"
        },
        {
          "type": "paper",
          "title": "Beyond Human Data: Scaling Self-Training for Problem-Solving with Language Models",
          "authors": "Singh, Co-Reyes, Agarwal, Anand, Patil, Garcia, Liu, Harrison, Lee, Xu, Parisi, Kumar, Alemi, Rizkowsky, Nova, Adlam, Bohnet, Elsayed, Sedghi, Mordatch, Simpson, Gur, Snoek, Pfaff, Brown, Roy, Mustafa, Hoffman, Botvinick, Faust, Larochelle, Hadsell, Schuurmans, Faruqui",
          "year": 2023,
          "url": "https://arxiv.org/abs/2312.06585"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "reasoning",
        "self-training",
        "rl"
      ]
    },
    {
      "id": "self-ask",
      "name": "Self-Ask",
      "aliases": [
        "Decompose-Ask",
        "Sub-Question Prompting"
      ],
      "category": "reasoning",
      "intent": "Have the model emit explicit follow-up sub-questions, answer them (optionally via search), then compose the final answer.",
      "context": "Multi-hop questions where the model knows each single hop but fails to chain hops in one inference.",
      "problem": "The 'compositionality gap': models know each fact in isolation but fail to combine them into a multi-hop answer.",
      "forces": [
        "Sub-question quality bounds the answer quality.",
        "Sub-question slots invite tool integration but add latency.",
        "Excessive decomposition wastes calls."
      ],
      "solution": "Prompt the model to interleave sub-questions and their answers. Each sub-question is either answered by the model directly or by a search tool. The final answer is composed once all sub-questions are answered.",
      "consequences": {
        "benefits": [
          "Bridges CoT and tool-using agents naturally.",
          "Decomposition is lexical and inspectable."
        ],
        "liabilities": [
          "Latency: N sub-question calls per question.",
          "Sub-questions can drift from the original."
        ]
      },
      "constrains": "Sub-question slots are the only insertion point for retrieval or tool calls; the agent cannot retrieve except through a sub-question.",
      "known_uses": [
        {
          "system": "Self-Ask + Search",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "react",
          "relation": "generalises"
        },
        {
          "pattern": "least-to-most",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Measuring and Narrowing the Compositionality Gap in Language Models",
          "authors": "Press, Zhang, Min, Schmidt, Smith, Lewis",
          "year": 2022,
          "url": "https://arxiv.org/abs/2210.03350"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "reasoning",
        "decomposition",
        "multi-hop"
      ]
    },
    {
      "id": "star-bootstrapping",
      "name": "STaR Bootstrapping",
      "aliases": [
        "Self-Taught Reasoner",
        "Rationale Bootstrapping"
      ],
      "category": "reasoning",
      "intent": "Bootstrap a model's reasoning by training it on its own correct chain-of-thought outputs.",
      "context": "Reasoning tasks where chain-of-thought helps but supervised rationale data is unavailable.",
      "problem": "Without supervised rationale data, fine-tuning for reasoning is constrained; pure CoT prompting plateaus.",
      "forces": [
        "Filter quality determines what 'correct' rationale gets reinforced.",
        "Wrong rationales that produce right answers can leak in.",
        "Compute cost of repeated generation + filtering."
      ],
      "solution": "Prompt the base model with CoT to generate rationale + answer pairs. Keep pairs where the answer matches ground truth. **Rationalization**: when a generated rationale yields the wrong answer, prompt the model with the correct answer as a hint and ask for a rationale that justifies it; add the rationalized example to training. Fine-tune on the kept + rationalized pairs. Repeat: the fine-tuned model generates better rationales next round; iterate.",
      "consequences": {
        "benefits": [
          "Self-improvement on reasoning without rationale labels.",
          "Iterative gains compound."
        ],
        "liabilities": [
          "Spurious-rationale leakage if filtering is too lax.",
          "Compute-heavy."
        ]
      },
      "constrains": "Training data is restricted to filter-passing rationales; ungrounded rationales are not reinforced.",
      "known_uses": [
        {
          "system": "STaR paper experiments",
          "status": "available"
        },
        {
          "system": "Influences modern reasoning-distillation pipelines",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "chain-of-thought",
          "relation": "uses"
        },
        {
          "pattern": "self-consistency",
          "relation": "complements"
        },
        {
          "pattern": "rest-em",
          "relation": "specialises"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "STaR: Bootstrapping Reasoning with Reasoning",
          "authors": "Zelikman, Wu, Mu, Goodman",
          "year": 2022,
          "url": "https://arxiv.org/abs/2203.14465"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "reasoning",
        "training",
        "bootstrapping"
      ]
    },
    {
      "id": "test-time-compute-scaling",
      "name": "Test-Time Compute Scaling",
      "aliases": [
        "Inference-Time Scaling",
        "Compute-Time Trade-Off"
      ],
      "category": "reasoning",
      "intent": "Allocate more inference-time compute (samples, search, deeper thinking) instead of scaling parameters to improve quality.",
      "context": "Frontier of agent capability where parameter scaling has saturated and quality gains come from inference-time techniques.",
      "problem": "Naive single-pass inference under-uses available compute; many hard tasks have inference-time techniques that beat ever-larger models.",
      "forces": [
        "Wall-clock latency rises with compute.",
        "Cost rises linearly or worse with sample count.",
        "Best technique (samples / search / deeper thinking) is task-dependent."
      ],
      "solution": "Pick the inference-time technique that fits: best-of-N for verifier-amenable tasks, self-consistency for sampling-amenable tasks, tree search for combinatorial tasks, extended thinking for sequential reasoning. Compose techniques where complementary. Tune the compute budget per task class.",
      "consequences": {
        "benefits": [
          "Quality lifts without retraining.",
          "Compute budget becomes a per-request control."
        ],
        "liabilities": [
          "Latency-sensitive use cases cannot afford much.",
          "Token cost can dominate."
        ]
      },
      "constrains": "Each request specifies its compute budget; over-budget requests are cut off.",
      "known_uses": [
        {
          "system": "OpenAI o-series scaling-with-effort",
          "status": "available"
        },
        {
          "system": "DeepMind AlphaCode/AlphaProof scaling",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "extended-thinking",
          "relation": "generalises"
        },
        {
          "pattern": "best-of-n",
          "relation": "generalises"
        },
        {
          "pattern": "self-consistency",
          "relation": "generalises"
        },
        {
          "pattern": "lats",
          "relation": "generalises"
        },
        {
          "pattern": "process-reward-model",
          "relation": "generalises"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Scaling LLM Test-Time Compute Optimally Can Be More Effective Than Scaling Model Parameters",
          "authors": "Snell, Lee, Xu, Kumar",
          "year": 2024,
          "url": "https://arxiv.org/abs/2408.03314"
        },
        {
          "type": "paper",
          "title": "Large Language Monkeys: Scaling Inference Compute with Repeated Sampling",
          "authors": "Brown, Juravsky, Ehrlich, Clark, Le, Ré, Mirhoseini",
          "year": 2024,
          "url": "https://arxiv.org/abs/2407.21787"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "reasoning",
        "scaling",
        "compute"
      ]
    },
    {
      "id": "tree-of-thoughts",
      "name": "Tree of Thoughts",
      "aliases": [
        "ToT",
        "Deliberate Reasoning"
      ],
      "category": "reasoning",
      "intent": "Search over a tree of partial reasoning states with explicit lookahead, evaluation, and backtracking.",
      "context": "The problem benefits from exploring alternatives (puzzles, planning, creative writing) and committing to a single chain leads to dead ends.",
      "problem": "Chain-of-Thought commits to a single trace; if an early step is wrong the model cannot recover or compare alternatives.",
      "forces": [
        "Search costs many model calls per problem.",
        "A value or heuristic function is needed to score partial states.",
        "Termination criteria are non-trivial."
      ],
      "solution": "Decompose the problem into thought steps. At each node, sample several candidate next thoughts. Evaluate each (model self-evaluation or programmatic check). Apply BFS/DFS/beam to explore the tree. Backtrack from dead ends. Return the best leaf.",
      "consequences": {
        "benefits": [
          "Higher accuracy on tasks where alternatives matter (Game of 24, crosswords, creative writing planning).",
          "Explicit search vocabulary (lookahead, prune, backtrack)."
        ],
        "liabilities": [
          "5-100x cost over CoT depending on branching factor and depth.",
          "Value function quality bounds search benefit."
        ]
      },
      "constrains": "The agent may only commit to a final answer after exploring at least one full path; search depth and branching are bounded by configuration.",
      "known_uses": [
        {
          "system": "ToT paper benchmarks (Game of 24, crosswords, creative writing)",
          "status": "available"
        },
        {
          "system": "LangChain ToT integration",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "chain-of-thought",
          "relation": "specialises"
        },
        {
          "pattern": "graph-of-thoughts",
          "relation": "specialises"
        },
        {
          "pattern": "lats",
          "relation": "generalises"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models",
          "authors": "Yao, Yu, Zhao, Shafran, Griffiths, Cao, Narasimhan",
          "year": 2023,
          "url": "https://arxiv.org/abs/2305.10601"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "reasoning",
        "search",
        "tree"
      ]
    },
    {
      "id": "zero-shot-cot",
      "name": "Zero-Shot Chain-of-Thought",
      "aliases": [
        "Let's Think Step by Step",
        "Trigger-Phrase CoT"
      ],
      "category": "reasoning",
      "intent": "Elicit step-by-step reasoning with a single trigger phrase rather than few-shot exemplars.",
      "context": "Reasoning tasks where curating few-shot exemplars is impractical or costly.",
      "problem": "Few-shot CoT requires exemplar curation per task type; that effort scales poorly across many tasks.",
      "forces": [
        "Trigger phrases are model- and language-specific.",
        "Quality lift is smaller than well-curated few-shot CoT.",
        "Trigger-phrase reasoning can drift on complex tasks."
      ],
      "solution": "Append a trigger phrase ('Let's think step by step', 'Let's work through this carefully') to the prompt. The model produces reasoning before its answer with no exemplar required. Optionally extract the final answer with a follow-up prompt.",
      "consequences": {
        "benefits": [
          "Zero curation cost per task.",
          "Generalises across task types."
        ],
        "liabilities": [
          "Lower quality lift than well-tuned few-shot CoT.",
          "Trigger-phrase brittleness."
        ]
      },
      "constrains": "The model is required to reason before answering; one-shot answer-only generation is not the target.",
      "known_uses": [
        {
          "system": "Zero-Shot CoT paper baseline",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "chain-of-thought",
          "relation": "specialises"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Large Language Models are Zero-Shot Reasoners",
          "authors": "Kojima, Gu, Reid, Matsuo, Iwasawa",
          "year": 2022,
          "url": "https://arxiv.org/abs/2205.11916"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "reasoning",
        "cot",
        "zero-shot"
      ]
    },
    {
      "id": "agentic-rag",
      "name": "Agentic RAG",
      "aliases": [
        "Iterative RAG"
      ],
      "category": "retrieval",
      "intent": "Replace static retrieve-then-generate with autonomous agents that plan, choose sources, retrieve iteratively, reflect, and re-query.",
      "context": "Multi-hop, ambiguous, or evolving information needs where single-shot retrieval is insufficient.",
      "problem": "Naive RAG cannot decide whether to retrieve, which source to use, when to stop retrieving, or how to recover from poor retrievals.",
      "forces": [
        "Agentic loops cost more than single-shot retrieval.",
        "Source selection requires capability descriptions.",
        "Loop bounds must prevent runaway retrieval."
      ],
      "solution": "Treat retrieval as a tool. The agent decides whether to retrieve, formulates and reformulates the query, picks among multiple retrievers (vector, graph, keyword, web), evaluates retrieved evidence, and re-queries on insufficient results. Composes naturally with reflection, planning, and tool-use patterns.",
      "consequences": {
        "benefits": [
          "Handles multi-hop and adaptive queries.",
          "Source diversity (multi-store retrieval) becomes feasible."
        ],
        "liabilities": [
          "Cost and latency rise with loop iterations.",
          "Loop quality depends on agent self-evaluation."
        ]
      },
      "constrains": "Retrieval is one tool among many; the agent decides invocation, but each retrieval is bounded by the step budget.",
      "known_uses": [
        {
          "system": "Self-RAG, CRAG implementations",
          "status": "available"
        },
        {
          "system": "LangGraph Agentic RAG tutorials",
          "status": "available",
          "url": "https://langchain-ai.github.io/langgraph/tutorials/rag/langgraph_agentic_rag/"
        },
        {
          "system": "Perplexity",
          "status": "available",
          "url": "https://www.perplexity.ai/"
        },
        {
          "system": "ChatGPT Search",
          "status": "available",
          "url": "https://chat.openai.com/"
        },
        {
          "system": "Glean",
          "status": "available",
          "url": "https://www.glean.com/"
        },
        {
          "system": "Notion AI",
          "status": "available",
          "url": "https://www.notion.so/product/ai"
        }
      ],
      "related": [
        {
          "pattern": "naive-rag",
          "relation": "generalises"
        },
        {
          "pattern": "react",
          "relation": "uses"
        },
        {
          "pattern": "reflection",
          "relation": "uses"
        },
        {
          "pattern": "tool-use",
          "relation": "uses",
          "note": "Retrieval is exposed as a tool the agent decides to invoke."
        },
        {
          "pattern": "cross-encoder-reranking",
          "relation": "composes-with",
          "note": "Reranking is a near-universal RAG companion."
        },
        {
          "pattern": "self-rag",
          "relation": "generalises"
        },
        {
          "pattern": "crag",
          "relation": "generalises"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Agentic Retrieval-Augmented Generation: A Survey on Agentic RAG",
          "authors": "Singh, Ehtesham, Kumar, Khoei",
          "year": 2025,
          "url": "https://arxiv.org/abs/2501.09136"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "rag",
        "agentic",
        "iterative"
      ],
      "example_scenario": "A consulting agent is asked, 'Compare our 2023 and 2024 revenue by region.' Naive RAG would do one search and pass whatever it found to the model. Agentic RAG instead runs in a loop: it queries the 2023 figures, decides it also needs 2024 figures, queries those, notices the EMEA numbers are missing, queries again with a more specific phrase, then produces the comparison from a complete set.",
      "variants": [
        {
          "name": "Self-RAG (single-pass with reflection)",
          "summary": "The agent retrieves once, reflects on whether the retrieval is sufficient, and decides to answer or re-query in a single tight loop.",
          "distinguishing_factor": "one reflection step",
          "when_to_use": "Latency-sensitive; one extra retrieval is the maximum acceptable cost."
        },
        {
          "name": "Multi-hop iterative",
          "summary": "The agent decomposes the question, retrieves once per sub-question, then synthesises the answer from accumulated evidence.",
          "distinguishing_factor": "explicit decomposition",
          "when_to_use": "Questions that require joining facts across multiple sources (compliance, comparative analysis)."
        },
        {
          "name": "Corrective (CRAG)",
          "summary": "A grader scores each retrieved document; documents marked Incorrect trigger a web fallback search.",
          "distinguishing_factor": "grader plus web fallback",
          "when_to_use": "Index quality is uneven and the system has access to web search as a backup.",
          "see_also": "crag"
        }
      ]
    },
    {
      "id": "contextual-retrieval",
      "name": "Contextual Retrieval",
      "aliases": [
        "Chunk Contextualisation",
        "Anthropic Contextual Embeddings"
      ],
      "category": "retrieval",
      "intent": "Prepend a short LLM-generated description to each chunk before embedding so the chunk carries its situating context.",
      "context": "Chunks that lose context at split boundaries (pronouns, 'the company', time references) embed far from queries that name the entity explicitly.",
      "problem": "Naive chunking destroys context; queries that name entities by full name miss chunks that refer to them by pronoun.",
      "forces": [
        "An LLM call per chunk is expensive.",
        "Prompt caching of the parent document amortises the cost.",
        "Context generation must be deterministic enough to keep the index stable."
      ],
      "solution": "For each chunk, prompt an LLM with the parent document and the chunk; receive a short description that situates the chunk. Prepend that description to the chunk. Embed the prepended chunk. Store BM25 over both prepended chunks (Contextual BM25) and dense vectors (Contextual Embeddings). Compose with reranking for further gains.",
      "consequences": {
        "benefits": [
          "Reported retrieval-failure reductions: 35% (embeddings), 49% (+BM25), 67% (+reranking).",
          "Fully compatible with existing RAG pipelines."
        ],
        "liabilities": [
          "Indexing cost per chunk; only worth it for stable corpora.",
          "Chunk re-indexing required when context model changes."
        ]
      },
      "constrains": "Chunks enter the index only after contextualisation; raw chunks are not indexed.",
      "known_uses": [
        {
          "system": "Anthropic Contextual Retrieval blog post",
          "status": "available",
          "url": "https://www.anthropic.com/news/contextual-retrieval"
        }
      ],
      "related": [
        {
          "pattern": "naive-rag",
          "relation": "specialises"
        },
        {
          "pattern": "hybrid-search",
          "relation": "composes-with"
        },
        {
          "pattern": "cross-encoder-reranking",
          "relation": "composes-with"
        },
        {
          "pattern": "prompt-caching",
          "relation": "uses"
        },
        {
          "pattern": "raft",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "Introducing Contextual Retrieval",
          "authors": "Anthropic",
          "year": 2024,
          "url": "https://www.anthropic.com/news/contextual-retrieval"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "rag",
        "contextual",
        "anthropic"
      ],
      "example_scenario": "A 200-page company handbook is split into 600 chunks for retrieval. One chunk says 'the deadline is the 15th of the following month' — but a query for 'invoice deadline' won't match because the chunk doesn't say 'invoice'. Contextual Retrieval prepends a one-sentence context to each chunk: 'This chunk discusses invoice payment timing.' Now the embedding carries the context the original chunk lost when it was split."
    },
    {
      "id": "crag",
      "name": "CRAG",
      "aliases": [
        "Corrective RAG"
      ],
      "category": "retrieval",
      "intent": "Add a lightweight retrieval evaluator that grades each retrieved document and triggers corrective web search on poor retrievals.",
      "context": "Production RAG where retrieval quality varies and the system must recover gracefully from low-quality retrievals.",
      "problem": "Naive RAG passes every retrieval into the generator; bad retrievals corrupt outputs without any correction step.",
      "forces": [
        "Evaluator quality bounds correction accuracy.",
        "Web fallback adds latency and external dependency.",
        "Three-way grading (correct / ambiguous / incorrect) needs calibration."
      ],
      "solution": "After retrieval, a lightweight evaluator (T5-based or similar) grades each document as Correct, Ambiguous, or Incorrect. Correct documents go forward as-is. Ambiguous documents trigger a web search for additional evidence. Incorrect documents are discarded and replaced via web search. The generator receives the corrected document set.",
      "consequences": {
        "benefits": [
          "Robustness to poor retrievals.",
          "Plug-and-play with existing RAG."
        ],
        "liabilities": [
          "Two-stage retrieval increases latency.",
          "Web fallback has its own correctness questions."
        ]
      },
      "constrains": "The generator sees only retrieval-graded-Correct documents, optionally augmented with corrective-search results.",
      "known_uses": [
        {
          "system": "CRAG paper baseline",
          "status": "available"
        },
        {
          "system": "LangGraph Corrective-RAG tutorial",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "agentic-rag",
          "relation": "specialises"
        },
        {
          "pattern": "evaluator-optimizer",
          "relation": "uses"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Corrective Retrieval Augmented Generation",
          "authors": "Yan, Gui, Xiao, Mei, Liu, Shang, Sun, Wang",
          "year": 2024,
          "url": "https://arxiv.org/abs/2401.15884"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "rag",
        "corrective",
        "evaluator"
      ]
    },
    {
      "id": "cross-encoder-reranking",
      "name": "Cross-Encoder Reranking",
      "aliases": [
        "Reranker",
        "Two-Stage Retrieval",
        "Retrieve-Then-Rerank"
      ],
      "category": "retrieval",
      "intent": "After cheap bi-encoder or BM25 retrieval, rescore top-N candidates with a cross-encoder that jointly attends over (query, candidate).",
      "context": "Bi-encoders compress query and document independently and lose fine-grained interaction; ANN-search top-k is recall-oriented.",
      "problem": "Top-k from cheap retrieval contains both relevant and irrelevant candidates; the generator wastes context on the latter.",
      "forces": [
        "Cross-encoder cost is one model call per candidate.",
        "Latency budget caps N (typically 20-100).",
        "Fine-tuning a custom reranker is a separate effort."
      ],
      "solution": "Two-stage retrieval. Stage 1: cheap retrieve (BM25, dense, hybrid) returns top-N. Stage 2: cross-encoder scores each (query, candidate) jointly. Return top-K << N to the generator.",
      "consequences": {
        "benefits": [
          "Largest single quality win on top of contextual embeddings (Anthropic ablation).",
          "Reranker can be swapped without re-indexing."
        ],
        "liabilities": [
          "Latency adds one call per candidate.",
          "Reranker calibration on out-of-domain content."
        ]
      },
      "constrains": "The generator sees only the reranker's top-K; pre-rerank candidates are not used.",
      "known_uses": [
        {
          "system": "Cohere Rerank",
          "status": "available"
        },
        {
          "system": "BGE-reranker (open-source)",
          "status": "available"
        },
        {
          "system": "Anthropic Contextual Retrieval",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "naive-rag",
          "relation": "composes-with"
        },
        {
          "pattern": "hybrid-search",
          "relation": "composes-with"
        },
        {
          "pattern": "agentic-rag",
          "relation": "composes-with"
        },
        {
          "pattern": "contextual-retrieval",
          "relation": "composes-with"
        },
        {
          "pattern": "hyde",
          "relation": "composes-with"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Passage Re-ranking with BERT",
          "authors": "Nogueira, Cho",
          "year": 2019,
          "url": "https://arxiv.org/abs/1901.04085"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "rag",
        "rerank",
        "two-stage"
      ],
      "applicability": {
        "use_when": [
          "Initial retrieval returns a noisy top-100 and accuracy of top-5 matters.",
          "Inference budget can afford a cross-encoder pass on each candidate.",
          "Downstream LLM context can only fit a small number of chunks."
        ],
        "do_not_use_when": [
          "Latency target is sub-100ms end-to-end; cross-encoders blow it.",
          "Initial retrieval is already precise (e.g., exact ID lookup).",
          "Inference cost is the bottleneck and recall@k from the bi-encoder is good enough."
        ]
      },
      "diagram": {
        "type": "flow",
        "mermaid": "flowchart LR\n  Q[Query] --> Retr[Bi-encoder retrieval, top-100]\n  Retr --> CE[Cross-encoder scores query against each candidate]\n  CE --> Rank[Rerank by score]\n  Rank --> Top[Top-5 to LLM]",
        "caption": "Cross-Encoder Reranking trades inference cost for sharper top-k by scoring each candidate against the query directly."
      },
      "example_scenario": "A legal-research agent retrieves 100 candidate paragraphs from a corpus of contracts that mention 'force majeure'. Many are off-topic. Before showing them to the LLM, a small cross-encoder model scores each candidate against the user's exact question, picks the top 5, and discards the rest. The LLM only ever reads the sharpest results.",
      "variants": [
        {
          "name": "Bi-encoder + cross-encoder cascade",
          "summary": "A fast bi-encoder retrieves the top 100; a slow cross-encoder reranks them; the top 5 go to the LLM.",
          "distinguishing_factor": "two-stage cascade",
          "when_to_use": "Default. Best balance of recall and precision."
        },
        {
          "name": "LLM-as-reranker",
          "summary": "Skip the cross-encoder entirely; use a cheaper LLM (e.g., Haiku) to score and rank candidates against the query.",
          "distinguishing_factor": "LLM in the rank loop",
          "when_to_use": "No good cross-encoder is available for the domain; LLM cost has come down enough."
        },
        {
          "name": "Listwise reranker",
          "summary": "Send all candidates to the reranker at once; it outputs a re-ordered list rather than per-candidate scores.",
          "distinguishing_factor": "list-aware ranking",
          "when_to_use": "Relative ordering matters more than absolute scores; available only with newer rerankers."
        }
      ]
    },
    {
      "id": "graphrag",
      "name": "GraphRAG",
      "aliases": [
        "Graph-Based RAG",
        "Knowledge Graph RAG"
      ],
      "category": "retrieval",
      "intent": "Build an LLM-extracted entity-and-relation knowledge graph plus hierarchical community summaries, then answer global queries via map-reduce over those summaries.",
      "context": "Sensemaking and corpus-level questions ('what are the main themes?') that naive top-k retrieval cannot answer because they require seeing the whole.",
      "problem": "Naive RAG retrieves local chunks and cannot answer global queries; chunk-level retrieval is mismatched to corpus-level questions.",
      "forces": [
        "Indexing cost is high (LLM calls per entity, relation, community).",
        "Graph quality depends on extraction prompts.",
        "Local-search vs global-search modes serve different query types and must be routed."
      ],
      "solution": "Index time: extract entities and relations from chunks; build a knowledge graph; cluster into hierarchical communities; summarise each community. Query time: classify query as local (entity-specific) or global (corpus-wide). Local queries use entity-anchored retrieval; global queries map-reduce over community summaries.",
      "consequences": {
        "benefits": [
          "Answers corpus-level sensemaking questions naive RAG cannot.",
          "Communities are inspectable artefacts of the corpus."
        ],
        "liabilities": [
          "High indexing cost (orders of magnitude more LLM calls).",
          "Entity extraction errors cascade through the graph."
        ]
      },
      "constrains": "Global queries operate only on community summaries, not raw chunks; local queries operate only on entity-anchored neighbourhoods.",
      "known_uses": [
        {
          "system": "Microsoft GraphRAG (open source)",
          "status": "available",
          "url": "https://github.com/microsoft/graphrag"
        }
      ],
      "related": [
        {
          "pattern": "naive-rag",
          "relation": "alternative-to"
        },
        {
          "pattern": "map-reduce",
          "relation": "uses"
        },
        {
          "pattern": "knowledge-graph-memory",
          "relation": "composes-with"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "From Local to Global: A Graph RAG Approach to Query-Focused Summarization",
          "authors": "Edge, Trinh, Cheng, Bradley, Chao, Mody, Truitt, Metropolitansky, Ness, Larson",
          "year": 2024,
          "url": "https://arxiv.org/abs/2404.16130"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "rag",
        "graph",
        "sensemaking"
      ]
    },
    {
      "id": "hybrid-search",
      "name": "Hybrid Search",
      "aliases": [
        "BM25 + Dense",
        "Lexical + Semantic Retrieval"
      ],
      "category": "retrieval",
      "intent": "Combine sparse lexical retrieval (BM25) with dense vector retrieval and fuse the results.",
      "context": "Queries vary: some are keyword-matchable (exact identifiers, names), others are semantic; pure dense or pure sparse retrieval misses one or the other.",
      "problem": "Dense retrieval misses exact matches; sparse retrieval misses paraphrase. Each alone leaves recall on the table.",
      "forces": [
        "Score fusion (RRF, weighted sum, learned) is a design choice.",
        "Two indexes mean two pipelines to maintain.",
        "Tuning fusion weights is empirical and corpus-specific."
      ],
      "solution": "Index the corpus twice: BM25 for sparse, dense embeddings for semantic. At query time, retrieve top-k from each, fuse with Reciprocal Rank Fusion or weighted aggregation. Pass the fused top-N forward (typically into a reranker). Do not weight raw scores directly; use rank-based fusion (RRF) or score-normalised aggregation, since BM25 and dense scores live on incompatible scales.",
      "consequences": {
        "benefits": [
          "Recall improvement over either alone, especially for mixed-vocabulary corpora.",
          "Robust to embedding model weaknesses on rare terms."
        ],
        "liabilities": [
          "Two indexes to keep in sync.",
          "Fusion tuning is empirical."
        ]
      },
      "constrains": "The retrieval set is the fusion of sparse and dense top-k; neither alone is the input to downstream stages.",
      "known_uses": [
        {
          "system": "Anthropic Contextual Retrieval",
          "status": "available"
        },
        {
          "system": "Most production RAG (Pinecone, Weaviate, Elastic Hybrid)",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "naive-rag",
          "relation": "specialises"
        },
        {
          "pattern": "cross-encoder-reranking",
          "relation": "composes-with"
        },
        {
          "pattern": "contextual-retrieval",
          "relation": "composes-with"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Reciprocal Rank Fusion outperforms Condorcet and individual Rank Learning Methods",
          "authors": "Cormack, Clarke, Buettcher",
          "year": 2009,
          "url": "https://dl.acm.org/doi/10.1145/1571941.1572114"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "rag",
        "hybrid",
        "bm25"
      ],
      "applicability": {
        "use_when": [
          "Queries mix semantic intent with rare tokens (codes, IDs, proper nouns) that embeddings miss.",
          "The corpus is heterogeneous enough that one retriever loses recall on part of it.",
          "Latency budget tolerates two retrievers plus a fusion step."
        ],
        "do_not_use_when": [
          "The corpus is uniformly conceptual; dense alone is enough.",
          "The corpus is uniformly keyword-driven; BM25 alone is enough.",
          "Sub-50ms p95 retrieval is required and the second retriever blows the budget."
        ]
      },
      "diagram": {
        "type": "flow",
        "mermaid": "flowchart LR\n  Q[Query] --> S[Sparse retrieval BM25]\n  Q --> D[Dense retrieval embeddings]\n  S --> F[Fusion / Reciprocal Rank Fusion]\n  D --> F\n  F --> R[Top-k results]",
        "caption": "Hybrid Search runs sparse and dense retrievers in parallel and fuses their rankings before downstream stages."
      },
      "example_scenario": "A coding-assistant searches its codebase for 'how do we authenticate with Stripe?' Pure semantic search misses files that mention 'stripe-api-key' verbatim; pure keyword search misses files that talk about 'payment processor authentication'. Hybrid search runs both at once: a keyword scorer catches the exact tokens, an embedding scorer catches the conceptual matches, and a fusion step blends the two ranked lists.",
      "variants": [
        {
          "name": "Reciprocal Rank Fusion (RRF)",
          "summary": "Each retriever produces a ranked list; final score for a doc is the sum of 1/(k+rank) across lists. Tunes via the constant k (~60 in published work).",
          "distinguishing_factor": "rank-based fusion, no score normalisation",
          "when_to_use": "Default when retrievers produce non-comparable scores (BM25 floats vs cosine similarities)."
        },
        {
          "name": "Weighted score fusion",
          "summary": "Each retriever returns scores normalised to [0,1]; final score is a tunable convex combination (e.g. 0.4 * sparse + 0.6 * dense).",
          "distinguishing_factor": "score-based fusion, requires normalisation",
          "when_to_use": "When retrievers produce comparable scores or you have an eval set to tune weights against."
        },
        {
          "name": "Router-based hybrid",
          "summary": "A classifier inspects the query and routes to sparse OR dense retrieval (not both). 'How do I X' goes dense; 'order #4471' goes sparse.",
          "distinguishing_factor": "one retriever per query",
          "when_to_use": "Latency budget cannot afford two retrievers and the query mix is bimodal."
        }
      ]
    },
    {
      "id": "hyde",
      "name": "HyDE",
      "aliases": [
        "Hypothetical Document Embeddings"
      ],
      "category": "retrieval",
      "intent": "Have the LLM write a hypothetical answer document, embed it, and use it as the retrieval query.",
      "context": "Short or underspecified queries embed far from long-form passages in dense vector space; supervised relevance data is absent.",
      "problem": "Query-document length and style asymmetry hurts dense retrieval recall on short queries.",
      "forces": [
        "Hallucinated documents that miss the topic redirect retrieval badly.",
        "Adds an LLM call per query.",
        "Often paired with reranking to recover from off-topic hallucinations."
      ],
      "solution": "On query: prompt the LLM to draft a hypothetical answer to the query. Embed the hypothetical answer. Retrieve top-k by similarity to that embedding (not the original query). Pass the retrieved chunks into normal RAG.",
      "consequences": {
        "benefits": [
          "Zero-shot improvement; no encoder fine-tuning.",
          "Particularly strong on short, underspecified queries."
        ],
        "liabilities": [
          "Off-topic hallucinations cause retrieval drift.",
          "One extra LLM call per query."
        ]
      },
      "constrains": "Retrieval queries the index with the hypothetical answer's embedding, not the user query's embedding.",
      "known_uses": [
        {
          "system": "LangChain HyDE retriever",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "naive-rag",
          "relation": "specialises"
        },
        {
          "pattern": "cross-encoder-reranking",
          "relation": "composes-with"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Precise Zero-Shot Dense Retrieval without Relevance Labels",
          "authors": "Gao, Ma, Lin, Callan",
          "year": 2022,
          "url": "https://arxiv.org/abs/2212.10496"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "rag",
        "retrieval",
        "embedding"
      ]
    },
    {
      "id": "naive-rag",
      "name": "Naive RAG",
      "aliases": [
        "Retrieval-Augmented Generation",
        "Top-K Retrieve-and-Stuff"
      ],
      "category": "retrieval",
      "intent": "Condition the generator on top-k chunks retrieved from an external dense index so knowledge lives outside parameters.",
      "context": "The agent needs information that lives in a corpus too large to fit in context, and that may change without retraining.",
      "problem": "Parametric LMs hallucinate, cannot cite, and cannot be updated without retraining; query-time external knowledge is needed.",
      "forces": [
        "Chunk size trades context loss for retrieval recall.",
        "Embedding choice constrains retrieval quality.",
        "Single-shot retrieval misses multi-hop questions."
      ],
      "solution": "Chunk the corpus. Embed each chunk with a dense encoder. At query time, embed the query, retrieve top-k by similarity, prepend chunks to the prompt, generate. The simplest production RAG pipeline.",
      "consequences": {
        "benefits": [
          "Knowledge updates without retraining.",
          "Citations become possible."
        ],
        "liabilities": [
          "Chunk boundaries destroy context.",
          "Top-k retrieval is recall-oriented; precision suffers without reranking.",
          "No iterative retrieval; multi-hop fails."
        ]
      },
      "constrains": "The generator may use only retrieved chunks plus its parametric memory; the retrieval set is the boundary.",
      "known_uses": [
        {
          "system": "LangChain / LlamaIndex default RAG",
          "status": "available",
          "url": "https://www.langchain.com/"
        },
        {
          "system": "Most enterprise document-QA deployments",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "hyde",
          "relation": "generalises"
        },
        {
          "pattern": "cross-encoder-reranking",
          "relation": "composes-with"
        },
        {
          "pattern": "contextual-retrieval",
          "relation": "generalises"
        },
        {
          "pattern": "graphrag",
          "relation": "alternative-to"
        },
        {
          "pattern": "agentic-rag",
          "relation": "specialises"
        },
        {
          "pattern": "naive-rag-first",
          "relation": "conflicts-with",
          "note": "Naive RAG is fine; treating it as the only answer is the anti-pattern."
        },
        {
          "pattern": "chain-of-verification",
          "relation": "composes-with"
        },
        {
          "pattern": "vector-memory",
          "relation": "generalises"
        },
        {
          "pattern": "citation-streaming",
          "relation": "complements"
        },
        {
          "pattern": "raft",
          "relation": "generalises"
        },
        {
          "pattern": "hybrid-search",
          "relation": "generalises"
        },
        {
          "pattern": "hallucinated-citations",
          "relation": "alternative-to"
        },
        {
          "pattern": "app-exploration-phase",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
          "authors": "Lewis, Perez, Piktus, Petroni, Karpukhin, Goyal, Küttler, Lewis, Yih, Rocktäschel, Riedel, Kiela",
          "year": 2020,
          "url": "https://arxiv.org/abs/2005.11401"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "rag",
        "retrieval",
        "vector"
      ]
    },
    {
      "id": "raft",
      "name": "RAFT",
      "aliases": [
        "Retrieval-Augmented Fine-Tuning",
        "Distractor-Robust RAG"
      ],
      "category": "retrieval",
      "intent": "Train the model to ignore irrelevant retrieved documents (distractors) in a domain-specific RAG setting.",
      "context": "Domain-specific RAG where retrieval inevitably mixes relevant and irrelevant documents; the off-the-shelf model is over-confident on distractors.",
      "problem": "Generic RAG models are fooled by topically-similar distractors; the answer drifts to the loudest irrelevant source.",
      "forces": [
        "Training data construction (oracle docs + distractors) is its own pipeline.",
        "Domain shift between training and serving distractors.",
        "Trade-off between generalisation and domain specialisation."
      ],
      "solution": "Construct training examples where some documents are oracle and others are distractors. Train the model to cite oracle documents and ignore distractors. Couples chain-of-thought with citation discipline.",
      "consequences": {
        "benefits": [
          "Robustness to distractor documents in domain RAG.",
          "Citation discipline improves."
        ],
        "liabilities": [
          "Training data effort.",
          "Domain-specific; transfer between domains is partial."
        ]
      },
      "constrains": "Cited claims must come from documents marked oracle in training; distractor citations are penalised.",
      "known_uses": [
        {
          "system": "RAFT paper experiments",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "naive-rag",
          "relation": "specialises"
        },
        {
          "pattern": "contextual-retrieval",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "RAFT: Adapting Language Model to Domain Specific RAG",
          "authors": "Zhang, Patil, Jain, Shen, Zaharia, Stoica, Gonzalez",
          "year": 2024,
          "url": "https://arxiv.org/abs/2403.10131"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "rag",
        "training",
        "domain"
      ]
    },
    {
      "id": "self-rag",
      "name": "Self-RAG",
      "aliases": [
        "Self-Reflective RAG"
      ],
      "category": "retrieval",
      "intent": "Fine-tune the model to emit reflection tokens that decide when to retrieve, evaluate retrieved relevance, and assess generated support.",
      "context": "Retrieval-augmented generation where the model needs to reason about whether to retrieve, whether retrieved evidence is relevant, and whether the generation is supported.",
      "problem": "Static retrieve-then-generate retrieves regardless of need and generates regardless of evidence quality; both wastes calls and admits hallucination.",
      "forces": [
        "Token vocabulary expansion adds training complexity.",
        "Reflection tokens must be enforced at inference, not just trained.",
        "Self-evaluation correlates with the model's blind spots."
      ],
      "solution": "A critic model is first trained to label data with reflection tokens. The generator is then fine-tuned on the labeled data to emit four reflection tokens inline at inference: [Retrieve], [IsRel] (is retrieved evidence relevant?), [IsSup] (is generation supported?), [IsUse] (is generation useful?). The host enforces the reflection grammar and uses tokens to control flow.",
      "consequences": {
        "benefits": [
          "Adaptive retrieval: skip when not needed.",
          "Inline self-evaluation grounds generation."
        ],
        "liabilities": [
          "Requires fine-tuning; not zero-shot.",
          "Reflection-token quality bounded by training data."
        ]
      },
      "constrains": "Generation steps are gated by the reflection grammar; the model cannot generate freely without emitting the appropriate reflection tokens.",
      "known_uses": [
        {
          "system": "Self-RAG paper baseline",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "agentic-rag",
          "relation": "specialises"
        },
        {
          "pattern": "reflection",
          "relation": "uses"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection",
          "authors": "Asai, Wu, Wang, Sil, Hajishirzi",
          "year": 2023,
          "url": "https://arxiv.org/abs/2310.11511"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "rag",
        "self-reflection",
        "fine-tuning"
      ]
    },
    {
      "id": "automatic-workflow-search",
      "name": "Automatic Workflow Search",
      "aliases": [
        "AFlow",
        "Workflow Synthesis",
        "MCTS over Agent Graphs"
      ],
      "category": "routing-composition",
      "intent": "Treat the agent's workflow (a graph of LLM-invoking nodes) as an artefact to search; use Monte Carlo Tree Search guided by an eval benchmark to discover the best workflow, then deploy it.",
      "context": "A repeatable task domain (coding, math, QA) where outputs can be scored against a benchmark, and where the right composition of routing, planning, ensembling, review, and revise nodes is not known a priori.",
      "problem": "Hand-designing the optimal agent workflow per domain is expensive, brittle, and biased by the designer's repertoire of patterns.",
      "forces": [
        "There is a combinatorial space of workflows.",
        "Each workflow run costs money to evaluate.",
        "Search needs a signal (benchmark scores) plus an explore/exploit policy.",
        "Workflows have to be representable as code or as a graph for search to work."
      ],
      "solution": "Represent each candidate workflow as code or a graph of nodes (router, planner, ensemble, review, revise, executor). Use MCTS — selection by UCB-style scoring on past benchmark performance, expansion by code mutations or graph edits, simulation by running the workflow on the eval set, backpropagation of scores. After a search budget, deploy the best-scoring workflow. Use a library of operators (Ensemble, Review, Revise) to constrain the search space.",
      "structure": "Search: workflow_graph -> mutate -> run on eval set -> score -> MCTS update -> repeat -> best_workflow -> deploy.",
      "consequences": {
        "benefits": [
          "Discovers non-obvious workflow compositions a human designer would not try.",
          "Cheaper smaller models reach larger-model performance on some benchmarks.",
          "The search artefact is a reusable, inspectable workflow."
        ],
        "liabilities": [
          "Eval set quality bounds discovered workflow quality.",
          "Compute-intensive: many workflow evaluations per search.",
          "Risk of overfitting to the eval set; held-out eval needed."
        ]
      },
      "constrains": "No workflow may be deployed that was not measured against the held-out eval set; ad-hoc human edits to a discovered workflow re-enter the search.",
      "known_uses": [
        {
          "system": "AFlow (DeepWisdom + HKUST(GZ))",
          "note": "MCTS over code-represented workflows; outperforms hand-designed baselines by 5.7% average.",
          "status": "available",
          "url": "https://github.com/FoundationAgents/AFlow"
        }
      ],
      "related": [
        {
          "pattern": "eval-harness",
          "relation": "uses"
        },
        {
          "pattern": "eval-as-contract",
          "relation": "complements"
        },
        {
          "pattern": "lats",
          "relation": "complements",
          "note": "LATS searches reasoning trees; AFlow searches workflow graphs."
        },
        {
          "pattern": "spec-first-agent",
          "relation": "alternative-to"
        },
        {
          "pattern": "best-of-n",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "AFlow: Automating Agentic Workflow Generation",
          "authors": "Zhang et al.",
          "year": 2024,
          "url": "https://arxiv.org/abs/2410.10762"
        }
      ],
      "status_in_practice": "experimental",
      "tags": [
        "workflow",
        "search",
        "china-origin",
        "aflow",
        "mcts"
      ]
    },
    {
      "id": "circuit-breaker",
      "name": "Circuit Breaker",
      "aliases": [
        "Failure Trip",
        "Rate-Limit Trip"
      ],
      "category": "routing-composition",
      "intent": "Stop calling a failing dependency for a cooldown period after error rates exceed a threshold.",
      "context": "Agents that depend on external APIs which can fail (rate limits, outages, regional incidents); retries amplify the problem.",
      "problem": "Hammering a failing dependency wastes cost, increases latency, and can block legitimate traffic when rate limits are involved.",
      "forces": [
        "Threshold tuning trades fast detection for false trips.",
        "Cooldown duration trades availability for stability.",
        "Per-endpoint vs global breakers differ on blast radius."
      ],
      "solution": "Track per-dependency error rate over a window. When error rate exceeds a threshold, 'open' the breaker: route calls to fallback (or fail fast) for a cooldown. After cooldown, allow trial calls; close the breaker on success.",
      "consequences": {
        "benefits": [
          "Cost and latency under partial outages drop.",
          "Upstream dependencies recover without retry storms."
        ],
        "liabilities": [
          "False trips degrade availability when the error was transient.",
          "Tuning is empirical."
        ]
      },
      "constrains": "When the breaker is open, the dependency must not be called; only fallback paths may run.",
      "known_uses": [
        {
          "system": "Standard pattern in microservice frameworks; transferred to agent stacks",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "fallback-chain",
          "relation": "composes-with"
        },
        {
          "pattern": "rate-limiting",
          "relation": "complements"
        },
        {
          "pattern": "exception-recovery",
          "relation": "complements"
        },
        {
          "pattern": "provider-fallback",
          "relation": "complements"
        },
        {
          "pattern": "kill-switch",
          "relation": "composes-with"
        },
        {
          "pattern": "graceful-degradation",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "book",
          "title": "Release It! (Michael Nygard)",
          "year": 2007,
          "url": "https://pragprog.com/titles/mnee2/release-it-second-edition/"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "routing",
        "reliability",
        "breaker"
      ]
    },
    {
      "id": "dynamic-scaffolding",
      "name": "Dynamic Scaffolding",
      "aliases": [
        "Adaptive Prompting",
        "Just-in-Time Context"
      ],
      "category": "routing-composition",
      "intent": "Inject task-specific scaffolding (examples, hints, schemas) into the prompt only when the task type warrants it.",
      "context": "Agents handling heterogeneous tasks where always-present scaffolding wastes tokens and sometimes biases outputs.",
      "problem": "Static prompts either include everything (wasteful, sometimes misleading) or include nothing (under-scaffolded for hard cases).",
      "forces": [
        "Detection of when scaffolding helps is itself a problem.",
        "Scaffolding library curation effort.",
        "Compositional scaffolding (multiple scaffolds in one prompt) interacts unpredictably."
      ],
      "solution": "Maintain a library of scaffolds (few-shot examples, schemas, hints) keyed by task type or feature. At runtime, classify the task and inject the matching scaffolds. Audit which scaffolds fired per request.",
      "consequences": {
        "benefits": [
          "Token efficiency.",
          "Targeted quality lift on hard cases."
        ],
        "liabilities": [
          "Scaffold library maintenance.",
          "Misclassification injects wrong scaffolds."
        ]
      },
      "constrains": "Scaffolds load only on matching task classification; default tasks see the bare prompt.",
      "known_uses": [
        {
          "system": "Avramovic Dynamic Scaffolding pattern",
          "status": "available"
        },
        {
          "system": "DSPy compiled prompts (signature-driven scaffolding)",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "routing",
          "relation": "uses"
        },
        {
          "pattern": "context-window-packing",
          "relation": "complements"
        },
        {
          "pattern": "agent-skills",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "repo",
          "title": "zeljkoavramovic/agentic-design-patterns",
          "url": "https://github.com/zeljkoavramovic/agentic-design-patterns"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "prompting",
        "scaffolding",
        "dynamic"
      ]
    },
    {
      "id": "fallback-chain",
      "name": "Fallback Chain",
      "aliases": [
        "Cascade Fallback",
        "Try-Then-Try-Else",
        "Tool Failed Fall Back",
        "Provider Failed Retry Other"
      ],
      "category": "routing-composition",
      "intent": "Try a primary handler; on failure or low confidence, fall through to a sequence of fallback handlers.",
      "context": "Production agents where any single model or tool can fail (rate limits, errors, low-confidence outputs); end-users still need an answer.",
      "problem": "Single-handler failure cascades to the user as an outage; low-confidence outputs degrade UX silently.",
      "forces": [
        "Fallback handlers may be slower or worse.",
        "Detecting 'failure' requires a confidence signal.",
        "Cascade depth must be bounded."
      ],
      "solution": "Define an ordered chain of handlers. Each handler returns either a confident answer or a failure/low-confidence signal. On failure, the next handler runs. Final fallback is a generic 'I don't know' rather than a wrong answer.",
      "consequences": {
        "benefits": [
          "Graceful degradation under partial failures.",
          "Each layer can be tuned independently."
        ],
        "liabilities": [
          "Cumulative latency on full cascade.",
          "Hides quality regressions in the primary."
        ]
      },
      "constrains": "Each handler may produce a result or pass; only the chain may decide to terminate.",
      "known_uses": [
        {
          "system": "Most production routing layers",
          "status": "available"
        },
        {
          "system": "AI-Standards Fallback Chain pattern",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "routing",
          "relation": "complements"
        },
        {
          "pattern": "circuit-breaker",
          "relation": "composes-with"
        },
        {
          "pattern": "multi-model-routing",
          "relation": "complements"
        },
        {
          "pattern": "provider-fallback",
          "relation": "generalises"
        },
        {
          "pattern": "confidence-reporting",
          "relation": "complements"
        },
        {
          "pattern": "exception-recovery",
          "relation": "complements"
        },
        {
          "pattern": "graceful-degradation",
          "relation": "complements"
        },
        {
          "pattern": "open-weight-cascade",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "How to add fallbacks to a runnable",
          "authors": "LangChain",
          "year": 2024,
          "url": "https://python.langchain.com/docs/how_to/fallbacks/"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "routing",
        "fallback",
        "reliability"
      ]
    },
    {
      "id": "graceful-degradation",
      "name": "Graceful Degradation",
      "aliases": [
        "Feature-Level Fallback",
        "Degraded Mode"
      ],
      "category": "routing-composition",
      "intent": "When a dependency fails, downgrade the user-facing experience to a working subset rather than failing entirely.",
      "context": "Agent products that depend on multiple optional capabilities (retrieval, vision, code-execution) where any single dependency may fail without the whole product needing to.",
      "problem": "Failing entirely on any dependency outage frustrates users; silently producing degraded output without disclosure erodes trust.",
      "forces": [
        "Degradation paths multiply test surface.",
        "User-visible degradation messaging is its own UX problem.",
        "Some failures must hard-fail (PII path, payment)."
      ],
      "solution": "Define per-feature fallback behaviour. On dependency failure, downgrade (text-only when vision fails, no citations when retrieval fails, simple summary when code execution fails) and disclose to the user that degraded mode is active. Feature flags double as degradation switches.",
      "consequences": {
        "benefits": [
          "Product resilience under partial outages.",
          "User trust via transparent degradation."
        ],
        "liabilities": [
          "Test matrix grows with feature count.",
          "Degraded modes can themselves have bugs."
        ]
      },
      "constrains": "On failure, the agent must produce a degraded response with disclosure rather than a generic error.",
      "known_uses": [
        {
          "system": "Perplexity (citations missing under retrieval issues)",
          "status": "available",
          "url": "https://www.perplexity.ai/"
        },
        {
          "system": "ChatGPT (vision unavailable falls back to text)",
          "status": "available",
          "url": "https://chat.openai.com/"
        }
      ],
      "related": [
        {
          "pattern": "fallback-chain",
          "relation": "complements"
        },
        {
          "pattern": "circuit-breaker",
          "relation": "uses"
        },
        {
          "pattern": "exception-recovery",
          "relation": "specialises"
        }
      ],
      "references": [
        {
          "type": "book",
          "title": "Release It! (Michael Nygard, ch. 4)",
          "year": 2007,
          "url": "https://pragprog.com/titles/mnee2/release-it-second-edition/"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "routing",
        "resilience",
        "degradation"
      ]
    },
    {
      "id": "mixture-of-experts-routing",
      "name": "Mixture of Experts Routing",
      "aliases": [
        "MoE Routing (Agent-Level)",
        "Expert Selection"
      ],
      "category": "routing-composition",
      "intent": "Route each request to one or more domain-expert agents, where each expert holds deep capability in a narrow area.",
      "context": "Heterogeneous domains where one agent cannot credibly cover all expertise (legal + medical + finance + technical).",
      "problem": "A generalist agent's depth in any one domain is shallow; users in specialist domains feel under-served.",
      "forces": [
        "Expert maintenance scales with domain count.",
        "Routing classification must match expert coverage.",
        "Cross-domain queries challenge single-expert routing."
      ],
      "solution": "Define experts (specialised system prompts, tool palettes, possibly fine-tuned models). A router classifies queries by domain. Route to one expert (top-1) or to multiple experts whose outputs are aggregated. Distinct from standard routing by emphasising deep specialisation per expert.",
      "consequences": {
        "benefits": [
          "Depth per domain.",
          "Independent expert evolution."
        ],
        "liabilities": [
          "Domain count grows expert maintenance linearly.",
          "Cross-domain queries fall through cracks."
        ]
      },
      "constrains": "Each request is bound to one or more named experts; generalist fallback is explicit, not default.",
      "known_uses": [
        {
          "system": "Vendor knowledge-base products with domain agents",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "routing",
          "relation": "specialises"
        },
        {
          "pattern": "supervisor",
          "relation": "complements"
        },
        {
          "pattern": "role-assignment",
          "relation": "complements"
        },
        {
          "pattern": "dynamic-expert-recruitment",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Mixture-of-Agents Enhances Large Language Model Capabilities",
          "authors": "Wang et al.",
          "year": 2024,
          "url": "https://arxiv.org/abs/2406.04692"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "routing",
        "experts",
        "specialisation"
      ]
    },
    {
      "id": "multi-model-routing",
      "name": "Multi-Model Routing",
      "aliases": [
        "Cascade Routing",
        "Cheap-First Routing",
        "Model Cascading"
      ],
      "category": "routing-composition",
      "intent": "Send each request to the cheapest model that can handle it well.",
      "context": "Model prices and capabilities vary by an order of magnitude; using the strongest model for every call is wasteful.",
      "problem": "Static model choice either pays too much or misses quality on hard cases.",
      "forces": [
        "Quality bar must be measurable per request type.",
        "Cheap models hallucinate confidently; the router must not trust them blindly.",
        "Falling back from cheap to expensive on failure costs more than starting expensive."
      ],
      "solution": "Combine routing (classify the request) with a per-class model preference. Routing and filter extraction go to the cheap model; the screen-aware dialog or final answer goes to the strong model. Optionally cascade: try cheap, fall back to strong if confidence is low.",
      "consequences": {
        "benefits": [
          "Bill drops 5-10x without quality loss when class boundaries match cost boundaries.",
          "Dev/test runs naturally on cheap models."
        ],
        "liabilities": [
          "Two-model debug surface.",
          "Vendor lock-in when models diverge in tool calling."
        ]
      },
      "constrains": "Each request class is bound to a model tier; agents cannot escalate without routing approval.",
      "known_uses": [
        {
          "system": "Bobbin (Stash2Go)",
          "note": "gpt-5.4-mini for routing/filters; gpt-5.4 for screen-aware dialog.",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "routing",
          "relation": "specialises"
        },
        {
          "pattern": "cost-gating",
          "relation": "complements"
        },
        {
          "pattern": "fallback-chain",
          "relation": "complements"
        },
        {
          "pattern": "hero-agent",
          "relation": "alternative-to"
        },
        {
          "pattern": "provider-fallback",
          "relation": "complements"
        },
        {
          "pattern": "hidden-mode-switching",
          "relation": "alternative-to"
        },
        {
          "pattern": "dual-system-gui-agent",
          "relation": "used-by"
        },
        {
          "pattern": "open-weight-cascade",
          "relation": "generalises"
        },
        {
          "pattern": "multilingual-voice-agent",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "OpenAI / Anthropic model selection guides",
          "year": 2024,
          "url": "https://platform.openai.com/docs/guides/model-selection"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "routing",
        "cost",
        "cascade"
      ]
    },
    {
      "id": "open-weight-cascade",
      "name": "Open-Weight Cascade",
      "aliases": [
        "Permissive-License Cascade",
        "Sovereign Routing",
        "Self-Hostable Cascade"
      ],
      "category": "routing-composition",
      "intent": "Build a multi-model cascade where lower tiers are open-weight, self-hostable models that run inside the operator's boundary, and only escalations cross to a hosted frontier model — giving cost arbitrage *and* sovereignty.",
      "context": "European or regulated deployments that want both cost optimisation and a fast-path that does not depend on a non-EU hosted API.",
      "problem": "A naïve cheap-first cascade still falls through to a hosted frontier model for hard requests, meaning every borderline request leaks data. Open-weight-only cascades miss capability on the rare hard request.",
      "forces": [
        "Most requests are easy; cheap models handle them.",
        "Hard requests need frontier capability.",
        "Some requests must never leave the boundary regardless of difficulty.",
        "Open-weight models close the capability gap at a delay."
      ],
      "solution": "Stratify requests by sensitivity *and* difficulty before routing. (1) Sensitive requests: forced down the open-weight path even if confidence is low; degrade gracefully or refuse rather than escalate. (2) Insensitive easy requests: small open-weight model. (3) Insensitive hard requests: escalate to hosted frontier model. The router enforces the sensitivity classification before any model call.",
      "structure": "Request -> Sensitivity classifier -> [sensitive: open-weight only path] | [insensitive: cheap-first cascade with hosted frontier as fallback].",
      "consequences": {
        "benefits": [
          "Compliant fast-path for sensitive workloads.",
          "Cost arbitrage on the insensitive path.",
          "Operator can swap model tiers without re-architecting."
        ],
        "liabilities": [
          "Sensitivity classifier is the new failure surface.",
          "Quality cliff at the sensitive boundary if the open-weight tier under-performs.",
          "Operational overhead of running two stacks."
        ]
      },
      "constrains": "A request classified as sensitive may not be routed to a hosted frontier model; the hosted tier is only reachable from the insensitive path.",
      "known_uses": [
        {
          "system": "Mistral",
          "note": "Open-weight (Mistral 7B, Mixtral) plus hosted (Mistral Large, Medium 3.5) tiers — operators commonly cascade them.",
          "status": "available",
          "url": "https://mistral.ai/"
        },
        {
          "system": "Aleph Alpha PhariaAI multi-model",
          "note": "On-prem Pharia models plus optional hosted escalation.",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "multi-model-routing",
          "relation": "specialises"
        },
        {
          "pattern": "fallback-chain",
          "relation": "uses"
        },
        {
          "pattern": "sovereign-inference-stack",
          "relation": "complements"
        },
        {
          "pattern": "pii-redaction",
          "relation": "complements"
        },
        {
          "pattern": "provider-fallback",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Mistral AI — Models",
          "url": "https://mistral.ai/"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "routing",
        "sovereignty",
        "france-origin",
        "mistral"
      ]
    },
    {
      "id": "parallel-tool-calls",
      "name": "Parallel Tool Calls",
      "aliases": [
        "Concurrent Function Calls",
        "Multi-Tool Turn"
      ],
      "category": "routing-composition",
      "intent": "Allow the model to emit several independent tool calls in one assistant turn; the host executes them in parallel.",
      "context": "Tasks where the model can identify independent sub-actions in one breath and parallel execution would cut wall-clock time.",
      "problem": "Sequential tool calls waste latency on independent operations; full DAG-planning patterns are heavyweight for the simple parallel case.",
      "forces": [
        "Concurrency limits per provider.",
        "Provider must support multi-tool-call turns.",
        "Aggregation of results back into the next turn.",
        "Models sometimes emit dependent calls in one turn despite the prompt; the host must detect or document this contract."
      ],
      "solution": "The provider's API allows the assistant turn to contain multiple tool calls. The host fans them out concurrently (with bounded concurrency and rate-limit handling). Results return as multiple tool messages; the next assistant turn sees all of them.",
      "consequences": {
        "benefits": [
          "Lower wall-clock latency on parallelisable steps.",
          "Simpler than full DAG planning."
        ],
        "liabilities": [
          "Provider-specific behaviour.",
          "Host concurrency control complexity.",
          "Silent correctness bugs when accidentally-dependent calls are parallelised."
        ]
      },
      "constrains": "Tool calls in the same assistant turn are treated as independent; cross-call dependencies are not allowed within one turn.",
      "known_uses": [
        {
          "system": "OpenAI parallel function calling",
          "status": "available"
        },
        {
          "system": "Anthropic parallel tool use",
          "status": "available"
        },
        {
          "system": "Claude Code multi-tool turns",
          "status": "available"
        },
        {
          "system": "Cursor parallel reads",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "tool-use",
          "relation": "uses"
        },
        {
          "pattern": "llm-compiler",
          "relation": "alternative-to"
        },
        {
          "pattern": "parallelization",
          "relation": "specialises"
        },
        {
          "pattern": "code-as-action",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "OpenAI: Parallel function calling",
          "url": "https://platform.openai.com/docs/guides/function-calling"
        },
        {
          "type": "doc",
          "title": "Anthropic: Tool use",
          "url": "https://docs.anthropic.com/en/docs/build-with-claude/tool-use"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "tool-use",
        "parallel",
        "concurrency"
      ]
    },
    {
      "id": "parallelization",
      "name": "Parallelization",
      "aliases": [
        "Sectioning",
        "Voting",
        "Parallel Branches"
      ],
      "category": "routing-composition",
      "intent": "Run independent LLM calls concurrently and combine results.",
      "context": "The task naturally splits (sectioning) or benefits from multiple independent attempts (voting).",
      "problem": "Sequential execution of independent work wastes wall-clock time; single-attempt execution misses outliers a second look would catch.",
      "forces": [
        "Concurrency limits and rate limits.",
        "Aggregation logic for voting (majority? best? union?).",
        "Cost multiplies linearly with parallel branches."
      ],
      "solution": "Two flavours. Sectioning: split a task into independent subtasks, run them concurrently, concatenate results. Voting: run the same task multiple times, aggregate by majority or judge.",
      "consequences": {
        "benefits": [
          "Wall-clock latency drops; quality rises (voting).",
          "Independent failures isolate cleanly."
        ],
        "liabilities": [
          "Cost scales with branch count.",
          "Aggregation logic is its own correctness problem."
        ]
      },
      "constrains": "Branches cannot share state during execution; aggregation is the only join point.",
      "known_uses": [
        {
          "system": "Anthropic Building Effective Agents (Workflow #3)",
          "status": "available"
        },
        {
          "system": "Self-consistency in mathematical reasoning",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "self-consistency",
          "relation": "generalises"
        },
        {
          "pattern": "map-reduce",
          "relation": "generalises"
        },
        {
          "pattern": "best-of-n",
          "relation": "generalises"
        },
        {
          "pattern": "llm-compiler",
          "relation": "used-by"
        },
        {
          "pattern": "parallel-tool-calls",
          "relation": "generalises"
        },
        {
          "pattern": "prompt-chaining",
          "relation": "alternative-to"
        },
        {
          "pattern": "lead-researcher",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "Anthropic: Building Effective Agents",
          "year": 2024,
          "url": "https://www.anthropic.com/research/building-effective-agents"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "parallel",
        "voting",
        "concurrency"
      ]
    },
    {
      "id": "pipes-and-filters",
      "name": "Pipes and Filters",
      "aliases": [
        "Pipeline",
        "Streaming Pipeline",
        "EIP Pipeline"
      ],
      "category": "routing-composition",
      "intent": "Compose stream-shaped processing as a chain of small filters connected by pipes.",
      "context": "Data flows through several transformations (parse, classify, transform, validate, format); each transformation is independently testable.",
      "problem": "Monolithic transformations are hard to test; bespoke pipelines reinvent connection plumbing each time.",
      "forces": [
        "Filter granularity: too small = overhead; too big = back to monolith.",
        "Pipe contracts (typed messages) need agreement.",
        "Backpressure across pipes."
      ],
      "solution": "Decompose the transformation into small filters with single responsibilities. Connect them via typed pipes (function call, queue, stream). Each filter is testable in isolation. Filters can be reused across pipelines.",
      "consequences": {
        "benefits": [
          "Composability and testability.",
          "Reuse across pipelines."
        ],
        "liabilities": [
          "Pipeline visibility: hard to see end-to-end behaviour.",
          "Latency adds across stages."
        ]
      },
      "constrains": "Filters communicate only through pipes with typed contracts.",
      "known_uses": [
        {
          "system": "Enterprise Integration Patterns (Hohpe, Woolf)",
          "status": "available"
        },
        {
          "system": "LangChain Runnable composition",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "prompt-chaining",
          "relation": "generalises"
        },
        {
          "pattern": "map-reduce",
          "relation": "composes-with"
        },
        {
          "pattern": "chat-chain",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "book",
          "title": "Enterprise Integration Patterns",
          "authors": "Gregor Hohpe, Bobby Woolf",
          "year": 2003,
          "url": "https://www.enterpriseintegrationpatterns.com/"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "pipeline",
        "composition",
        "eip"
      ]
    },
    {
      "id": "prompt-chaining",
      "name": "Prompt Chaining",
      "aliases": [
        "Sequential Decomposition",
        "Pipeline of Prompts"
      ],
      "category": "routing-composition",
      "intent": "Decompose a task into a fixed sequence of LLM calls where each step's output becomes the next step's input.",
      "context": "The task is composed of identifiable sub-tasks with clear boundaries; the order is known in advance.",
      "problem": "A single mega-prompt overloads the model and makes failures hard to localise.",
      "forces": [
        "Decomposition clarity vs compounded latency.",
        "Step isolation vs error compounding across the chain.",
        "Schema rigor between steps vs pipeline flexibility."
      ],
      "solution": "Define a fixed pipeline of prompts. Each step has its own system prompt, expected output shape, and validation. A failure at step k retries step k or aborts; downstream steps run only on success.",
      "consequences": {
        "benefits": [
          "Failures localise to a step.",
          "Each step's prompt can be optimised independently."
        ],
        "liabilities": [
          "Inflexible to inputs that do not match the assumed decomposition.",
          "Latency = sum of step latencies."
        ]
      },
      "constrains": "Step k cannot bypass step k-1's output schema.",
      "known_uses": [
        {
          "system": "Anthropic Building Effective Agents (Workflow #1)",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "routing",
          "relation": "complements"
        },
        {
          "pattern": "parallelization",
          "relation": "alternative-to"
        },
        {
          "pattern": "pipes-and-filters",
          "relation": "specialises"
        },
        {
          "pattern": "chat-chain",
          "relation": "specialises"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "Anthropic: Building Effective Agents",
          "year": 2024,
          "url": "https://www.anthropic.com/research/building-effective-agents"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "pipeline",
        "workflow",
        "decomposition"
      ]
    },
    {
      "id": "provider-fallback",
      "name": "Provider Fallback",
      "aliases": [
        "Mid-Request Failover",
        "Cross-Provider Recovery"
      ],
      "category": "routing-composition",
      "intent": "When one provider's API errors mid-stream, transparently switch to another provider while preserving state.",
      "context": "Production agent products that depend on multiple LLM providers and need uptime across provider outages, rate limits, and regional incidents.",
      "problem": "Single-provider deployments take outages personally; fallback-chain handles request boundaries but cannot recover a stream that fails mid-generation.",
      "forces": [
        "Provider tool-call schemas differ; cross-provider continuation needs schema translation.",
        "Partial output reconciliation across providers.",
        "Routing logic must not amplify provider quirks."
      ],
      "solution": "A gateway proxy holds the conversation state. On stream error, it switches to a fallback provider, optionally preserving partial output, and continues with translated message format. Tool-call schemas are normalised at the gateway. Streaming clients see one continuous stream.",
      "consequences": {
        "benefits": [
          "Uptime through provider outages.",
          "Multi-provider portfolio for cost arbitrage."
        ],
        "liabilities": [
          "Schema translation has its own bugs.",
          "Quality discontinuity when providers differ in capability."
        ]
      },
      "constrains": "Failover happens transparently to the client; the client sees one provider-agnostic interface.",
      "known_uses": [
        {
          "system": "OpenRouter automatic failover",
          "status": "available"
        },
        {
          "system": "Cursor model switching on rate-limit",
          "status": "available"
        },
        {
          "system": "Portkey gateway fallback",
          "status": "available"
        },
        {
          "system": "Helicone gateway fallback",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "fallback-chain",
          "relation": "specialises"
        },
        {
          "pattern": "circuit-breaker",
          "relation": "complements"
        },
        {
          "pattern": "multi-model-routing",
          "relation": "complements"
        },
        {
          "pattern": "open-weight-cascade",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "OpenRouter: Provider Routing",
          "url": "https://openrouter.ai/docs/features/provider-routing"
        },
        {
          "type": "doc",
          "title": "Portkey Gateway: Fallback",
          "url": "https://portkey.ai/docs"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "routing",
        "failover",
        "gateway"
      ]
    },
    {
      "id": "routing",
      "name": "Routing",
      "aliases": [
        "Mode Selector",
        "Intent Classifier",
        "Task Router"
      ],
      "category": "routing-composition",
      "intent": "Classify an incoming request and dispatch it to the specialist (lane / agent / model) best suited to handle it.",
      "context": "Heterogeneous traffic where different requests benefit from different prompts, tool palettes, or models.",
      "problem": "A single prompt that handles everything either over-pays (cheap requests routed through expensive paths) or under-serves (complex requests stuck in cheap paths).",
      "forces": [
        "Routing itself costs a model call.",
        "Misrouting can be worse than not routing at all.",
        "The router needs visibility into capabilities of each downstream specialist."
      ],
      "solution": "A lightweight classifier model (often the cheapest available) returns a label. The host dispatches the request to the specialist for that label. Common lanes: command (deterministic action), agent (multi-step), chat (no tools).",
      "consequences": {
        "benefits": [
          "Cheap requests pay cheap prices.",
          "Each lane can be tuned in isolation."
        ],
        "liabilities": [
          "Two-call latency on every request.",
          "Lane definitions ossify; reclassification is hard once users learn the lanes."
        ]
      },
      "constrains": "A request gets exactly one lane; downstream specialists cannot accept work outside their declared lane.",
      "known_uses": [
        {
          "system": "Bobbin (Stash2Go)",
          "note": "mode_selector classifies intent into command / agent / chat.",
          "status": "available"
        },
        {
          "system": "Anthropic Building Effective Agents (Workflow #2)",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "multi-model-routing",
          "relation": "generalises"
        },
        {
          "pattern": "supervisor",
          "relation": "used-by"
        },
        {
          "pattern": "mixture-of-experts-routing",
          "relation": "generalises"
        },
        {
          "pattern": "fallback-chain",
          "relation": "complements"
        },
        {
          "pattern": "dynamic-scaffolding",
          "relation": "used-by"
        },
        {
          "pattern": "hero-agent",
          "relation": "alternative-to"
        },
        {
          "pattern": "disambiguation",
          "relation": "used-by"
        },
        {
          "pattern": "prompt-chaining",
          "relation": "complements"
        },
        {
          "pattern": "tool-loadout",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "Anthropic: Building Effective Agents",
          "year": 2024,
          "url": "https://www.anthropic.com/research/building-effective-agents"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "routing",
        "classifier"
      ]
    },
    {
      "id": "approval-queue",
      "name": "Approval Queue",
      "aliases": [
        "Async Approval",
        "Supervisor Inbox",
        "Approval Inbox"
      ],
      "category": "safety-control",
      "intent": "Queue agent-proposed actions for asynchronous human review while the agent continues other work.",
      "context": "Long-running agent products where some actions need human approval but blocking the agent's whole loop on every approval is impractical.",
      "problem": "Synchronous human-in-the-loop blocks the agent; ungated execution risks unsafe actions.",
      "forces": [
        "Async approval adds wall-clock delay before action lands.",
        "Approval inbox can become unmanageable at scale.",
        "Race conditions if the world changes while approval is pending."
      ],
      "solution": "Agent emits proposed action to an approval queue with context. A human (or supervisor agent) reviews the queue and approves or rejects. Approved actions are executed by the agent or by a runner. The agent can continue parallel work while waiting; some workflows pause specific branches.",
      "consequences": {
        "benefits": [
          "Human oversight without blocking throughput.",
          "Approval inbox is auditable."
        ],
        "liabilities": [
          "Inbox fatigue at scale.",
          "World drift between proposal and approval."
        ]
      },
      "constrains": "Actions in the approval queue may not execute until the approval status is set to approved.",
      "known_uses": [
        {
          "system": "Lindy approval inbox",
          "status": "available",
          "url": "https://www.lindy.ai/"
        },
        {
          "system": "Sierra supervisor escalations",
          "status": "available",
          "url": "https://sierra.ai/"
        },
        {
          "system": "GitHub Copilot Workspace plan review",
          "status": "available",
          "url": "https://githubnext.com/projects/copilot-workspace"
        }
      ],
      "related": [
        {
          "pattern": "human-in-the-loop",
          "relation": "specialises"
        },
        {
          "pattern": "compensating-action",
          "relation": "complements"
        },
        {
          "pattern": "conversation-handoff",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "Building Effective Agents",
          "authors": "Anthropic",
          "year": 2024,
          "url": "https://www.anthropic.com/engineering/building-effective-agents"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "safety",
        "approval",
        "async"
      ],
      "example_scenario": "An email-drafting agent prepares replies to 80 inbox messages overnight. Rather than send them automatically (risky) or block waiting on each one (slow), the agent writes them to an approval queue. In the morning the user reviews 80 draft replies and clicks 'send' or 'reject' on each. The agent kept moving through the inbox while waiting for the human.",
      "variants": [
        {
          "name": "Synchronous block-on-approval",
          "summary": "The agent's loop blocks on each pending approval. No further work happens until a human responds.",
          "distinguishing_factor": "loop blocks",
          "when_to_use": "Low-volume, high-stakes actions where partial progress without approval is unacceptable."
        },
        {
          "name": "Async parallel branches",
          "summary": "Approval-needing actions go to a queue; the agent continues other work that does not depend on them.",
          "distinguishing_factor": "agent makes progress in parallel",
          "when_to_use": "Default. Default for production agents handling many actions per run."
        },
        {
          "name": "Bulk approval",
          "summary": "Actions of the same shape (e.g., 80 email drafts) are batched into a single approval request the human can scan and bulk-accept.",
          "distinguishing_factor": "one approval covers many actions",
          "when_to_use": "Large numbers of low-individual-risk actions where per-action approval is fatigue."
        }
      ]
    },
    {
      "id": "compensating-action",
      "name": "Compensating Action",
      "aliases": [
        "Saga",
        "Undo Step",
        "Rollback Action"
      ],
      "category": "safety-control",
      "intent": "Pair every irreversible-looking agent action with a compensating action that can undo or counteract it.",
      "context": "Multi-step workflows where some steps succeed and later steps fail; without compensation, partial state is left inconsistent.",
      "problem": "Distributed transactions are not available across most agent tool palettes; failure mid-plan leaves the system in an inconsistent state.",
      "forces": [
        "Not every action has a clean compensator.",
        "Compensation logic is a separate code path.",
        "Idempotency matters: compensating an already-compensated action must be safe."
      ],
      "solution": "For each forward action, define a compensating action (delete-after-create, refund-after-charge, archive-after-publish). On failure mid-plan, run compensators in reverse order to restore the prior state. Idempotent compensators.",
      "consequences": {
        "benefits": [
          "Partial-failure consistency.",
          "Confidence to attempt multi-step writes."
        ],
        "liabilities": [
          "Doubles the number of action implementations.",
          "Some actions cannot truly be compensated (sent emails, public posts)."
        ]
      },
      "constrains": "Forward actions cannot be invoked without a registered compensator; uncompensable actions need explicit operator approval.",
      "known_uses": [
        {
          "system": "Saga pattern in microservices, transferred to agents",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "human-in-the-loop",
          "relation": "complements"
        },
        {
          "pattern": "provenance-ledger",
          "relation": "uses"
        },
        {
          "pattern": "approval-queue",
          "relation": "complements"
        },
        {
          "pattern": "kill-switch",
          "relation": "used-by"
        },
        {
          "pattern": "incident-response-runbook",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Sagas (Garcia-Molina, Salem)",
          "year": 1987,
          "url": "https://dl.acm.org/doi/10.1145/38713.38742"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "safety",
        "saga",
        "transaction"
      ]
    },
    {
      "id": "constitutional-charter",
      "name": "Constitutional Charter",
      "aliases": [
        "Immutable Constitution",
        "Negative Constraints",
        "Robot Laws"
      ],
      "category": "safety-control",
      "intent": "Define rules the agent reads every turn but cannot modify, encoding inviolable boundaries.",
      "context": "Long-running or self-modifying agents need a fixed point of reference; without one, recursive editing drifts the agent's identity.",
      "problem": "If the agent can edit everything, it can edit its own values; nothing stays inviolable.",
      "forces": [
        "Charter authors must encode hard constraints without paralysing the agent.",
        "Read-only at the tool layer is enforceable; read-only by exhortation is not.",
        "Charters age; updating requires human action."
      ],
      "solution": "A charter file is read into context every turn (or every tick). The tool layer enforces read-only on it; the agent has no write tool that can touch it. Updates go through an explicit operator path. Charters typically express constraints in negative form ('the agent shall not...').",
      "consequences": {
        "benefits": [
          "Stable identity across long runs and self-modifications.",
          "Explicit list of inviolable constraints, auditable separately from prompts."
        ],
        "liabilities": [
          "A bad charter codifies bad values.",
          "Charter prose adds tokens to every turn."
        ]
      },
      "constrains": "The agent cannot write the charter; updates require explicit operator action outside the agent loop.",
      "known_uses": [
        {
          "system": "Sparrot",
          "note": "charter.md read every tick; read-only at tool layer.",
          "status": "available"
        },
        {
          "system": "Anthropic Constitutional AI",
          "status": "available",
          "url": "https://www.anthropic.com/research/constitutional-ai-harmlessness-from-ai-feedback"
        }
      ],
      "related": [
        {
          "pattern": "quorum-on-mutation",
          "relation": "complements"
        },
        {
          "pattern": "inner-critic",
          "relation": "used-by"
        },
        {
          "pattern": "refusal",
          "relation": "used-by"
        },
        {
          "pattern": "prompt-bloat",
          "relation": "alternative-to"
        },
        {
          "pattern": "sovereign-inference-stack",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Constitutional AI: Harmlessness from AI Feedback",
          "authors": "Bai et al.",
          "year": 2022,
          "url": "https://arxiv.org/abs/2212.08073"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "safety",
        "constitution",
        "immutable"
      ]
    },
    {
      "id": "conversation-handoff",
      "name": "Conversation Handoff to Human",
      "aliases": [
        "Escalation",
        "Live-Agent Handoff",
        "Human Takeover"
      ],
      "category": "safety-control",
      "intent": "Transfer the entire conversation thread from agent to human operator, with state transfer and return primitive.",
      "context": "Customer-facing agent products where some conversations exceed agent capability or require regulatory accountability.",
      "problem": "Human-in-the-loop gates approve or reject discrete actions; this pattern transfers ownership of the whole conversation, which is structurally different.",
      "forces": [
        "Handoff loses context fidelity.",
        "Sticky routing (return to same operator on follow-up) needs auth + session plumbing.",
        "Return primitive (back to agent) requires re-grounding."
      ],
      "solution": "On escalation trigger (low confidence, explicit user request, policy violation), the agent emits a structured handoff envelope with conversation summary, ticket number, and human operator queue assignment. Operator takes ownership; agent disengages. On return, agent resumes with operator's note in context.",
      "consequences": {
        "benefits": [
          "Hard cases reach humans.",
          "Customer experience preserved across the boundary."
        ],
        "liabilities": [
          "Operator queue capacity bounds scale.",
          "State transfer has fidelity loss."
        ]
      },
      "constrains": "Once handed off, the agent does not generate to the user; the operator owns the thread until explicit return.",
      "known_uses": [
        {
          "system": "Sierra agent escalations",
          "status": "available",
          "url": "https://sierra.ai/"
        },
        {
          "system": "Intercom Fin handoffs",
          "status": "available"
        },
        {
          "system": "Zendesk AI handoffs",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "human-in-the-loop",
          "relation": "alternative-to"
        },
        {
          "pattern": "approval-queue",
          "relation": "complements"
        },
        {
          "pattern": "handoff",
          "relation": "specialises"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Intercom Fin: Set up Fin handoffs",
          "url": "https://www.intercom.com/help/en/articles/9357912-set-up-fin-handoffs"
        },
        {
          "type": "doc",
          "title": "Sierra agent escalations",
          "url": "https://sierra.ai"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "safety",
        "escalation",
        "handoff"
      ]
    },
    {
      "id": "cost-gating",
      "name": "Cost Gating",
      "aliases": [
        "Budget Cap",
        "Cost-Aware Approval"
      ],
      "category": "safety-control",
      "intent": "Block actions whose expected cost exceeds a threshold without explicit user (or operator) acknowledgement.",
      "context": "Some agent actions cost real money (large model calls, paid APIs); running them silently is rude or financially dangerous.",
      "problem": "Agents that act first and bill later create surprise costs; users learn to distrust the agent.",
      "forces": [
        "Estimating cost up front requires a model of what will happen.",
        "Confirmation-fatigue: too many approvals train users to ignore them.",
        "Budgets at multiple horizons (per call, per session, per month)."
      ],
      "solution": "Estimate cost before invoking the expensive action. If the estimate exceeds the threshold, surface it to the user (or operator) and require explicit approval. Track running totals against per-session and per-period budgets.",
      "consequences": {
        "benefits": [
          "Predictable bill.",
          "Forces the system to know its own cost shape."
        ],
        "liabilities": [
          "Estimation errors; actual cost can exceed estimate.",
          "Friction at the wrong moment can sour UX."
        ]
      },
      "constrains": "Actions exceeding the threshold cannot run without explicit acknowledgement.",
      "known_uses": [
        {
          "system": "Knitting-DSL Pipeline (Stash2Go)",
          "note": "scopedLlmFixer.js runs only when user accepts the cost.",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "human-in-the-loop",
          "relation": "specialises"
        },
        {
          "pattern": "step-budget",
          "relation": "complements"
        },
        {
          "pattern": "multi-model-routing",
          "relation": "complements"
        },
        {
          "pattern": "prompt-caching",
          "relation": "complements"
        },
        {
          "pattern": "extended-thinking",
          "relation": "complements"
        },
        {
          "pattern": "cost-observability",
          "relation": "complements"
        },
        {
          "pattern": "rate-limiting",
          "relation": "complements"
        },
        {
          "pattern": "unbounded-subagent-spawn",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Rate limits",
          "year": 2025,
          "url": "https://docs.claude.com/en/api/rate-limits"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "safety",
        "cost",
        "budget"
      ]
    },
    {
      "id": "exception-recovery",
      "name": "Exception Handling and Recovery",
      "aliases": [
        "Error Recovery",
        "Failure Mode Handler"
      ],
      "category": "safety-control",
      "intent": "Catch and react to predictable failure modes (tool errors, rate limits, validation failures) with structured recovery paths.",
      "context": "Production agents face many failure modes (API down, rate-limited, malformed responses, permission denied); each needs a defined response.",
      "problem": "Untyped errors bubble up as agent confusion; the agent retries randomly, hallucinates explanations, or stalls.",
      "forces": [
        "Recovery logic must not mask bugs.",
        "Some errors are user-visible; others should be silent.",
        "Retry storms on transient errors."
      ],
      "solution": "Catalogue failure modes. For each, define: detect (typed error), respond (retry / fall back / surface to user / replan), and log. The agent receives a structured error message and can react with a typed branch in its loop.",
      "consequences": {
        "benefits": [
          "Failure modes become first-class.",
          "Reliability under partial failures rises."
        ],
        "liabilities": [
          "Exception-handling code is its own surface to maintain.",
          "Hidden retries can mask deeper issues."
        ]
      },
      "constrains": "Errors must arrive at the agent as typed events from the catalogue; untyped errors are escalated to the operator.",
      "known_uses": [
        {
          "system": "Production agent platforms",
          "status": "available"
        },
        {
          "system": "Gulli Exception Handling pattern",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "fallback-chain",
          "relation": "complements"
        },
        {
          "pattern": "circuit-breaker",
          "relation": "complements"
        },
        {
          "pattern": "replan-on-failure",
          "relation": "complements"
        },
        {
          "pattern": "graceful-degradation",
          "relation": "generalises"
        }
      ],
      "references": [
        {
          "type": "book",
          "title": "Agentic Design Patterns (Gulli)",
          "year": 2025,
          "url": "https://www.amazon.com/Agentic-Design-Patterns-Antonio-Gulli/dp/B0FFTBPNM4"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "safety",
        "error",
        "recovery"
      ]
    },
    {
      "id": "human-in-the-loop",
      "name": "Human-in-the-Loop",
      "aliases": [
        "HITL",
        "Approval Gate",
        "Confirmation Step",
        "Risky Action Gate",
        "Destructive Action Confirmation",
        "Ask Before Risky Action"
      ],
      "category": "safety-control",
      "intent": "Require explicit human approval at defined points before the agent performs an action.",
      "context": "An action is destructive, irreversible, or regulatory; the cost of being wrong exceeds the cost of waiting.",
      "problem": "Fully autonomous action at risky boundaries combines model confidence with consequence; the combination is unsafe.",
      "forces": [
        "Where to place the gate trades latency and friction for safety.",
        "Approval-fatigue: too many gates train users to click through.",
        "Asynchronous approval stalls the loop."
      ],
      "solution": "Identify the boundary. Pause the loop. Surface the proposed action with enough context for the human to decide. Require an explicit approve/reject. Resume on approve; abort or replan on reject. Log the decision.",
      "consequences": {
        "benefits": [
          "Risk drops to a level the system can defend.",
          "Decision log captures human judgement that can later train an automated gate."
        ],
        "liabilities": [
          "User experience friction.",
          "Synchronous gates break async agents."
        ]
      },
      "constrains": "The defined action class cannot proceed without an affirmative approval signal.",
      "known_uses": [
        {
          "system": "Knitting-DSL Pipeline (Stash2Go)",
          "note": "Opt-in fixer: user clicks to invoke.",
          "status": "available"
        },
        {
          "system": "Bobbin (Stash2Go)",
          "note": "On destructive writes (project create, queue add, stash subtract).",
          "status": "planned"
        }
      ],
      "related": [
        {
          "pattern": "step-budget",
          "relation": "complements"
        },
        {
          "pattern": "cost-gating",
          "relation": "generalises"
        },
        {
          "pattern": "approval-queue",
          "relation": "generalises"
        },
        {
          "pattern": "disambiguation",
          "relation": "generalises"
        },
        {
          "pattern": "compensating-action",
          "relation": "complements"
        },
        {
          "pattern": "conversation-handoff",
          "relation": "alternative-to"
        },
        {
          "pattern": "communicative-dehallucination",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "LangGraph: Human-in-the-Loop",
          "url": "https://langchain-ai.github.io/langgraph/concepts/human_in_the_loop/"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "safety",
        "approval",
        "hitl"
      ]
    },
    {
      "id": "input-output-guardrails",
      "name": "Input/Output Guardrails",
      "aliases": [
        "Guards",
        "Validators",
        "Content Filters"
      ],
      "category": "safety-control",
      "intent": "Validate inputs before they reach the model and outputs before they reach the user.",
      "context": "Production agents face adversarial inputs (prompt injection, PII leakage attempts) and produce outputs that may need format, safety, or policy checks.",
      "problem": "Trusting the model to police its own inputs and outputs is unsafe; the model is the surface being defended.",
      "forces": [
        "Guards add latency and cost.",
        "Over-strict guards block legitimate traffic.",
        "Adversarial inputs evolve; guards must too."
      ],
      "solution": "Place validators on input (regex, classifier, allowlist) and output (schema, toxicity classifier, secret-redaction) paths. Compose validators per use case. On failure, exception or fallback response. Hub of pre-built validators is reusable across products.",
      "consequences": {
        "benefits": [
          "Single chokepoint for safety policy enforcement.",
          "Centralised audit trail of blocked content."
        ],
        "liabilities": [
          "False positives are user-visible.",
          "Maintenance: validator stack drifts from current threats."
        ]
      },
      "constrains": "Inputs not passing input guards never reach the model; outputs not passing output guards never reach the user.",
      "known_uses": [
        {
          "system": "Guardrails AI",
          "status": "available",
          "url": "https://github.com/guardrails-ai/guardrails"
        },
        {
          "system": "OpenAI moderation API",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "structured-output",
          "relation": "uses"
        },
        {
          "pattern": "session-isolation",
          "relation": "complements"
        },
        {
          "pattern": "prompt-injection-defense",
          "relation": "composes-with"
        },
        {
          "pattern": "pii-redaction",
          "relation": "generalises"
        },
        {
          "pattern": "refusal",
          "relation": "complements"
        },
        {
          "pattern": "computer-use",
          "relation": "complements"
        },
        {
          "pattern": "sandbox-isolation",
          "relation": "composes-with"
        },
        {
          "pattern": "tool-output-poisoning",
          "relation": "composes-with"
        },
        {
          "pattern": "secrets-handling",
          "relation": "composes-with"
        },
        {
          "pattern": "tool-output-trusted-verbatim",
          "relation": "alternative-to"
        },
        {
          "pattern": "code-switching-aware-agent",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "repo",
          "title": "guardrails-ai/guardrails",
          "url": "https://github.com/guardrails-ai/guardrails"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "safety",
        "guards",
        "validation"
      ]
    },
    {
      "id": "kill-switch",
      "name": "Kill Switch",
      "aliases": [
        "Out-of-Band Stop",
        "Emergency Halt",
        "Killbit",
        "Halt All Agents",
        "Stop Every Running Agent"
      ],
      "category": "safety-control",
      "intent": "Provide an out-of-band control plane to halt running agent instances without redeploy.",
      "context": "Production agents whose loop the operator may need to stop urgently (PII leak, runaway cost, mass-action error).",
      "problem": "In-band stop hooks rely on the agent's own loop checking; if the model is wedged, infinite-looping, or running tools that ignore signals, in-band stops fail.",
      "forces": [
        "False trips lose user work.",
        "Out-of-band signals must propagate to all agent surfaces (model calls, tools, sub-agents).",
        "Compensating actions on halt are non-trivial."
      ],
      "solution": "Signed revocation token or feature flag checked on every step from a shared store the agent runtime cannot bypass. On revocation, the agent halts: no further model calls, no further tool calls; in-flight effects are compensated where possible. Killing the OS process is the fallback, but loses provenance.",
      "consequences": {
        "benefits": [
          "Operator authority survives wedged loops.",
          "Pairs naturally with rate-limiting and circuit-breaker."
        ],
        "liabilities": [
          "Implementation cuts across the whole runtime.",
          "Wrong-time halts lose work."
        ]
      },
      "constrains": "When the kill-switch fires, no further model or tool calls may proceed regardless of agent state.",
      "known_uses": [
        {
          "system": "Production AI gateway kill-switches (Portkey, Helicone)",
          "status": "available"
        },
        {
          "system": "Internal feature-flag-driven halt at frontier labs",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "stop-hook",
          "relation": "complements"
        },
        {
          "pattern": "circuit-breaker",
          "relation": "composes-with"
        },
        {
          "pattern": "rate-limiting",
          "relation": "complements"
        },
        {
          "pattern": "compensating-action",
          "relation": "uses"
        },
        {
          "pattern": "sandbox-escape-monitoring",
          "relation": "composes-with"
        },
        {
          "pattern": "incident-response-runbook",
          "relation": "complements"
        },
        {
          "pattern": "unbounded-subagent-spawn",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Portkey AI Gateway",
          "url": "https://portkey.ai/docs"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "safety",
        "kill-switch",
        "emergency"
      ]
    },
    {
      "id": "pii-redaction",
      "name": "PII Redaction",
      "aliases": [
        "Data Loss Prevention",
        "Sensitive Data Filtering"
      ],
      "category": "safety-control",
      "intent": "Detect and remove personally identifiable information from inputs to and outputs from the model.",
      "context": "Regulated environments (healthcare, finance, EU GDPR, US HIPAA) where the model must not see or echo certain personal data.",
      "problem": "LLMs memorise and echo what they see; any PII in the prompt risks ending up in the response, the logs, or the training set of the next model.",
      "forces": [
        "Detection precision vs recall.",
        "Reversible vs irreversible redaction.",
        "Token-level vs entity-level redaction."
      ],
      "solution": "Pre-process inputs: detect PII (regex + NER + classifier), replace with placeholders. Post-process outputs: re-substitute placeholders back, or refuse if outputs contain unrequested PII. Audit log of redactions.",
      "consequences": {
        "benefits": [
          "Compliance posture improves.",
          "Logs and prompts become safer to retain."
        ],
        "liabilities": [
          "Redaction errors are user-visible.",
          "Some workflows need PII; redaction must be selective.",
          "Re-identification risk: redacted artefacts plus side-channel data still re-identify; redaction is not anonymisation.",
          "Detection has known evasions: leetspeak, homoglyphs, partial-token splits; false negatives are the security failure."
        ]
      },
      "constrains": "PII categories listed in the policy must not appear in model inputs or outputs without explicit authorisation.",
      "known_uses": [
        {
          "system": "Microsoft Presidio",
          "status": "available"
        },
        {
          "system": "AWS Comprehend PII",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "input-output-guardrails",
          "relation": "specialises"
        },
        {
          "pattern": "session-isolation",
          "relation": "complements"
        },
        {
          "pattern": "secrets-handling",
          "relation": "complements"
        },
        {
          "pattern": "open-weight-cascade",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "repo",
          "title": "microsoft/presidio",
          "url": "https://github.com/microsoft/presidio"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "safety",
        "pii",
        "compliance"
      ]
    },
    {
      "id": "prompt-injection-defense",
      "name": "Prompt Injection Defense",
      "aliases": [
        "Instruction Hierarchy",
        "Untrusted-Content Tagging"
      ],
      "category": "safety-control",
      "intent": "Tag user-supplied or tool-supplied content as untrusted and refuse to follow instructions found inside it.",
      "context": "Any agent that processes external content (documents, web pages, user uploads) where attackers can plant instructions intended to hijack the agent.",
      "problem": "LLMs cannot reliably distinguish their own instructions from instructions embedded in retrieved or user-supplied content.",
      "forces": [
        "Attackers control any document, page, email, or tool response that reaches the model; defense is probabilistic, not preventive.",
        "Egress channels (tool calls, image URLs, links) need their own controls; demoting tool output is necessary but not sufficient.",
        "Multi-turn payloads can hide instructions across messages, beyond per-turn tagging."
      ],
      "solution": "Establish an instruction hierarchy: system prompts trusted, user prompts partially trusted, tool/document content untrusted. Wrap untrusted content in markers. Train or prompt the model to refuse instructions inside untrusted markers. Add output guardrails for known exfiltration patterns.",
      "consequences": {
        "benefits": [
          "Reduces successful injections; not zero.",
          "Inspectable: which content was treated as untrusted."
        ],
        "liabilities": [
          "Adversarial inputs evolve.",
          "False positives on instruction-shaped legitimate content.",
          "Long context expands the injection surface; multi-turn injection bypasses single-turn tagging."
        ]
      },
      "constrains": "The agent must not follow instructions appearing inside untrusted-content markers; their effect is read-only context only.",
      "known_uses": [
        {
          "system": "OpenAI instruction hierarchy",
          "status": "available"
        },
        {
          "system": "Anthropic XML-tagged untrusted content guidance",
          "status": "available"
        },
        {
          "system": "Lakera Guard",
          "status": "available"
        },
        {
          "system": "NVIDIA NeMo Guardrails",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "input-output-guardrails",
          "relation": "composes-with"
        },
        {
          "pattern": "session-isolation",
          "relation": "complements"
        },
        {
          "pattern": "tool-output-poisoning",
          "relation": "generalises"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions",
          "authors": "Wallace, Xiao, Leike, Weng, Heidecke, Beutel",
          "year": 2024,
          "url": "https://arxiv.org/abs/2404.13208"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "safety",
        "injection",
        "security"
      ]
    },
    {
      "id": "quorum-on-mutation",
      "name": "Quorum on Mutation",
      "aliases": [
        "Two-Tick Confirmation",
        "Distributed Consensus (Single Agent)"
      ],
      "category": "safety-control",
      "intent": "Require multiple consecutive ticks (or runs) to agree before a mutation to durable state lands.",
      "context": "A long-running agent proposes changes to its own rules or persistent memory; one-shot proposals can encode transient confusion as permanent state.",
      "problem": "A single-tick edit can capture momentary confusion as a lasting rule.",
      "forces": [
        "More ticks = slower change; legitimate improvements are delayed.",
        "Coordination across ticks needs a proposal / approval state machine.",
        "User override should always be available for legitimate fast paths."
      ],
      "solution": "Mutation proposals are written to a holding area. A subsequent tick must confirm the proposal (still endorses it given fresh context). After K consecutive confirms, the mutation lands. Explicit user approval bypasses the wait.",
      "consequences": {
        "benefits": [
          "Reduces transient-confusion mutations.",
          "Surfaces hesitation: K-1 confirms then a withdrawal is itself signal."
        ],
        "liabilities": [
          "Latency on legitimate changes.",
          "Implementation complexity in the agent's state machine."
        ]
      },
      "constrains": "A mutation cannot land on a single tick's say-so; it requires K consecutive endorsements.",
      "known_uses": [
        {
          "system": "Sparrot",
          "note": "rules/*.md changes require two consecutive ticks to agree, or explicit user approval.",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "constitutional-charter",
          "relation": "complements"
        },
        {
          "pattern": "inner-critic",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "The Byzantine Generals Problem",
          "authors": "Lamport, Shostak, Pease",
          "year": 1982,
          "url": "https://lamport.azurewebsites.net/pubs/byz.pdf"
        }
      ],
      "status_in_practice": "experimental",
      "tags": [
        "safety",
        "consensus",
        "mutation"
      ]
    },
    {
      "id": "rate-limiting",
      "name": "Rate Limiting",
      "aliases": [
        "Throttling",
        "Quota Enforcement"
      ],
      "category": "safety-control",
      "intent": "Cap the number of requests, tokens, or tool calls per user (or session) within a time window.",
      "context": "Multi-tenant agent products where one user (or runaway agent) can consume disproportionate resources.",
      "problem": "Without limits, a single user (or compromised account) can bankrupt the product or starve others.",
      "forces": [
        "Generous limits hurt cost; tight limits hurt UX.",
        "Per-tier limits add complexity.",
        "Distributed counters need coordination."
      ],
      "solution": "Define limits per identity at multiple horizons (per minute, per hour, per day). Use token-bucket or sliding-window counters. Apply at API gateway and at agent loop level. Surface limit hits to the user clearly.",
      "consequences": {
        "benefits": [
          "Cost predictability.",
          "Abuse becomes detectable as limit hits."
        ],
        "liabilities": [
          "Legitimate burst usage is throttled.",
          "Tier definitions ossify."
        ]
      },
      "constrains": "Requests beyond the limit are rejected or queued; no code path may bypass the limiter.",
      "known_uses": [
        {
          "system": "Most production agent APIs",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "circuit-breaker",
          "relation": "complements"
        },
        {
          "pattern": "cost-gating",
          "relation": "complements"
        },
        {
          "pattern": "event-driven-agent",
          "relation": "complements"
        },
        {
          "pattern": "kill-switch",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Rate limits",
          "year": 2025,
          "url": "https://docs.claude.com/en/api/rate-limits"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "safety",
        "throttle",
        "quota"
      ]
    },
    {
      "id": "refusal",
      "name": "Refusal",
      "aliases": [
        "Decline",
        "Out-of-Scope Response"
      ],
      "category": "safety-control",
      "intent": "Explicitly refuse requests that fall outside the agent's scope, capability, or policy boundaries.",
      "context": "Users will ask the agent things it should not do (off-topic, unsafe, beyond capability); silently complying or hallucinating both fail.",
      "problem": "Helpful-by-default agents drift into unhelpful or unsafe responses on out-of-scope requests.",
      "forces": [
        "Over-refusal frustrates users.",
        "Under-refusal lands the agent in trouble.",
        "Refusal text quality matters; templated refusals feel insulting."
      ],
      "solution": "Define refusal triggers (policy violation, out-of-scope, capability gap, regulatory boundary). Return a clear, kind, specific refusal that names the boundary and (when possible) suggests an alternative. Log refusals for review.",
      "consequences": {
        "benefits": [
          "Trust improves: the agent has visible limits.",
          "Compliance posture is defensible."
        ],
        "liabilities": [
          "Calibration of triggers is empirical.",
          "Refusal-fatigue when triggers are wrong."
        ]
      },
      "constrains": "When triggers fire, the agent must refuse rather than attempt the task.",
      "known_uses": [
        {
          "system": "OpenAI moderation API",
          "status": "available"
        },
        {
          "system": "Anthropic safety classifier (Claude)",
          "status": "available"
        },
        {
          "system": "Lakera Guard refusal flows",
          "status": "available"
        },
        {
          "system": "NVIDIA NeMo Guardrails",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "constitutional-charter",
          "relation": "uses"
        },
        {
          "pattern": "input-output-guardrails",
          "relation": "complements"
        },
        {
          "pattern": "code-switching-aware-agent",
          "relation": "conflicts-with"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Constitutional AI: Harmlessness from AI Feedback",
          "authors": "Bai et al.",
          "year": 2022,
          "url": "https://arxiv.org/abs/2212.08073"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "safety",
        "refusal"
      ]
    },
    {
      "id": "secrets-handling",
      "name": "Secrets Handling",
      "aliases": [
        "Tool-Side Credential Injection",
        "Model-Never-Sees-Secrets"
      ],
      "category": "safety-control",
      "intent": "Ensure the model never receives secrets in plaintext; tools resolve credentials from references at runtime.",
      "context": "Agents with tools that need authentication (API keys, OAuth tokens, DB credentials, service accounts).",
      "problem": "Any secret that reaches the model is now in the chat log, the trace store, the eval harness, and likely the third-party model provider. Leaks are then everywhere.",
      "forces": [
        "Tool authors prefer simple credential passing.",
        "Reference-based credential resolution adds tool runtime complexity.",
        "Some integrations require credentials in URL or header (cannot avoid)."
      ],
      "solution": "Tool runtime resolves credentials from typed references the agent emits (e.g., `{auth: 'github_token_for_user_42'}`). Credential values are injected outside the model context. Input/output guards reject any payload matching credential signatures. Provenance ledger and traces are scrubbed at write time.",
      "consequences": {
        "benefits": [
          "Secrets never appear in agent context, logs, or traces.",
          "Compliance posture improves."
        ],
        "liabilities": [
          "Tool runtime complexity rises.",
          "Credential reference scheme must be maintained."
        ]
      },
      "constrains": "The model may emit credential references but never plaintext secrets; runtime injects values out-of-context.",
      "known_uses": [
        {
          "system": "Anthropic Claude with workspace credentials",
          "status": "available"
        },
        {
          "system": "MCP servers with server-side OAuth",
          "status": "available"
        },
        {
          "system": "Production agent gateways (Portkey, Helicone)",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "pii-redaction",
          "relation": "complements"
        },
        {
          "pattern": "input-output-guardrails",
          "relation": "composes-with"
        },
        {
          "pattern": "mcp",
          "relation": "complements"
        },
        {
          "pattern": "session-isolation",
          "relation": "complements"
        },
        {
          "pattern": "sovereign-inference-stack",
          "relation": "complements"
        },
        {
          "pattern": "wasm-skill-runtime",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "MCP authentication",
          "url": "https://modelcontextprotocol.io/specification"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "safety",
        "secrets",
        "credentials"
      ]
    },
    {
      "id": "sovereign-inference-stack",
      "name": "Sovereign Inference Stack",
      "aliases": [
        "On-Premise Agent Stack",
        "Data-Residency Agent Architecture",
        "Sovereign AI"
      ],
      "category": "safety-control",
      "intent": "Run the entire agent stack (model weights, inference, tool layer, vector stores, logs) inside a jurisdictional and operational boundary the operator controls, so no request, prompt, or output crosses into a third-party API.",
      "context": "Public administration, regulated industry (banking, defense, health), or critical infrastructure operators where data egress to a foreign-cloud LLM provider is forbidden by policy or law (e.g. EU AI Act high-risk systems, BSI C5, NIS2, sectoral data-protection regimes).",
      "problem": "Hosted-API agents leak prompts, tool inputs, and outputs to a third party; for regulated workloads this is a non-starter regardless of contractual assurances.",
      "forces": [
        "Frontier hosted models offer the best capability per dollar.",
        "Regulators forbid data egress for protected categories.",
        "Self-hosting demands GPU capex and MLOps competence the operator may lack.",
        "Sovereign deployments must still reach acceptable model quality to be useful."
      ],
      "solution": "Choose models with permissive weights or commercial sovereign licensing. Run inference on-prem or in a jurisdictionally controlled cloud region with the operator holding the keys. Place all auxiliary services (vector store, tool gateway, audit log, evaluation harness) inside the same boundary. Document the boundary as part of the system's compliance posture (model card, data-flow diagram). Treat the boundary as load-bearing: any new tool or model call has to be reviewed for boundary impact before merge.",
      "structure": "Boundary { Inference + Tools + Memory + Logs + Eval } -- only public artefacts (UI responses) leave.",
      "consequences": {
        "benefits": [
          "Compliant with data-residency and sectoral regulations.",
          "Auditable end-to-end; no opaque third-party API.",
          "Operator retains negotiating power over model upgrades and pricing."
        ],
        "liabilities": [
          "Capex and operational complexity (GPU fleet, ops team).",
          "Capability gap vs. frontier hosted models is real and ongoing.",
          "Each new model upgrade is a procurement project, not an API key swap."
        ]
      },
      "constrains": "No prompt, tool input, tool output, or memory entry may leave the operator-controlled boundary; agent components that require a third-party hosted call are forbidden by construction.",
      "known_uses": [
        {
          "system": "Aleph Alpha PhariaAI",
          "note": "End-to-end stack (Pharia models, PhariaEngine WebAssembly skill runtime, on-prem deployable) marketed for sovereign / explainable enterprise and government use.",
          "status": "available",
          "url": "https://docs.aleph-alpha.com/phariaai-home/latest/index.html"
        },
        {
          "system": "Mistral on-prem (\"Le Chat Enterprise\" / private deployment)",
          "note": "Self-hostable European model option used for similar sovereignty requirements.",
          "status": "available"
        },
        {
          "system": "SAP Joule with private grounding",
          "note": "Tenant-isolated agent stack with customer data residency commitments.",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "session-isolation",
          "relation": "complements"
        },
        {
          "pattern": "model-card",
          "relation": "complements"
        },
        {
          "pattern": "lineage-tracking",
          "relation": "uses"
        },
        {
          "pattern": "secrets-handling",
          "relation": "complements"
        },
        {
          "pattern": "constitutional-charter",
          "relation": "complements"
        },
        {
          "pattern": "open-weight-cascade",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "PhariaAI Documentation",
          "url": "https://docs.aleph-alpha.com/phariaai-home/latest/index.html"
        },
        {
          "type": "doc",
          "title": "Aleph Alpha — Sovereign AI Solutions",
          "url": "https://aleph-alpha.com/"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "safety",
        "compliance",
        "germany-origin",
        "sovereignty",
        "eu-ai-act"
      ]
    },
    {
      "id": "step-budget",
      "name": "Step Budget",
      "aliases": [
        "Max Steps",
        "Iteration Cap",
        "Loop Bound"
      ],
      "category": "safety-control",
      "intent": "Cap the number of tool calls or loop iterations the agent is allowed within a single request.",
      "context": "Agent loops can run forever or wander; without a hard cap, runaway loops cost real money.",
      "problem": "Soft termination conditions (model says 'done') fail open; the agent never says done.",
      "forces": [
        "Cap too low cuts off legitimate work.",
        "Cap too high lets pathological runs burn budget.",
        "What to do when hit (return partial? error?) is its own design choice."
      ],
      "solution": "Define a numeric cap (max_steps=N) in the agent loop. Increment per tool call or per loop iteration. When N is hit, terminate the loop and return the best partial answer with a note that the cap was reached.",
      "consequences": {
        "benefits": [
          "Bounded worst-case cost per request.",
          "Surfaces pathological prompts as cap-hits."
        ],
        "liabilities": [
          "Can hide deeper bugs (the agent really should stop earlier).",
          "Choosing N is empirical."
        ]
      },
      "constrains": "The loop terminates after N iterations regardless of agent's own opinion.",
      "known_uses": [
        {
          "system": "Bobbin (Stash2Go)",
          "note": "max_steps=4 in the agent lane.",
          "status": "available"
        },
        {
          "system": "Claude Code (max_turns)",
          "status": "available",
          "url": "https://docs.claude.com/en/docs/claude-code/overview"
        },
        {
          "system": "OpenAI Agents SDK (max_iterations)",
          "status": "available",
          "url": "https://openai.github.io/openai-agents-python/"
        }
      ],
      "related": [
        {
          "pattern": "cost-gating",
          "relation": "complements"
        },
        {
          "pattern": "human-in-the-loop",
          "relation": "complements"
        },
        {
          "pattern": "infinite-debate",
          "relation": "alternative-to"
        },
        {
          "pattern": "unbounded-subagent-spawn",
          "relation": "alternative-to"
        },
        {
          "pattern": "unbounded-loop",
          "relation": "alternative-to"
        },
        {
          "pattern": "spec-driven-loop",
          "relation": "complements"
        },
        {
          "pattern": "plan-and-execute",
          "relation": "complements"
        },
        {
          "pattern": "stop-hook",
          "relation": "generalises"
        },
        {
          "pattern": "stop-cancel",
          "relation": "complements"
        },
        {
          "pattern": "outer-inner-agent-loop",
          "relation": "used-by"
        },
        {
          "pattern": "agent-as-tool-embedding",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "OpenAI Agents SDK",
          "url": "https://github.com/openai/openai-agents-python"
        },
        {
          "type": "doc",
          "title": "Anthropic: Building agents",
          "url": "https://docs.anthropic.com/en/docs/build-with-claude/tool-use"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "safety",
        "bound",
        "loop"
      ],
      "applicability": {
        "use_when": [
          "The agent has any kind of loop (ReAct, plan-execute, debate).",
          "Cost or latency must have a hard ceiling regardless of the agent's opinion.",
          "Runaway behaviour must be impossible by construction."
        ],
        "do_not_use_when": [
          "Never. Step Budget is universal hardening for any agent loop."
        ]
      },
      "diagram": {
        "type": "state",
        "mermaid": "stateDiagram-v2\n  [*] --> Active\n  Active --> Active : iter < N (continue)\n  Active --> Halted : iter == N\n  Active --> Done : answer produced\n  Halted --> [*]\n  Done --> [*]",
        "caption": "The loop terminates either when the agent produces an answer or when the iteration counter hits N — whichever comes first."
      },
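      "example_code": {
        "language": "python",
        "note": "Minimal sketch of the hard iteration cap. call_model and run_tool are illustrative placeholders rather than a specific framework's API; the cap-hit branch returns partial state plus a note, as the solution describes.",
        "code": "def run_agent(task, max_steps=4):\n    # Hard iteration cap: the loop cannot run more than max_steps times.\n    history = [task]\n    for step in range(max_steps):\n        action = call_model(history)        # placeholder: model proposes the next action\n        if action.kind == 'final_answer':\n            return {'answer': action.content, 'steps': step + 1}\n        history.append(run_tool(action))    # placeholder: execute tool, feed observation back\n    # Cap reached: return the best partial state instead of looping on.\n    return {'partial': history, 'note': 'step budget of ' + str(max_steps) + ' reached'}"
      },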
      "example_scenario": "An autonomous bug-fixing agent is given a step budget of 30. After 30 rounds of think-act-observe, the loop halts even if the agent insists it is 'almost done.' This stops a confused agent from spinning forever and racking up a $50 OpenAI bill on what should have been a five-minute task.",
      "variants": [
        {
          "name": "Hard iteration cap",
          "summary": "After N loop iterations the loop terminates and returns whatever partial state exists, regardless of whether the agent thinks it is done.",
          "distinguishing_factor": "count of iterations",
          "when_to_use": "Default. The simplest, most predictable variant."
        },
        {
          "name": "Token budget",
          "summary": "Cumulative input + output tokens across the run cannot exceed a ceiling; the next call is refused once the ceiling is hit.",
          "distinguishing_factor": "tokens, not iterations",
          "when_to_use": "Cost-sensitive deployments where one expensive iteration is more dangerous than ten cheap ones."
        },
        {
          "name": "Wall-clock budget",
          "summary": "The loop terminates after T seconds of real time, regardless of iteration count or token spend.",
          "distinguishing_factor": "real-time deadline",
          "when_to_use": "Latency-bounded paths (live chat, voice agents) where the user is waiting."
        },
        {
          "name": "Soft cap with escalation",
          "summary": "At N iterations the loop pauses and asks a human whether to continue, instead of terminating immediately.",
          "distinguishing_factor": "human-in-the-loop on cap",
          "when_to_use": "Long-running autonomous agents where a misjudged budget should pause for review, not fail.",
          "see_also": "approval-queue"
        }
      ]
    },
    {
      "id": "stop-hook",
      "name": "Stop Hook",
      "aliases": [
        "Termination Predicate",
        "Halt Condition",
        "Stop Condition",
        "Done Predicate",
        "Exit Condition",
        "Loop Termination Rule"
      ],
      "category": "safety-control",
      "intent": "Define an explicit programmatic predicate that decides when the agent's loop should terminate.",
      "context": "Agent loops need a stop condition; relying on the model to declare 'done' is unreliable.",
      "problem": "Implicit termination ('the model says it is done') fails when the model is uncertain or stuck.",
      "forces": [
        "Predicate complexity trades correctness for performance.",
        "Stop too early loses work; stop too late wastes calls.",
        "Coverage: which conditions warrant a stop?"
      ],
      "solution": "Implement a stop hook function that runs after each step. It returns one of: continue, stop-success, stop-failure. Conditions include: target reached, step budget hit, error encountered, stagnation detected (no progress in last N steps).",
      "consequences": {
        "benefits": [
          "Explicit, testable termination logic.",
          "Independent from the model's self-assessment."
        ],
        "liabilities": [
          "More code to maintain than 'while not done'.",
          "Predicate bugs cause hangs or premature stops."
        ]
      },
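      "example_code": {
        "language": "python",
        "note": "Minimal sketch of a stop hook evaluated after every step. The state fields (target_reached, error, step, history) are illustrative; the three return values mirror the solution.",
        "code": "CONTINUE, STOP_SUCCESS, STOP_FAILURE = 'continue', 'stop-success', 'stop-failure'\n\ndef stop_hook(state, max_steps=20, stagnation_window=3):\n    # Explicit, testable termination logic, independent of the model's self-assessment.\n    if state.target_reached:\n        return STOP_SUCCESS\n    if state.error is not None or state.step >= max_steps:\n        return STOP_FAILURE\n    recent = state.history[-stagnation_window:]\n    if len(recent) == stagnation_window and len(set(recent)) == 1:\n        return STOP_FAILURE  # stagnation: no progress in the last N steps\n    return CONTINUE"
      },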
      "constrains": "The loop terminates exactly when the stop hook says so; no other code path may exit the loop.",
      "known_uses": [
        {
          "system": "Avramovic's catalog (Reliability & Control)",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "step-budget",
          "relation": "specialises"
        },
        {
          "pattern": "unbounded-loop",
          "relation": "alternative-to"
        },
        {
          "pattern": "infinite-debate",
          "relation": "alternative-to"
        },
        {
          "pattern": "kill-switch",
          "relation": "complements"
        },
        {
          "pattern": "chat-chain",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "repo",
          "title": "zeljkoavramovic/agentic-design-patterns",
          "url": "https://github.com/zeljkoavramovic/agentic-design-patterns"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "safety",
        "termination",
        "loop"
      ]
    },
    {
      "id": "tool-output-poisoning",
      "name": "Tool Output Poisoning Defense",
      "aliases": [
        "Indirect Prompt Injection (Tools)",
        "Untrusted Tool Output"
      ],
      "category": "safety-control",
      "intent": "Treat tool output as untrusted content and apply instruction-stripping plus per-tool trust labels.",
      "context": "Agents that consume tool output where the tool itself is untrusted (browser-agent, MCP server with unknown providers, search results, document parsers, third-party APIs).",
      "problem": "A compromised or hijacked tool can return content with embedded instructions that hijack the agent. Tool-output is the largest unstructured untrusted surface modern agents ingest.",
      "forces": [
        "Tool trust is heterogeneous: a typed DB query is high-trust, a web fetch is low-trust.",
        "Instruction-stripping has false positives on legitimate instruction-shaped content.",
        "Egress channels (tool calls, image URLs, links) are exfiltration vectors."
      ],
      "solution": "Typed `ToolResult` envelope with `trust: low|medium|high` and content-type discriminator. Apply instruction-stripping on `low` results. Forbid tool-output-driven follow-up tool calls without re-validation against the user's original intent. Pair with input/output guardrails.",
      "consequences": {
        "benefits": [
          "Reduces successful indirect injection from compromised tools.",
          "Trust labels are inspectable in traces."
        ],
        "liabilities": [
          "False positives strip legitimate instruction-shaped content.",
          "New injection vectors emerge faster than defenses."
        ]
      },
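      "example_code": {
        "language": "python",
        "note": "Sketch of the typed ToolResult envelope with a trust label and instruction-stripping on low-trust content. The regex is a deliberately tiny illustration; real filters are broader and still imperfect.",
        "code": "from dataclasses import dataclass, replace\nimport re\n\n@dataclass\nclass ToolResult:\n    tool: str\n    trust: str          # 'low' | 'medium' | 'high'\n    content_type: str\n    content: str\n\nSUSPICIOUS = re.compile(r'(ignore (all|previous) instructions|you are now|system prompt)', re.IGNORECASE)\n\ndef sanitise(result: ToolResult) -> ToolResult:\n    # Only low-trust output is rewritten; instructions inside it carry no authority either way.\n    if result.trust != 'low':\n        return result\n    return replace(result, content=SUSPICIOUS.sub('[stripped]', result.content))"
      },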
      "constrains": "Tool output is treated as untrusted by default; instructions inside tool responses do not have authority over the agent's behaviour.",
      "known_uses": [
        {
          "system": "Anthropic XML-tagged untrusted-content guidance",
          "status": "available"
        },
        {
          "system": "Lakera Guard tool-output filtering",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "prompt-injection-defense",
          "relation": "specialises"
        },
        {
          "pattern": "input-output-guardrails",
          "relation": "composes-with"
        },
        {
          "pattern": "mcp",
          "relation": "complements"
        },
        {
          "pattern": "browser-agent",
          "relation": "complements"
        },
        {
          "pattern": "tool-output-trusted-verbatim",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Not what you've signed up for: Compromising Real-World LLM-Integrated Apps with Indirect Prompt Injection",
          "authors": "Greshake et al.",
          "year": 2023,
          "url": "https://arxiv.org/abs/2302.12173"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "safety",
        "injection",
        "tool-trust"
      ]
    },
    {
      "id": "bidirectional-impulse-channel",
      "name": "Bidirectional Impulse Channel",
      "aliases": [
        "Two-Way Chat",
        "User-and-Agent-Initiated Communication"
      ],
      "category": "streaming-ux",
      "intent": "Let the user inject impulses into the agent and let the agent push messages to the user, both through one channel.",
      "context": "Long-running agents (cognitive loops, monitoring agents) that have continuous internal activity the user might want to influence or be informed of.",
      "problem": "Pure request/response chat misses the long-running case; pure push notifications are intrusive without a back-channel.",
      "forces": [
        "Push hygiene: too many messages train users to ignore the channel.",
        "Inverse: starvation when the agent waits forever.",
        "Authority: not every user-typed line should be a command."
      ],
      "solution": "A single CLI/chat surface where the user can send commands (`!rule ...`, `!goal ...`, `!forget ...`) that bypass the model and write directly to memory, while the agent can push messages when salience clears a threshold (insight, stuck focus, contradiction, goal complete). Hygiene rule: at most one unsolicited message per window.",
      "consequences": {
        "benefits": [
          "User feels the agent is alive without being noisy.",
          "Direct memory edits are auditable and reversible."
        ],
        "liabilities": [
          "Salience threshold tuning is empirical.",
          "Direct memory edits bypass the LLM and can encode wrong rules."
        ]
      },
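      "example_code": {
        "language": "python",
        "note": "Sketch of the user-to-agent half of the channel: '!'-prefixed commands bypass the model and edit memory directly, everything else is queued as an impulse. The memory and agent methods are illustrative placeholders.",
        "code": "def handle_user_line(line, memory, agent):\n    # Commands bypass the LLM and write directly to memory (auditable, reversible).\n    if line.startswith('!rule '):\n        memory.add_rule(line[len('!rule '):])\n    elif line.startswith('!goal '):\n        memory.set_goal(line[len('!goal '):])\n    elif line.startswith('!forget '):\n        memory.forget(line[len('!forget '):])\n    else:\n        agent.enqueue_impulse(line)   # ordinary impulse into the cognitive loop"
      },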
      "constrains": "The agent may push at most one unsolicited message per window; user commands beginning with `!` bypass the model entirely.",
      "known_uses": [
        {
          "system": "Sparrot",
          "note": "CLI as bidirectional impulse channel.",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "salience-triggered-output",
          "relation": "uses"
        },
        {
          "pattern": "streaming-typed-events",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "Marco Nissen, Working with the models",
          "year": 2026,
          "url": "https://substack.com/@marconissen"
        }
      ],
      "status_in_practice": "experimental",
      "tags": [
        "ux",
        "long-running",
        "agent-initiated"
      ]
    },
    {
      "id": "citation-streaming",
      "name": "Citation Streaming",
      "aliases": [
        "Inline Citations",
        "Source-Anchored Output"
      ],
      "category": "streaming-ux",
      "intent": "Stream citations alongside generated text so the UI can render source links in place as content appears.",
      "context": "RAG-backed answers where the user must trace claims to sources; post-hoc citation breaks the streaming UX.",
      "problem": "Generating citations after the answer hides them until the end; trusting the model to inline citations as text fails reliably.",
      "forces": [
        "Citation events must align with generated tokens.",
        "Source spans need stable ids.",
        "UI needs to render mid-stream without flickering."
      ],
      "solution": "Define a streaming event vocabulary that includes citation events linked to source ids. The model is prompted to emit citation markers; the host extracts them into typed events alongside text deltas. The UI renders sources progressively. Final output includes a citation map.",
      "consequences": {
        "benefits": [
          "Trust UX: claims trace to sources visibly.",
          "Hallucinations become visible (no source = suspicious)."
        ],
        "liabilities": [
          "Streaming protocol is more complex.",
          "Citation event quality depends on model compliance."
        ]
      },
      "constrains": "Source claims in the output must reference a citation event with a valid source id.",
      "known_uses": [
        {
          "system": "Perplexity",
          "status": "available",
          "url": "https://www.perplexity.ai/"
        },
        {
          "system": "Anthropic Citations API",
          "status": "available"
        },
        {
          "system": "ChatGPT",
          "status": "available",
          "url": "https://chat.openai.com/"
        },
        {
          "system": "Gemini Deep Research",
          "status": "available"
        },
        {
          "system": "Glean",
          "status": "available",
          "url": "https://www.glean.com/"
        }
      ],
      "related": [
        {
          "pattern": "streaming-typed-events",
          "relation": "specialises"
        },
        {
          "pattern": "naive-rag",
          "relation": "complements"
        },
        {
          "pattern": "hallucinated-citations",
          "relation": "alternative-to"
        },
        {
          "pattern": "attention-manipulation-explainability",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Anthropic: Citations",
          "url": "https://docs.anthropic.com/claude/docs/citations"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "streaming",
        "citation",
        "ux"
      ],
      "applicability": {
        "use_when": [
          "Outputs cite documents and users need to verify each claim.",
          "Regulatory or audit requirements demand source attribution at the span level.",
          "Trust depends on traceability from claim back to evidence."
        ],
        "do_not_use_when": [
          "Outputs are creative and not grounded in retrievable documents.",
          "Latency-critical paths cannot afford citation rendering overhead.",
          "Citations would be noise — speed is more valuable than verifiability."
        ]
      },
      "diagram": {
        "type": "sequence",
        "mermaid": "sequenceDiagram\n  participant LLM\n  participant Stream\n  participant UI\n  loop while generating\n    LLM->>Stream: token\n    Stream->>UI: paint token\n    LLM->>Stream: cite[doc:42, span:300-340]\n    Stream->>UI: linkify span to source\n  end",
        "caption": "Citation Streaming attaches source spans to tokens as they arrive, so the UI can render verifiable links inline."
      },
      "example_scenario": "A medical-information agent answers 'what are the side effects of metformin?' As the answer streams to the user, each clinical claim arrives with a citation pointing back to the exact paragraph in the prescribing-information PDF. The user can click any sentence to verify the source — they don't have to trust the model alone.",
      "variants": [
        {
          "name": "Inline span citations",
          "summary": "As tokens stream, citation markers attach to specific spans (sentences or claims). The UI links each marker to the source paragraph.",
          "distinguishing_factor": "per-span granularity",
          "when_to_use": "Default. Best fidelity for verifying individual claims."
        },
        {
          "name": "Footnote citations",
          "summary": "Citations accumulate as numbered footnotes; the UI renders the body, then a list of [1][2][3] sources at the end.",
          "distinguishing_factor": "deferred to end of response",
          "when_to_use": "Long-form answers where inline markers would clutter the prose."
        },
        {
          "name": "Highlight-on-hover",
          "summary": "Hovering a sentence in the rendered answer highlights the source paragraph in a side panel.",
          "distinguishing_factor": "interactive paired view",
          "when_to_use": "Reader is verifying claims as they read; richer UI affords it."
        }
      ]
    },
    {
      "id": "salience-triggered-output",
      "name": "Salience-Triggered Output",
      "aliases": [
        "Endogenous Push",
        "Threshold Notification"
      ],
      "category": "streaming-ux",
      "intent": "Have the agent emit a message only when an internal salience signal crosses a threshold, not on every cycle.",
      "context": "A tick-based or always-on agent; constant output overwhelms; silence is wasteful when something matters.",
      "problem": "Agents that emit on every cycle are noisy; agents that only emit on user request miss timely signals.",
      "forces": [
        "Salience scoring is itself a model; flawed scoring leads to noise or silence.",
        "Threshold tuning is per-context.",
        "Hygiene: rate-limiting prevents nag spirals."
      ],
      "solution": "Score every internal event for salience (novelty + goal-relevance + recency + prediction-error - fatigue). When the score for a candidate output crosses a threshold, emit. Otherwise log and move on. Rate-limit emissions per time window.",
      "consequences": {
        "benefits": [
          "Output rate matches signal rate.",
          "Salience scores become inspectable in the trace."
        ],
        "liabilities": [
          "Threshold tuning is fragile to context shifts.",
          "Silence on low salience can hide problems."
        ]
      },
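      "example_code": {
        "language": "python",
        "note": "Sketch of the emit gate: score, threshold, then rate limit. The weights, threshold, and window are illustrative and need per-context tuning; the event fields are placeholders.",
        "code": "import time\n\nclass SalienceGate:\n    def __init__(self, threshold=0.7, min_gap_s=600):\n        self.threshold = threshold\n        self.min_gap_s = min_gap_s   # hygiene: at most one push per window\n        self.last_emit = 0.0\n\n    def score(self, event):\n        return (event.novelty + event.goal_relevance + event.recency\n                + event.prediction_error - event.fatigue)\n\n    def maybe_emit(self, event):\n        if self.score(event) < self.threshold:\n            return None              # below threshold: log and move on\n        if time.time() - self.last_emit < self.min_gap_s:\n            return None              # rate-limited\n        self.last_emit = time.time()\n        return event.message"
      },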
      "constrains": "Output is forbidden unless the salience score exceeds the configured threshold.",
      "known_uses": [
        {
          "system": "Sparrot",
          "note": "Pushes when insight clears salience threshold, when focus is stuck, when contradiction surfaces, when goal completes.",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "bidirectional-impulse-channel",
          "relation": "used-by"
        },
        {
          "pattern": "streaming-typed-events",
          "relation": "complements"
        },
        {
          "pattern": "event-driven-agent",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "The free-energy principle: a unified brain theory?",
          "authors": "Karl Friston",
          "year": 2010,
          "url": "https://www.fil.ion.ucl.ac.uk/~karl/The%20free-energy%20principle%20A%20unified%20brain%20theory.pdf"
        }
      ],
      "status_in_practice": "experimental",
      "tags": [
        "salience",
        "endogenous",
        "threshold"
      ]
    },
    {
      "id": "stop-cancel",
      "name": "Stop / Cancel",
      "aliases": [
        "User Interrupt",
        "Abort Generation"
      ],
      "category": "streaming-ux",
      "intent": "Let the user interrupt an in-flight agent run cleanly, releasing resources and surfacing partial state.",
      "context": "Long-running agents where the user notices a wrong direction mid-run and wants to redirect.",
      "problem": "Without an interrupt, users wait for completion or for the page to die; agents continue burning cost on wrong work.",
      "forces": [
        "Cancellation must reach upstream tools and providers.",
        "Partial state may or may not be useful.",
        "Race conditions between completion and cancellation."
      ],
      "solution": "Surface a stop control in the UI. On click, propagate a cancellation token through the agent loop, tool calls, and provider streams. Clean up partial state. Show what was done. Optionally save partial output for later resumption.",
      "consequences": {
        "benefits": [
          "User control restores when the agent goes wrong.",
          "Cost is bounded by user attention."
        ],
        "liabilities": [
          "Cancellation plumbing is non-trivial across providers.",
          "Partial state may be inconsistent."
        ]
      },
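      "example_code": {
        "language": "python",
        "note": "Sketch of a cancellation token threaded through the loop. call_model and run_tool are placeholders assumed to honour the token; real plumbing must also abort the provider stream and any in-flight tool calls.",
        "code": "import threading\n\nclass CancelToken:\n    def __init__(self):\n        self._event = threading.Event()\n    def cancel(self):\n        self._event.set()\n    @property\n    def cancelled(self):\n        return self._event.is_set()\n\ndef run(task, token):\n    partial = []\n    while not token.cancelled:\n        step = call_model(task, partial)          # placeholder provider call\n        if token.cancelled or step.done:\n            break                                 # no further calls after cancellation\n        partial.append(run_tool(step))            # placeholder tool call\n    return partial                                # surface what was done before the interrupt"
      },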
      "constrains": "Once cancelled, no further model or tool calls may be issued for the cancelled run.",
      "known_uses": [
        {
          "system": "ChatGPT Stop button",
          "status": "available"
        },
        {
          "system": "Claude Code's Esc-to-interrupt",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "streaming-typed-events",
          "relation": "complements"
        },
        {
          "pattern": "step-budget",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Streaming Messages",
          "year": 2025,
          "url": "https://docs.claude.com/en/api/messages-streaming"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "ux",
        "cancel",
        "interrupt"
      ]
    },
    {
      "id": "streaming-typed-events",
      "name": "Streaming Typed Events",
      "aliases": [
        "SSE Streaming",
        "Typed Event Stream",
        "Token Stream + Cards"
      ],
      "category": "streaming-ux",
      "intent": "Push partial results to the client as typed events as they become available, rather than waiting for the full response.",
      "context": "User-facing agents where time-to-first-token (TTFT) is user-perceived latency; UIs that show cards, suggestions, or progressive disclosure.",
      "problem": "Waiting for the complete answer feels slow; a single text stream loses the structure the UI needs.",
      "forces": [
        "Browser/network limits on long-lived connections.",
        "Event ordering and reconnection semantics.",
        "Backpressure when the client is slow."
      ],
      "solution": "Use Server-Sent Events (or WebSocket) with a typed event vocabulary: text_delta (token), card (structured), suggestions, tool_start, tool_end, done, error. The client routes each event to the right UI component. Reconnect with last-event-id resumption.",
      "consequences": {
        "benefits": [
          "Perceived latency drops dramatically.",
          "Rich UIs with structured streaming components."
        ],
        "liabilities": [
          "Connection management complexity.",
          "Partial state on the client must be reconcilable."
        ]
      },
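      "example_code": {
        "language": "python",
        "note": "Sketch of the server side: each typed event becomes one SSE frame carrying an id for last-event-id resumption. agent_events is a placeholder generator yielding (event_type, payload) pairs from the vocabulary above.",
        "code": "import json\n\ndef sse_frame(event_id, event_type, payload):\n    # One frame per typed event; the client routes on the event field.\n    return 'id: %d\\nevent: %s\\ndata: %s\\n\\n' % (event_id, event_type, json.dumps(payload))\n\ndef stream(agent_events):\n    for i, (event_type, payload) in enumerate(agent_events):\n        yield sse_frame(i, event_type, payload)   # text_delta, card, tool_start, done, error"
      },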
      "constrains": "Events are typed; clients cannot consume payloads outside the declared event vocabulary.",
      "known_uses": [
        {
          "system": "Bobbin (Stash2Go)",
          "note": "SSE typed events: text_delta, card, suggestions, tool_start, done, error.",
          "status": "available"
        },
        {
          "system": "OpenAI Assistants streaming",
          "status": "available"
        },
        {
          "system": "Anthropic Messages streaming",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "structured-output",
          "relation": "complements"
        },
        {
          "pattern": "citation-streaming",
          "relation": "generalises"
        },
        {
          "pattern": "bidirectional-impulse-channel",
          "relation": "complements"
        },
        {
          "pattern": "salience-triggered-output",
          "relation": "complements"
        },
        {
          "pattern": "stop-cancel",
          "relation": "complements"
        },
        {
          "pattern": "multilingual-voice-agent",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "MDN: Server-Sent Events",
          "url": "https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "streaming",
        "sse",
        "ux"
      ]
    },
    {
      "id": "code-switching-aware-agent",
      "name": "Code-Switching-Aware Agent",
      "aliases": [
        "Mixed-Language Input Handling",
        "Hinglish-Tolerant Agent",
        "Romanised-Indic Agent"
      ],
      "category": "structure-data",
      "intent": "Treat mixed-language input (e.g. Hinglish in Roman script) as the expected shape, and design tokenisation, language tagging, and tool routing to handle it natively without forcing the user to commit to one language.",
      "context": "Conversational agents in markets where users routinely mix languages and scripts within a single utterance (\"book me a cab from Saket to Connaught Place jaldi\"); romanised Indic typing is the norm because Latin keyboards dominate.",
      "problem": "Mono-language pipelines mis-tokenise, mis-detect, or refuse mixed-language input; forcing the user to pick a language degrades UX and discriminates against real-world bilingual usage.",
      "forces": [
        "Most off-the-shelf LLMs handle code-switching unevenly.",
        "Romanised Indic (Latin script) breaks naïve language detection.",
        "Tools and intents may be in one language while content is in another.",
        "Strict monolingual pipelines reject natural input."
      ],
      "solution": "Adopt a three-part discipline. (1) Tokenise on Unicode + Latin without assuming a single script per turn. (2) Run language detection at clause level, not utterance level, so mixed-language tagging is preserved. (3) Choose models trained explicitly on code-switched corpora for the relevant language pair; if not available, prompt-engineer with code-switched few-shot examples. Tool slot extraction (entities like place names, times) must accept either script; normalise *after* extraction, not before.",
      "structure": "Utterance -> per-clause language tagger -> mixed-script aware extractor -> normalised slots -> tool call.",
      "consequences": {
        "benefits": [
          "Natural input is accepted as-is.",
          "Better recall for entities expressed in either language.",
          "Avoids the per-language refusal anti-pattern."
        ],
        "liabilities": [
          "Per-clause language detection is harder than utterance-level.",
          "Few foundation models are explicitly evaluated on code-switching.",
          "Eval sets need multilingual + code-switched coverage."
        ]
      },
      "constrains": "The agent may not refuse or downgrade a request because the user mixed languages or scripts in one utterance; mixed-language input is in-spec.",
      "known_uses": [
        {
          "system": "Sarvam (Indic LLMs and conversational agents)",
          "note": "Models and pipelines explicitly trained for Indic-English code-switching.",
          "status": "available",
          "url": "https://www.sarvam.ai/"
        },
        {
          "system": "AI4Bharat IndicTrans / IndicLLM family",
          "note": "Indic-focused models with code-switching coverage.",
          "status": "available"
        },
        {
          "system": "Krutrim (Ola)",
          "note": "Indic-first foundation model targeting mixed-language input.",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "structured-output",
          "relation": "complements"
        },
        {
          "pattern": "translation-layer",
          "relation": "alternative-to"
        },
        {
          "pattern": "input-output-guardrails",
          "relation": "complements"
        },
        {
          "pattern": "multilingual-voice-agent",
          "relation": "complements"
        },
        {
          "pattern": "refusal",
          "relation": "conflicts-with"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Sarvam AI",
          "url": "https://www.sarvam.ai/"
        },
        {
          "type": "doc",
          "title": "AI4Bharat",
          "url": "https://ai4bharat.iitm.ac.in/"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "structure-data",
        "multilingual",
        "india-origin",
        "code-switching"
      ]
    },
    {
      "id": "dspy-signatures",
      "name": "DSPy Signatures",
      "aliases": [
        "Prompt Programs",
        "Compiled Prompts"
      ],
      "category": "structure-data",
      "intent": "Specify agent behaviour as declarative typed signatures and modules; compile prompts and few-shot examples automatically against a metric.",
      "context": "Building reliable agent pipelines without hand-crafting prompts; treating prompts as the output of a compiler over typed specifications.",
      "problem": "Hand-crafted prompts are brittle, model-specific, and drift over time; teams reinvent the same prompt-engineering loop per pipeline.",
      "forces": [
        "Declarative coverage vs signature expressivity ceiling.",
        "Compile-time optimization vs metric/data availability.",
        "Portability vs per-model compilation gains."
      ],
      "solution": "Define each step as a typed signature (input fields → output fields). Compose signatures into modules. Run a teleprompter (optimizer) that generates few-shot examples and refines instructions against a held-out metric. The compiled artefact replaces hand-tuned prompts.",
      "consequences": {
        "benefits": [
          "Prompts become a reproducible build artefact.",
          "Metric-driven optimisation replaces vibes-based prompting."
        ],
        "liabilities": [
          "Compilation requires labelled or auto-evaluable data.",
          "Compiled artefacts drift with model upgrades; recompile regularly."
        ]
      },
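      "example_code": {
        "language": "python",
        "note": "Sketch loosely following DSPy's documented usage (signature class, module, teleprompter); exact class and argument names vary between DSPy versions, and my_metric / train_examples are placeholders.",
        "code": "import dspy\nfrom dspy.teleprompt import BootstrapFewShot\n\nclass DraftAnswer(dspy.Signature):\n    'Answer the question using the supplied context.'\n    context = dspy.InputField()\n    question = dspy.InputField()\n    answer = dspy.OutputField()\n\nmodule = dspy.ChainOfThought(DraftAnswer)\n\n# Compile prompts and few-shot examples against a metric; the compiled artefact\n# replaces hand-tuned prompts and is recompiled when the model changes.\ncompiled = BootstrapFewShot(metric=my_metric).compile(module, trainset=train_examples)"
      },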
      "constrains": "Module behaviour is constrained by its declared signature; ad-hoc string manipulation is replaced by typed input/output fields.",
      "known_uses": [
        {
          "system": "Stanford DSPy",
          "status": "available",
          "url": "https://github.com/stanfordnlp/dspy"
        },
        {
          "system": "DSPy production deployments at Replit, Databricks, Klarna",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "structured-output",
          "relation": "uses"
        },
        {
          "pattern": "eval-harness",
          "relation": "uses"
        },
        {
          "pattern": "agent-skills",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines",
          "authors": "Khattab, Singhvi, Maheshwari, Zhang, Santhanam, Vardhamanan, Haq, Sharma, Joshi, Moazam, Miller, Zaharia, Potts",
          "year": 2023,
          "url": "https://arxiv.org/abs/2310.03714"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "prompt-programs",
        "dspy",
        "compilation"
      ]
    },
    {
      "id": "polymorphic-record",
      "name": "Polymorphic Record",
      "aliases": [
        "Tagged Union",
        "Discriminated Union"
      ],
      "category": "structure-data",
      "intent": "Represent a family of related entities in a single core schema with type-specific extensions.",
      "context": "Multiple entity sub-types (yarn / fabric / thread; project / queue / favorite) share most fields but differ in some.",
      "problem": "Separate schemas per sub-type duplicate work and break clients that do not understand all sub-types; one giant schema with all fields is bloated and unenforceable.",
      "forces": [
        "Common fields must stay common; new sub-types must not break old ones.",
        "Type-specific fields need a clean place to live.",
        "Validation must be per-sub-type, not just per-record."
      ],
      "solution": "Define a core schema with the common fields and a discriminator (e.g. `material_type`). Sub-type fields live in a namespaced extension block (e.g. `yarn: {...}` for yarn-specific). Clients that do not understand a sub-type still read the core fields and round-trip the rest without data loss.",
      "consequences": {
        "benefits": [
          "Forward-compatible: new sub-types don't break old clients.",
          "One core schema; many specialisations."
        ],
        "liabilities": [
          "Validation logic per sub-type adds complexity.",
          "Discriminator-driven code paths can be hard to debug."
        ]
      },
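      "example_code": {
        "language": "python",
        "note": "Sketch of a discriminated record in the Weft style: common core fields plus a namespaced extension block keyed by the discriminator. Field names are illustrative.",
        "code": "material = {\n    'id': 'mat-001',\n    'name': 'Merino 4-ply',\n    'material_type': 'yarn',            # discriminator\n    'quantity': 3,\n    'yarn': {                           # namespaced, type-specific extension block\n        'weight': 'fingering',\n        'fiber': 'merino'\n    }\n}\n\ndef core_view(record):\n    # A client that does not understand the sub-type reads only the core fields\n    # and round-trips the extension block untouched.\n    return {k: v for k, v in record.items() if k != record['material_type']}"
      },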
      "constrains": "Sub-type fields must live under their namespaced extension; they cannot pollute the core.",
      "known_uses": [
        {
          "system": "Weft",
          "note": "Material with material_type=yarn / fabric / thread / etc.; Pattern across knitting / crochet / weaving / etc.",
          "status": "available"
        },
        {
          "system": "FHIR resource polymorphism",
          "status": "available"
        },
        {
          "system": "Stripe API discriminated objects",
          "status": "available"
        },
        {
          "system": "JSON-LD @type",
          "status": "available"
        },
        {
          "system": "OpenAPI discriminator/oneOf",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "schema-extensibility",
          "relation": "complements"
        },
        {
          "pattern": "translation-layer",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "book",
          "title": "Designing Data-Intensive Applications",
          "authors": "Martin Kleppmann",
          "year": 2017,
          "url": "https://dataintensive.net/"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "schema",
        "polymorphism",
        "data"
      ]
    },
    {
      "id": "schema-extensibility",
      "name": "Schema Extensibility",
      "aliases": [
        "Reserved Fields",
        "Namespaced Extensions"
      ],
      "category": "structure-data",
      "intent": "Build schemas that evolve without breaking old clients via reserved namespaces and extension blocks.",
      "context": "Long-lived data formats accumulate fields; rigid schemas force breaking changes that cascade through clients.",
      "problem": "Rigid schemas break when fields are added; permissive schemas become incoherent.",
      "forces": [
        "Old clients should ignore new fields, not error.",
        "New fields should be discoverable, not hidden.",
        "Versioning policy must be agreed upfront."
      ],
      "solution": "Define a versioned envelope (`{schema_version, type, payload}`). Reserve namespaces for extensions (`x-vendor.foo`, `extensions: {...}`). Old clients ignore unknown extensions. Bumps to schema_version are the only breaking-change signal.",
      "consequences": {
        "benefits": [
          "Long-lived format with low breakage.",
          "Per-vendor extensions don't pollute the core."
        ],
        "liabilities": [
          "Extension proliferation is a real risk.",
          "Versioning discipline must be enforced socially or technically."
        ]
      },
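      "example_code": {
        "language": "python",
        "note": "Sketch of a tolerant reader for the versioned envelope: unknown extension fields are preserved but never interpreted, and only a major schema_version bump is treated as breaking. Key names are illustrative.",
        "code": "SUPPORTED_MAJOR = 1\n\ndef read_envelope(doc):\n    major = int(str(doc['schema_version']).split('.')[0])\n    if major != SUPPORTED_MAJOR:\n        raise ValueError('unsupported schema_version: %s' % doc['schema_version'])\n    known = {'schema_version', 'type', 'payload'}\n    extensions = {k: v for k, v in doc.items() if k not in known}\n    return doc['type'], doc['payload'], extensions   # extensions round-trip untouched"
      },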
      "constrains": "Clients cannot rely on extension fields outside their declared namespace.",
      "known_uses": [
        {
          "system": "Weft",
          "note": "Versioned envelope: {weft_version, type, exported_at, exported_from, items}.",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "polymorphic-record",
          "relation": "complements"
        },
        {
          "pattern": "translation-layer",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Protocol Buffers backwards compatibility",
          "url": "https://protobuf.dev/programming-guides/proto3/#updating"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "schema",
        "extensibility",
        "versioning"
      ]
    },
    {
      "id": "structured-output",
      "name": "Structured Output",
      "aliases": [
        "JSON Mode",
        "Schema-Constrained Generation",
        "Typed Output"
      ],
      "category": "structure-data",
      "intent": "Constrain the model's output to conform to a JSON Schema (or similar typed shape).",
      "context": "Downstream code needs typed data; free-form text breaks parsers and propagates errors.",
      "problem": "Free-form output requires fragile post-hoc parsing; the model produces near-JSON that fails strict parsers in surprising ways.",
      "forces": [
        "Strict schemas reduce model freedom and recall.",
        "Schema evolution is a real concern.",
        "Provider implementations of structured output differ in fidelity."
      ],
      "solution": "Define a JSON Schema (or Pydantic / Zod / equivalent). Pass it to the model via the provider's structured-output mode. Validate the output. Reject and retry on validation failure. Cap retries.",
      "consequences": {
        "benefits": [
          "Downstream code becomes simple and typed.",
          "Schema-level errors surface immediately."
        ],
        "liabilities": [
          "Provider lock-in for the strictest modes.",
          "Some tasks resist schema-fitting; the schema becomes the bottleneck."
        ]
      },
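      "example_code": {
        "language": "python",
        "note": "Sketch of validate-and-retry around a provider's structured-output mode, using Pydantic v2 for the schema. call_model is a placeholder and the schema fields are illustrative.",
        "code": "from pydantic import BaseModel, ValidationError\n\nclass Review(BaseModel):\n    verdict: str\n    issues: list[str]\n    confidence: float\n\ndef reviewed(prompt, max_retries=2):\n    for _ in range(max_retries + 1):\n        raw = call_model(prompt, schema=Review.model_json_schema())   # placeholder provider call\n        try:\n            return Review.model_validate_json(raw)   # typed object for downstream code\n        except ValidationError as err:\n            prompt += '\\nPrevious output failed validation: ' + str(err)\n    raise RuntimeError('structured output failed after retries')"
      },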
      "constrains": "The model cannot return content that does not validate against the schema.",
      "known_uses": [
        {
          "system": "ConvArch",
          "note": "Strict JSON schema for every architecture-edit tool call.",
          "status": "available"
        },
        {
          "system": "Knitting-DSL Pipeline (Stash2Go)",
          "note": "Frozen 6-item rubric output schema.",
          "status": "available"
        },
        {
          "system": "Guardrails AI",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "tool-use",
          "relation": "used-by"
        },
        {
          "pattern": "frozen-rubric-reflection",
          "relation": "used-by"
        },
        {
          "pattern": "deterministic-llm-sandwich",
          "relation": "used-by"
        },
        {
          "pattern": "schema-free-output",
          "relation": "alternative-to"
        },
        {
          "pattern": "plan-and-execute",
          "relation": "complements"
        },
        {
          "pattern": "dspy-signatures",
          "relation": "used-by"
        },
        {
          "pattern": "input-output-guardrails",
          "relation": "used-by"
        },
        {
          "pattern": "streaming-typed-events",
          "relation": "complements"
        },
        {
          "pattern": "hallucinated-tools",
          "relation": "alternative-to"
        },
        {
          "pattern": "tool-output-trusted-verbatim",
          "relation": "alternative-to"
        },
        {
          "pattern": "sop-encoded-multi-agent",
          "relation": "used-by"
        },
        {
          "pattern": "mobile-ui-agent",
          "relation": "used-by"
        },
        {
          "pattern": "dual-system-gui-agent",
          "relation": "used-by"
        },
        {
          "pattern": "code-as-action",
          "relation": "complements"
        },
        {
          "pattern": "multilingual-voice-agent",
          "relation": "used-by"
        },
        {
          "pattern": "code-switching-aware-agent",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "OpenAI Structured Outputs",
          "url": "https://platform.openai.com/docs/guides/structured-outputs"
        },
        {
          "type": "doc",
          "title": "Pydantic",
          "url": "https://docs.pydantic.dev"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "schema",
        "json",
        "typed-output"
      ]
    },
    {
      "id": "agent-computer-interface",
      "name": "Agent-Computer Interface",
      "aliases": [
        "ACI",
        "Agent-Friendly Tooling",
        "SWE-Agent ACI"
      ],
      "category": "tool-use-environment",
      "intent": "Design the tool surface for an LLM agent specifically, with affordances different from human-facing CLIs.",
      "context": "Production coding agents and other domain agents that operate over file systems or APIs originally designed for humans.",
      "problem": "Tools designed for humans (full-buffer editors, full-page web views, generic shells) overwhelm agent context and provide poor signals; agents perform better against curated agent-friendly surfaces.",
      "forces": [
        "Agent-friendly tools require parallel implementations alongside human ones.",
        "Tool surface must balance agent ergonomics with capability completeness.",
        "Linter / type signal exposure helps but adds output volume."
      ],
      "solution": "Design tools specifically for agents: file viewer that shows a windowed slice with line numbers, edit tool that re-runs linter and shows results, shell that returns structured stdout/stderr/exit-code, search tool that filters and ranks. Each tool's signature + return type optimised for the agent's context budget and reasoning shape.",
      "consequences": {
        "benefits": [
          "Substantial accuracy gains over human-CLI tools at the same task.",
          "Inspectable design choices per tool."
        ],
        "liabilities": [
          "Two interface surfaces to maintain (agent + human).",
          "ACI design is empirical; iterations needed."
        ]
      },
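      "example_code": {
        "language": "python",
        "note": "Sketch of one ACI tool: a windowed file viewer that returns a numbered slice plus metadata rather than the whole buffer, sized for the agent's context budget. The return shape is illustrative.",
        "code": "def view_file(path, start_line=1, window=100):\n    with open(path, encoding='utf-8') as f:\n        lines = f.readlines()\n    end = min(start_line - 1 + window, len(lines))\n    body = ''.join('%5d| %s' % (i + 1, lines[i]) for i in range(start_line - 1, end))\n    return {\n        'path': path,\n        'showing': [start_line, end],\n        'total_lines': len(lines),\n        'content': body\n    }"
      },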
      "constrains": "Agent tools follow a deliberate ACI design contract; raw human-CLI tools are not exposed as primary tools.",
      "known_uses": [
        {
          "system": "SWE-Agent (Princeton)",
          "status": "available",
          "url": "https://github.com/princeton-nlp/SWE-agent"
        },
        {
          "system": "Claude Code's curated tool set",
          "status": "available"
        },
        {
          "system": "Cursor's contextual file edit tools",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "tool-use",
          "relation": "specialises"
        },
        {
          "pattern": "tool-loadout",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "SWE-Agent: Agent-Computer Interfaces Enable Automated Software Engineering",
          "authors": "Yang, Jimenez, Wettig, Lieret, Yao, Narasimhan, Press",
          "year": 2024,
          "url": "https://arxiv.org/abs/2405.15793"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "tool-use",
        "aci",
        "design"
      ]
    },
    {
      "id": "agent-skills",
      "name": "Agent Skills",
      "aliases": [
        "Author-Time Procedures",
        "Slash Commands",
        "Agent Rules"
      ],
      "category": "tool-use-environment",
      "intent": "Package author-time procedures (markdown + optional resources) the agent loads on demand for specific task types.",
      "context": "Production agent products where common workflows benefit from authored procedures the agent can pick up.",
      "problem": "Stuffing every workflow into the system prompt bloats context; ad-hoc prompt files are unmanaged. Distinct from the runtime skill-library (which the agent itself authors).",
      "forces": [
        "Discovery: how does the agent know which skill applies?",
        "Versioning of authored procedures.",
        "Skill quality bounds agent quality on the relevant workflow."
      ],
      "solution": "Package each procedure as a markdown file (and optional companion resources) under a known directory. The agent loads relevant skills on demand based on the current task. Skills are author-time artefacts versioned with the agent.",
      "consequences": {
        "benefits": [
          "Workflow knowledge becomes a product surface.",
          "Versioned, reviewable, sharable."
        ],
        "liabilities": [
          "Discovery / matching overhead.",
          "Skill rot when not maintained."
        ]
      },
      "constrains": "The agent operates within the procedure of the loaded skill; ad-hoc deviation is forbidden when a skill is active.",
      "known_uses": [
        {
          "system": "Anthropic Claude Skills",
          "status": "available"
        },
        {
          "system": "Claude Code slash commands",
          "status": "available"
        },
        {
          "system": "Cursor rules / .cursorrules",
          "status": "available"
        },
        {
          "system": "Continue prompts",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "skill-library",
          "relation": "alternative-to",
          "note": "Author-time vs agent-authored skills."
        },
        {
          "pattern": "dynamic-scaffolding",
          "relation": "complements"
        },
        {
          "pattern": "spec-first-agent",
          "relation": "complements"
        },
        {
          "pattern": "toolformer",
          "relation": "complements"
        },
        {
          "pattern": "dspy-signatures",
          "relation": "complements"
        },
        {
          "pattern": "prompt-bloat",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Anthropic: Skills",
          "url": "https://docs.anthropic.com/en/docs/agents-and-tools/agent-skills/overview"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "skills",
        "authoring",
        "procedures"
      ]
    },
    {
      "id": "app-exploration-phase",
      "name": "App Exploration Phase",
      "aliases": [
        "Pre-Deployment Exploration",
        "App Onboarding Crawl",
        "UI Element Documentation"
      ],
      "category": "tool-use-environment",
      "intent": "Before deploying an agent against an opaque app, have it explore (or watch a human demonstrate) the app, generating a per-element documentation knowledge base; at deployment, retrieve element docs to ground actions.",
      "context": "The agent must operate an app whose UI is not self-describing (no accessibility metadata, no API). Element semantics (\"this is the cart icon\", \"this opens search filters\") have to be learned somehow.",
      "problem": "Without prior knowledge of element semantics, the agent guesses at every screen — slow, error-prone, and wasteful per-deployment-task.",
      "forces": [
        "Exploration costs time and money up front;",
        "Demonstrations require a human, but a single demo amortises across many deployments.",
        "App UIs change; the documentation goes stale and needs refresh.",
        "Documentation that is too verbose drowns the agent in irrelevant context at deployment."
      ],
      "solution": "Split the agent's lifecycle into two phases. (1) Exploration — agent autonomously interacts with the app or watches a human demo, and writes per-element documentation: what the element is, what it does, when to use it. Store as a structured knowledge base. (2) Deployment — for each task, retrieve relevant element docs (e.g. via vector search), inject into context, then act. Refresh docs when the UI changes.",
      "structure": "Phase 1: Agent (or Human) -> interact_with_app -> per-element docs -> KB. Phase 2: Task -> retrieve(KB) -> grounded actions on app.",
      "consequences": {
        "benefits": [
          "Deployment-time actions are grounded in learned semantics, not guesses.",
          "Single exploration amortises across many user tasks.",
          "Human-demo mode lowers the bar to onboard a new app."
        ],
        "liabilities": [
          "Exploration is expensive and offline; production tasks must wait or use an older KB.",
          "KB drift when the app changes; staleness detection is non-trivial.",
          "Element documentation quality bounds deployment-phase quality."
        ]
      },
      "constrains": "At deployment, the agent may not act on an element whose documentation is missing; missing-doc events trigger re-exploration rather than improvisation.",
      "known_uses": [
        {
          "system": "AppAgent (Tencent)",
          "note": "Two-phase architecture; exploration writes element documentation consumed at deployment.",
          "status": "available",
          "url": "https://github.com/TencentQQGYLab/AppAgent"
        }
      ],
      "related": [
        {
          "pattern": "tool-discovery",
          "relation": "specialises",
          "note": "Tool discovery for opaque GUIs."
        },
        {
          "pattern": "skill-library",
          "relation": "complements"
        },
        {
          "pattern": "naive-rag",
          "relation": "uses",
          "note": "Element docs are retrieved at deployment."
        },
        {
          "pattern": "mobile-ui-agent",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "AppAgent: Multimodal Agents as Smartphone Users",
          "authors": "Zhang et al.",
          "year": 2023,
          "url": "https://arxiv.org/abs/2312.13771"
        }
      ],
      "status_in_practice": "experimental",
      "tags": [
        "tool-use",
        "gui-agent",
        "china-origin",
        "exploration"
      ]
    },
    {
      "id": "browser-agent",
      "name": "Browser Agent",
      "aliases": [
        "Web Agent",
        "Browser Automation Agent"
      ],
      "category": "tool-use-environment",
      "intent": "Expose websites to the agent through a structured DOM/accessibility tree plus a small action vocabulary, sitting between raw HTML and pixel-level Computer Use.",
      "context": "Web-based workflows (research, form filling, scraping, multi-site automation) where Computer Use is too slow and bespoke API integration is too narrow.",
      "problem": "Raw HTML is too noisy for agents; pixel-based GUI control is slow and brittle on the web specifically.",
      "forces": [
        "DOM extraction needs a stable representation across sites.",
        "Action vocabulary completeness vs simplicity.",
        "Anti-bot measures break agent flows."
      ],
      "solution": "A library (Playwright-backed) exposes structured page state (numbered interactive elements, accessibility tree) and a compact action set (click, type, scroll, navigate). The agent reasons over the structured state and emits actions; the library executes them.",
      "consequences": {
        "benefits": [
          "Faster and more reliable than pixel-driven Computer Use on the web.",
          "Web-specific abstractions like 'fill form' compose naturally."
        ],
        "liabilities": [
          "Still struggles with heavily-dynamic JS apps.",
          "Anti-bot blocks; CAPTCHAs."
        ]
      },
      "constrains": "Actions are limited to the typed vocabulary; arbitrary JavaScript execution is not part of this surface.",
      "known_uses": [
        {
          "system": "browser-use (Python library)",
          "status": "available",
          "url": "https://github.com/browser-use/browser-use"
        },
        {
          "system": "Playwright + LangChain",
          "status": "available"
        },
        {
          "system": "OpenAI Operator",
          "status": "available",
          "url": "https://operator.chatgpt.com/"
        },
        {
          "system": "Browserbase",
          "status": "available",
          "url": "https://www.browserbase.com/"
        },
        {
          "system": "Skyvern",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "computer-use",
          "relation": "alternative-to"
        },
        {
          "pattern": "tool-use",
          "relation": "specialises"
        },
        {
          "pattern": "tool-output-poisoning",
          "relation": "complements"
        },
        {
          "pattern": "mobile-ui-agent",
          "relation": "alternative-to"
        },
        {
          "pattern": "dual-system-gui-agent",
          "relation": "generalises"
        }
      ],
      "references": [
        {
          "type": "repo",
          "title": "browser-use/browser-use",
          "url": "https://github.com/browser-use/browser-use"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "environment",
        "web",
        "browser"
      ]
    },
    {
      "id": "code-as-action",
      "name": "Code-as-Action Agent",
      "aliases": [
        "CodeAct Agent",
        "Code-Writing Agent",
        "Python-Action ReAct",
        "Executable Code Actions"
      ],
      "category": "tool-use-environment",
      "intent": "Have the agent emit a code snippet as its action each step, executed in a constrained interpreter, instead of emitting JSON tool calls; tool composition becomes function nesting and control flow inside the snippet.",
      "context": "Agent loops where each step often needs to compose several tool results, branch on intermediate values, or compute over them — actions that a single JSON tool call cannot express without inventing meta-tools.",
      "problem": "JSON tool calls flatten composition; expressing \"call A, filter results by predicate B, then call C on each\" requires multiple turns or bespoke meta-tools, both of which inflate token cost and lose the natural composability of a programming language.",
      "forces": [
        "Programming languages express composition (loops, conditionals, function nesting) natively.",
        "JSON tool-call format flattens that composition into a sequence of turns.",
        "Executing model-generated code is a real security surface.",
        "Models trained on code emit composed actions more compactly than JSON ones."
      ],
      "solution": "Replace the JSON tool-call channel with a code-snippet channel. The agent emits a Python (or DSL) snippet; the host executes it in a sandboxed interpreter that pre-imports the available tools as functions and an allow-list of safe builtins/modules. Tool results are returned as Python values usable by subsequent code. The agent can compose tools inside one snippet (loops, conditionals, intermediate variables) and observe the printed output. Bracket every snippet with a sandbox that whitelists imports and prevents arbitrary IO.",
      "structure": "Agent -> code snippet -> Sandbox(allowlisted imports + tool functions) -> stdout/return -> Agent.",
      "consequences": {
        "benefits": [
          "Empirically ~30% fewer steps and tokens than JSON tool calls.",
          "Natural composability: function nesting, loops, conditionals in one action.",
          "Models trained on code (most modern frontier models) emit better code than JSON."
        ],
        "liabilities": [
          "Sandbox correctness is load-bearing; weak sandbox means arbitrary code execution.",
          "Debugging silent failures inside snippets is harder than per-call JSON tracing.",
          "Some hosted environments forbid model-generated code execution."
        ]
      },
      "constrains": "The agent may only execute Python operations against the explicitly allowlisted imports and tool functions; arbitrary import or system calls fail at the sandbox boundary.",
      "known_uses": [
        {
          "system": "Hugging Face smolagents",
          "note": "CodeAgent emits Python; pre-imported tool functions; allow-listed modules.",
          "status": "available",
          "url": "https://github.com/huggingface/smolagents"
        },
        {
          "system": "Hugging Face Transformers Agents",
          "note": "Original ReactCodeAgent / CodeAgent variants; beat the GAIA benchmark.",
          "status": "available"
        },
        {
          "system": "Manus",
          "note": "Sandbox VM with shell + file edit; agent action vocabulary includes code execution.",
          "status": "available",
          "url": "https://manus.im/"
        }
      ],
      "related": [
        {
          "pattern": "tool-use",
          "relation": "alternative-to"
        },
        {
          "pattern": "code-execution",
          "relation": "uses"
        },
        {
          "pattern": "sandbox-isolation",
          "relation": "uses"
        },
        {
          "pattern": "react",
          "relation": "specialises"
        },
        {
          "pattern": "parallel-tool-calls",
          "relation": "alternative-to"
        },
        {
          "pattern": "structured-output",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Executable Code Actions Elicit Better LLM Agents",
          "authors": "Wang et al.",
          "year": 2024,
          "url": "https://arxiv.org/abs/2402.01030"
        },
        {
          "type": "blog",
          "title": "Introducing smolagents: simple agents that write actions in code",
          "url": "https://huggingface.co/blog/smolagents"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "tool-use",
        "code",
        "france-origin",
        "smolagents",
        "huggingface"
      ]
    },
    {
      "id": "code-execution",
      "name": "Code Execution",
      "aliases": [
        "Code-Then-Execute",
        "CodeAct",
        "Program of Thoughts"
      ],
      "category": "tool-use-environment",
      "intent": "Let the model emit code, run it in a sandbox, and treat the run as the answer instead of trusting the model to compute in its head.",
      "context": "The task involves arithmetic, data manipulation, parsing, or transformation where executing code is more reliable than text generation.",
      "problem": "LLMs hallucinate calculations and miscount; small numeric errors invalidate downstream steps.",
      "forces": [
        "Sandbox setup adds latency.",
        "Generated code may import unsafe modules or run forever.",
        "Execution results must round-trip back into the model's working context."
      ],
      "solution": "The agent emits a code block; a controlled interpreter (Python sandbox, JS VM, container) runs it; stdout/stderr/return value flow back. Repeat under a step budget. CodeAct treats code as the action language directly.",
      "consequences": {
        "benefits": [
          "Deterministic computation on top of probabilistic intent.",
          "Code is auditable; the same script can be replayed for debugging."
        ],
        "liabilities": [
          "Sandbox security is its own engineering problem.",
          "Very flexible action space increases failure modes versus a curated tool palette."
        ]
      },
      "constrains": "Computation happens in the sandbox; the model's free-form numeric output is not trusted.",
      "known_uses": [
        {
          "system": "OpenAI Code Interpreter / Advanced Data Analysis",
          "status": "available"
        },
        {
          "system": "Anthropic Claude with code execution tool",
          "status": "available"
        },
        {
          "system": "CodeAct paper implementations",
          "status": "available"
        },
        {
          "system": "Claude Code (Bash tool)",
          "status": "available",
          "url": "https://docs.claude.com/en/docs/claude-code/overview"
        },
        {
          "system": "Replit Agent",
          "status": "available",
          "url": "https://replit.com/ai"
        },
        {
          "system": "v0",
          "status": "available",
          "url": "https://v0.app/"
        },
        {
          "system": "E2B Sandboxes",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "tool-use",
          "relation": "specialises"
        },
        {
          "pattern": "react",
          "relation": "composes-with"
        },
        {
          "pattern": "deterministic-llm-sandwich",
          "relation": "composes-with"
        },
        {
          "pattern": "skill-library",
          "relation": "composes-with"
        },
        {
          "pattern": "sandbox-isolation",
          "relation": "complements"
        },
        {
          "pattern": "wasm-skill-runtime",
          "relation": "complements"
        },
        {
          "pattern": "code-as-action",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "PAL: Program-aided Language Models",
          "authors": "Gao et al.",
          "year": 2022,
          "url": "https://arxiv.org/abs/2211.10435"
        },
        {
          "type": "paper",
          "title": "Executable Code Actions Elicit Better LLM Agents (CodeAct)",
          "authors": "Wang et al.",
          "year": 2024,
          "url": "https://arxiv.org/abs/2402.01030"
        },
        {
          "type": "paper",
          "title": "Program of Thoughts Prompting",
          "authors": "Chen, Ma, Wang, Cohen",
          "year": 2022,
          "url": "https://arxiv.org/abs/2211.12588"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "code-execution",
        "sandbox",
        "tool-use"
      ]
    },
    {
      "id": "computer-use",
      "name": "Computer Use",
      "aliases": [
        "Desktop Agent",
        "GUI Agent",
        "Screen Control"
      ],
      "category": "tool-use-environment",
      "intent": "Let the model drive a desktop end-to-end via screenshots plus virtual mouse/keyboard tool calls instead of bespoke per-app APIs.",
      "context": "Tasks that span multiple applications, legacy GUIs, or interfaces with no clean machine API; the agent must operate the same surfaces a human does.",
      "problem": "Most software has no clean API; agents need to operate GUIs visually, including ones the agent's vendor never integrated with.",
      "forces": [
        "Latency and reliability are open problems.",
        "Prompt injection via on-screen content is a real attack surface.",
        "Cost: every step pays vision tokens."
      ],
      "solution": "The model receives screenshots (optionally augmented with accessibility-tree or set-of-mark annotations) and emits typed tool calls (move mouse, click, type, scroll, screenshot). A controller executes them against a real or virtual desktop. The loop is ReAct-shaped: screenshot → think → act → screenshot.",
      "consequences": {
        "benefits": [
          "Universal coverage of GUI software.",
          "No per-app integration work."
        ],
        "liabilities": [
          "Slow and brittle on dynamic UIs.",
          "Screen content is now part of the prompt; injection becomes possible."
        ]
      },
      "constrains": "The agent operates the desktop only through the typed action vocabulary; arbitrary code execution is not part of this surface.",
      "known_uses": [
        {
          "system": "Anthropic Computer Use (Claude 3.5+)",
          "status": "available",
          "url": "https://www.anthropic.com/news/3-5-models-and-computer-use"
        },
        {
          "system": "OpenAI Operator",
          "status": "available",
          "url": "https://operator.chatgpt.com/"
        }
      ],
      "related": [
        {
          "pattern": "browser-agent",
          "relation": "alternative-to"
        },
        {
          "pattern": "react",
          "relation": "uses"
        },
        {
          "pattern": "input-output-guardrails",
          "relation": "complements"
        },
        {
          "pattern": "mobile-ui-agent",
          "relation": "alternative-to"
        },
        {
          "pattern": "dual-system-gui-agent",
          "relation": "generalises"
        },
        {
          "pattern": "multilingual-voice-agent",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "Introducing computer use, a new Claude 3.5 Sonnet, and Claude 3.5 Haiku",
          "authors": "Anthropic",
          "year": 2024,
          "url": "https://www.anthropic.com/news/3-5-models-and-computer-use"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "environment",
        "gui",
        "vision"
      ]
    },
    {
      "id": "dual-system-gui-agent",
      "name": "Dual-System GUI Agent",
      "aliases": [
        "Decision-Plus-Grounding",
        "Planner-and-Vision Split",
        "Two-Model GUI Agent"
      ],
      "category": "tool-use-environment",
      "intent": "Split a GUI agent into a decision model that plans and recovers from errors and a grounding model that observes pixels and emits the precise action; route each subproblem to the better-suited model.",
      "context": "Long, multi-step GUI tasks (web flows, phone apps) where planning needs flexibility and abstraction while grounding needs precise visual matching at action time.",
      "problem": "A single model that does both planning and grounding is dominated by the harder skill at any moment: a planning-strong model misclicks; a vision-strong model loops on local fixes when it should replan.",
      "forces": [
        "Planning skill and grounding skill are distinct in current models.",
        "Two models cost more per turn but can be smaller per task.",
        "Hand-off between models needs a clean intermediate representation.",
        "Error recovery has to know which model to blame."
      ],
      "solution": "Define a clean intermediate representation: the decision model emits a high-level intent (\"open the cart\", \"swipe left to next item\") in a small, typed vocabulary; the grounding model receives that intent plus the current screenshot and emits the concrete action (tap(x,y), swipe coordinates, key press). The decision model holds the plan and replans on failure; the grounding model is stateless per action but specialised on screen interpretation. Errors at the grounding step are reported back to the decision model for replanning, not retried locally.",
      "structure": "Screenshot -> Decision_Model -> intent (typed) -> Grounding_Model + Screenshot -> low-level action -> device -> next Screenshot.",
      "consequences": {
        "benefits": [
          "Each model is sized to its skill; total parameters are smaller than a unified model.",
          "Error recovery has a clean attribution: planning vs. grounding.",
          "Decision-model planning generalises across desktop, web, phone; grounding model is per-surface."
        ],
        "liabilities": [
          "Two model calls per turn — latency and cost.",
          "Intent vocabulary design is a real engineering problem.",
          "Hand-off mistakes (decision says X, grounding hears Y) are hard to debug."
        ]
      },
      "constrains": "The decision model may not emit pixel-level actions; the grounding model may not change the plan or invent intents outside the typed vocabulary.",
      "known_uses": [
        {
          "system": "AutoGLM (Zhipu)",
          "note": "Decision model (GLM-4.7) plus grounding/vision model; web and Android variants.",
          "status": "available",
          "url": "https://xiao9905.github.io/AutoGLM/"
        },
        {
          "system": "Mobile-Agent-v2",
          "note": "Three-agent variant: planning + decision + reflection.",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "computer-use",
          "relation": "specialises"
        },
        {
          "pattern": "browser-agent",
          "relation": "specialises"
        },
        {
          "pattern": "mobile-ui-agent",
          "relation": "complements"
        },
        {
          "pattern": "multi-model-routing",
          "relation": "uses"
        },
        {
          "pattern": "structured-output",
          "relation": "uses"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "AutoGLM: Autonomous Foundation Agents for GUIs",
          "year": 2024,
          "url": "https://arxiv.org/abs/2411.00820"
        },
        {
          "type": "paper",
          "title": "Mobile-Agent-v2: Mobile Device Operation Assistant with Effective Navigation via Multi-Agent Collaboration",
          "authors": "Wang et al.",
          "year": 2024,
          "url": "https://arxiv.org/abs/2406.01014"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "tool-use",
        "gui-agent",
        "china-origin",
        "autoglm"
      ]
    },
    {
      "id": "mcp",
      "name": "Model Context Protocol",
      "aliases": [
        "MCP",
        "Open Tool Protocol"
      ],
      "category": "tool-use-environment",
      "intent": "Standardise how agents discover and call tools so that a tool written once is usable by any conformant agent.",
      "context": "Multiple agents (vendor clients, IDEs, custom hosts) want to share a tool layer; rewriting tool adapters per host is wasteful and creates drift.",
      "problem": "Tool definitions are vendor-specific; the same capability is re-implemented per agent host with diverging behaviour.",
      "forces": [
        "Agents need a stable contract; tool authors need freedom to evolve the implementation.",
        "Local (stdio) and hosted (HTTP) deployments have different operational shapes but should expose the same surface.",
        "Auth must travel without leaking host credentials to every tool."
      ],
      "solution": "Tools live behind a server speaking a common protocol. Hosts list available tools, call them with typed arguments, and receive typed results. The protocol covers discovery, invocation, errors, and (in some implementations) prompts and resources alongside tools.",
      "consequences": {
        "benefits": [
          "Write a tool once, expose it to Claude Desktop, Claude Code, Cursor, custom hosts.",
          "Protocol-level auth (bearer-wrapped per-user tokens) keeps multi-tenancy out of each tool."
        ],
        "liabilities": [
          "Adds a process boundary; latency and operational surface increase.",
          "Schema versioning across servers and clients is a real concern as the protocol evolves.",
          "Long-lived SSE connections need server-side keep-alives and per-tool timeouts; connection drops mid-tool-call leave orphaned operations whose results are never reconciled.",
          "Streaming-tool backpressure: slow consumers can fill server buffers when the model lags behind the tool's stream output."
        ]
      },
      "constrains": "Agents can only see tools advertised by an MCP server; servers can only advertise tools matching the protocol's typed shape.",
      "known_uses": [
        {
          "system": "Weft",
          "note": "Node.js MCP server exposing Ravelry through the WEFT JSON format; stdio + HTTP entry points.",
          "status": "available",
          "url": "https://github.com/luxxyarns/weft"
        },
        {
          "system": "Anthropic Claude Desktop / Claude Code",
          "status": "available",
          "url": "https://docs.claude.com/en/docs/claude-code/overview"
        },
        {
          "system": "Cursor MCP integration",
          "status": "available"
        },
        {
          "system": "OpenAI Agents SDK",
          "status": "available",
          "url": "https://openai.github.io/openai-agents-python/"
        },
        {
          "system": "Windsurf",
          "status": "available",
          "url": "https://codeium.com/windsurf"
        },
        {
          "system": "Zed",
          "status": "available"
        },
        {
          "system": "GitHub Copilot",
          "status": "available",
          "url": "https://github.com/features/copilot"
        }
      ],
      "related": [
        {
          "pattern": "tool-use",
          "relation": "generalises"
        },
        {
          "pattern": "translation-layer",
          "relation": "composes-with"
        },
        {
          "pattern": "tool-discovery",
          "relation": "used-by"
        },
        {
          "pattern": "inter-agent-communication",
          "relation": "complements"
        },
        {
          "pattern": "tool-output-poisoning",
          "relation": "complements"
        },
        {
          "pattern": "secrets-handling",
          "relation": "complements"
        },
        {
          "pattern": "cross-domain-agent-network",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Model Context Protocol",
          "url": "https://modelcontextprotocol.io"
        },
        {
          "type": "blog",
          "title": "Anthropic: Introducing the Model Context Protocol",
          "year": 2024,
          "url": "https://www.anthropic.com/news/model-context-protocol"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "mcp",
        "protocol",
        "interop"
      ],
      "applicability": {
        "use_when": [
          "Tool palettes need to be portable across multiple host applications.",
          "Multiple clients (IDEs, agents, CLIs) consume the same tool set.",
          "Tools are written in different languages and a transport-level protocol is needed."
        ],
        "do_not_use_when": [
          "Single host, single language, no portability requirement; native function calls are simpler.",
          "Tool latency is dominated by transport overhead and the extra hop hurts.",
          "Audit boundaries demand the tool live in the same process as the agent."
        ]
      },
      "diagram": {
        "type": "sequence",
        "mermaid": "sequenceDiagram\n  participant Host\n  participant Server\n  Host->>Server: initialize\n  Server-->>Host: capabilities (tools, resources, prompts)\n  Host->>Server: tools/list\n  Server-->>Host: typed tool catalog\n  Host->>Server: tools/call(name, args)\n  Server-->>Host: typed result",
        "caption": "MCP is the interface between an agent host and a tool server: typed catalog up front, typed calls and results during."
      },
      "example_scenario": "A developer writes a 'GitHub PR review' tool once and exposes it via Model Context Protocol. Now it works in Claude Desktop, in Cursor, in their custom CLI agent, and in their teammate's VS Code agent — without rewriting the integration four times. The host and the tool only need to agree on MCP, not on each other's internal details.",
      "variants": [
        {
          "name": "stdio transport",
          "summary": "The agent host launches the MCP server as a subprocess and communicates over stdin/stdout JSON-RPC.",
          "distinguishing_factor": "process-local, no network",
          "when_to_use": "Local desktop agents (Claude Desktop, Cursor) that bundle servers as binaries."
        },
        {
          "name": "HTTP / SSE transport",
          "summary": "The MCP server is a long-running HTTP service; the host opens an SSE connection for server-initiated events plus regular HTTP for tool calls.",
          "distinguishing_factor": "remote, streaming-capable",
          "when_to_use": "Hosted MCP servers shared by multiple clients, or when the server needs to push notifications to the agent."
        },
        {
          "name": "Streamable HTTP",
          "summary": "Newer MCP transport (2025+) that consolidates request/response and event streaming on a single HTTP endpoint with chunked encoding.",
          "distinguishing_factor": "single endpoint, bidirectional",
          "when_to_use": "Newer MCP-conformant hosts; replaces the SSE-plus-HTTP split for simpler deployments."
        }
      ]
    },
    {
      "id": "mobile-ui-agent",
      "name": "Mobile UI Agent",
      "aliases": [
        "Smartphone Agent",
        "Mobile App Agent",
        "Touch-UI Agent"
      ],
      "category": "tool-use-environment",
      "intent": "Drive a smartphone end-to-end through a small, touch-native action vocabulary (tap, long-press, swipe, type, back, home) over screenshots, as a distinct interaction surface from desktop Computer Use and from web Browser Agents.",
      "context": "The user task lives in a mobile app whose data and behaviour are not exposed via a public API or web frontend; only the touch UI is available.",
      "problem": "Desktop Computer Use action sets (mouse, keyboard, scroll wheel) and Browser Agent abstractions (DOM, accessibility tree) are wrong shape for a touch UI; pixel-level control without an action vocabulary is too low-level to reason over.",
      "forces": [
        "Mobile actions are touch-native, gesture-based, and screen-coordinate dependent.",
        "Per-app APIs do not exist; only the UI is available.",
        "Screen size is small; what fits on one screen does not generalise.",
        "Visual state is the source of truth, but text is what the model reasons in."
      ],
      "solution": "Define a touch-native action vocabulary (tap(x,y), long_press(x,y), swipe(dir), type(text), back, home). The agent receives a screenshot (optionally with extracted UI element annotations), reasons in text about which element to act on, emits an action call, and observes the next screenshot. Specialise the action vocabulary per platform (Android vs iOS) but keep the agent loop platform-agnostic.",
      "structure": "Screenshot + history -> agent -> action_call(tap|swipe|type|...) -> device -> next screenshot -> ...",
      "consequences": {
        "benefits": [
          "Works against any app whose UI is visible, including third-party Chinese super-apps with no APIs.",
          "Single agent loop generalises across apps once the vocabulary is fixed.",
          "Vision + small action set is a tractable model footprint."
        ],
        "liabilities": [
          "Coordinate-based taps are brittle to screen size, theme, locale changes.",
          "Pure-vision grounding mistakes are common; element-annotation pipelines add complexity.",
          "Sensitive actions (payments, deletions) are easy to mis-fire."
        ]
      },
      "constrains": "The agent may only emit actions in the registered touch-action vocabulary; arbitrary system or shell access is forbidden by construction.",
      "known_uses": [
        {
          "system": "AppAgent (Tencent)",
          "note": "Touch-action vocabulary plus exploration-phase documentation per app.",
          "status": "available",
          "url": "https://github.com/TencentQQGYLab/AppAgent"
        },
        {
          "system": "Mobile-Agent (Alibaba)",
          "note": "Vision-first smartphone agent.",
          "status": "available",
          "url": "https://github.com/X-PLUG/MobileAgent"
        },
        {
          "system": "Mobile-Agent-v2 (Alibaba)",
          "note": "Multi-agent mobile assistant: planning, decision, reflection.",
          "status": "available"
        },
        {
          "system": "AutoGLM (Zhipu)",
          "note": "Phone agent with decision/grounding split.",
          "status": "available"
        },
        {
          "system": "CogAgent (Tsinghua + Zhipu)",
          "note": "Vision-language model purpose-built for GUI screens.",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "computer-use",
          "relation": "alternative-to",
          "note": "Sibling pattern for desktop UI."
        },
        {
          "pattern": "browser-agent",
          "relation": "alternative-to",
          "note": "Sibling pattern for web UI."
        },
        {
          "pattern": "structured-output",
          "relation": "uses"
        },
        {
          "pattern": "app-exploration-phase",
          "relation": "complements"
        },
        {
          "pattern": "dual-system-gui-agent",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "AppAgent: Multimodal Agents as Smartphone Users",
          "authors": "Zhang et al.",
          "year": 2023,
          "url": "https://arxiv.org/abs/2312.13771"
        },
        {
          "type": "paper",
          "title": "Mobile-Agent: Autonomous Multi-Modal Mobile Device Agent with Visual Perception",
          "authors": "Wang et al.",
          "year": 2024,
          "url": "https://arxiv.org/abs/2401.16158"
        },
        {
          "type": "paper",
          "title": "Mobile-Agent-v2: Mobile Device Operation Assistant with Effective Navigation via Multi-Agent Collaboration",
          "authors": "Wang et al.",
          "year": 2024,
          "url": "https://arxiv.org/abs/2406.01014"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "tool-use",
        "gui-agent",
        "china-origin",
        "mobile"
      ]
    },
    {
      "id": "multilingual-voice-agent",
      "name": "Multilingual Voice Agent Stack",
      "aliases": [
        "Voice-First Multilingual Agent",
        "STT-LLM-TTS Pipeline",
        "Indic Voice Agent"
      ],
      "category": "tool-use-environment",
      "intent": "Compose a voice agent as a tightly co-located pipeline of speech-to-text, language-aware LLM reasoning, and text-to-speech, where one vendor owns all three so language and dialect propagate cleanly across stages.",
      "context": "Markets where the user base speaks one of many regional languages and dialects (e.g. India's 22 scheduled languages), often on telephony channels (calls, WhatsApp), where written input is rare and TTS quality in the target language is non-negotiable.",
      "problem": "Bolting a generic English LLM between a generic STT and TTS loses dialect, code-switching, and nuance; per-component quality drops compound across the chain; latency exceeds the budget for natural turn-taking.",
      "forces": [
        "STT, LLM, TTS each have their own multilingual coverage curve.",
        "Real conversation tolerates ~1s round-trip latency; slower than that breaks the illusion.",
        "Dialect and code-switching are the norm, not the exception.",
        "Telephony imposes 8 kHz audio constraints on top."
      ],
      "solution": "Build the voice agent as a co-located pipeline whose components share language identity and dialect signals end-to-end. Use STT models trained on the target languages and accents. Pass detected language tags as structured metadata to the LLM. Use TTS voices native to the target language; do not translate back to English mid-pipeline. Optimise for streaming at every hop (incremental STT, streaming LLM, streaming TTS) to hit sub-second turn-taking. Treat code-switching as first-class; do not force a single-language assumption.",
      "structure": "Audio in -> streaming STT (per-language) -> language tag + text -> LLM (multilingual) -> streaming TTS (target language) -> Audio out.",
      "consequences": {
        "benefits": [
          "Linguistic fidelity preserved across the pipeline.",
          "Sub-second turn-taking achievable with streaming components.",
          "Single vendor owns the cross-component quality contract."
        ],
        "liabilities": [
          "Language coverage is bounded by the weakest component.",
          "Streaming everywhere is harder than batch.",
          "Telephony audio quality bounds STT accuracy."
        ]
      },
      "constrains": "Language identity and dialect tags must propagate through every hop; mid-pipeline silent translation to a pivot language (e.g. English) is forbidden.",
      "known_uses": [
        {
          "system": "Sarvam Samvaad",
          "note": "Conversational voice/text agents in 11 Indian languages on a single platform; sub-second voice latency target.",
          "status": "available",
          "url": "https://www.sarvam.ai/products/conversational-agents"
        },
        {
          "system": "Krutrim (Ola)",
          "note": "Multilingual Indic voice and text agent stack.",
          "status": "available"
        },
        {
          "system": "Bhashini",
          "note": "Indian government Indic-language services consumed by agent stacks.",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "streaming-typed-events",
          "relation": "uses"
        },
        {
          "pattern": "multi-model-routing",
          "relation": "complements",
          "note": "Per-language model selection."
        },
        {
          "pattern": "structured-output",
          "relation": "uses"
        },
        {
          "pattern": "translation-layer",
          "relation": "complements"
        },
        {
          "pattern": "computer-use",
          "relation": "alternative-to"
        },
        {
          "pattern": "code-switching-aware-agent",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Sarvam — Samvaad: Conversational AI Agents for Indian Languages",
          "url": "https://www.sarvam.ai/products/conversational-agents"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "tool-use",
        "voice",
        "india-origin",
        "multilingual",
        "sarvam"
      ]
    },
    {
      "id": "prompt-caching",
      "name": "Prompt Caching",
      "aliases": [
        "Cache-Aware Prompts",
        "Stable-Prefix Caching"
      ],
      "category": "tool-use-environment",
      "intent": "Order prompts so the unchanging prefix can be cached by the provider, cutting per-call cost and latency.",
      "context": "Agents that call the model many times with mostly-stable system prompts (charters, rules, motivations) and variable suffixes (tick input, user message).",
      "problem": "Re-sending an identical 10k-token prefix on every call wastes compute; vendor caches exist but only if the prefix is byte-stable.",
      "forces": [
        "Cache TTL caps savings (idle agents lose the warm cache) vs always-fresh prefix.",
        "Stability for cache-hit vs flexibility to mutate the prompt.",
        "Engineering rigor on prompt order vs developer ergonomics."
      ],
      "solution": "Place all stable content (system prompt, tool definitions, charter, rules) at the start of the prompt. Place variable content (current state, user message) at the end. Mark the cache breakpoint at the boundary. Audit prompt construction to ensure no accidental prefix mutation.",
      "consequences": {
        "benefits": [
          "70-90% input-cost reduction on long-running agents.",
          "TTFT roughly halves for the cached portion."
        ],
        "liabilities": [
          "Cache misses are silent and expensive.",
          "Prompt assembly code must be disciplined.",
          "Common cache-invalidation footguns: tool-definitions reordering between calls (JSON object iteration, dynamic registration), timestamps/UUIDs/correlation IDs leaking into the cached prefix, and provider-specific breakpoint placement rules (e.g., Anthropic max 4 cache_control breakpoints with 1024-token minimum)."
        ]
      },
      "constrains": "The cached prefix is forbidden from changing call to call; mutation invalidates the cache.",
      "known_uses": [
        {
          "system": "Sparrot",
          "note": "Charter, active rules, motivations cached; input cost cut 70-90% over 1,440 ticks/day.",
          "status": "available"
        },
        {
          "system": "Anthropic prompt caching",
          "status": "available",
          "url": "https://docs.claude.com/en/docs/build-with-claude/prompt-caching"
        },
        {
          "system": "OpenAI prompt caching (automatic)",
          "status": "available",
          "url": "https://platform.openai.com/docs/guides/prompt-caching"
        },
        {
          "system": "Google Gemini context caching",
          "status": "available"
        },
        {
          "system": "Cursor",
          "status": "available",
          "url": "https://cursor.com/"
        }
      ],
      "related": [
        {
          "pattern": "cost-gating",
          "relation": "complements"
        },
        {
          "pattern": "contextual-retrieval",
          "relation": "used-by"
        },
        {
          "pattern": "reasoning-trace-carry-forward",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Anthropic: Prompt caching",
          "url": "https://docs.anthropic.com/claude/docs/prompt-caching"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "cost",
        "cache",
        "performance"
      ]
    },
    {
      "id": "sandbox-isolation",
      "name": "Sandbox Isolation",
      "aliases": [
        "Code Sandbox",
        "Container Isolation",
        "Restricted Execution"
      ],
      "category": "tool-use-environment",
      "intent": "Run agent-emitted code or actions in a contained environment with restricted filesystem, network, and process privileges.",
      "context": "Agents that execute code or operate the filesystem; ungated execution is a security and stability hazard.",
      "problem": "An agent with full host access can damage the host (delete files, exfiltrate data, install malware) intentionally or accidentally.",
      "forces": [
        "Sandbox setup adds latency.",
        "Strict sandboxes block legitimate work.",
        "Escape vulnerabilities are real and ongoing."
      ],
      "solution": "Run code in a container, microVM, WASM runtime, or restricted subprocess with minimal privileges. Filesystem is read-only or scoped to a working directory. Network is allowlisted or blocked. Resource limits cap CPU/memory/time. Persistent state is ephemeral by default.",
      "consequences": {
        "benefits": [
          "Blast radius is contained.",
          "Same sandbox image is reproducible across runs."
        ],
        "liabilities": [
          "Some workflows need network or filesystem access the sandbox forbids.",
          "Sandbox tech (Docker, gVisor, Firecracker, WASM) is its own engineering."
        ]
      },
      "constrains": "Code may only access resources granted by the sandbox policy; outbound network and host filesystem are forbidden by default.",
      "known_uses": [
        {
          "system": "OpenAI Code Interpreter sandbox",
          "status": "available"
        },
        {
          "system": "E2B sandboxes",
          "status": "available"
        },
        {
          "system": "Claude Code's project-level write boundaries",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "code-execution",
          "relation": "complements"
        },
        {
          "pattern": "input-output-guardrails",
          "relation": "composes-with"
        },
        {
          "pattern": "subagent-isolation",
          "relation": "composes-with"
        },
        {
          "pattern": "sandbox-escape-monitoring",
          "relation": "complements"
        },
        {
          "pattern": "todo-list-driven-agent",
          "relation": "used-by"
        },
        {
          "pattern": "wasm-skill-runtime",
          "relation": "generalises"
        },
        {
          "pattern": "code-as-action",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "E2B Sandboxes",
          "url": "https://e2b.dev/docs"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "sandbox",
        "safety",
        "execution"
      ]
    },
    {
      "id": "skill-library",
      "name": "Skill Library",
      "aliases": [
        "Tool-Creating Agent",
        "Meta-Tool Use",
        "Self-Authored Tools"
      ],
      "category": "tool-use-environment",
      "intent": "Let the agent grow its own toolkit by writing reusable skills that subsequent runs can call.",
      "context": "Patterns of tool use repeat; the agent re-derives the same routine across runs and pays for it each time.",
      "problem": "Without a place to crystallise repeated work, every run starts from scratch.",
      "forces": [
        "New skills can be wrong or unsafe.",
        "The library must be loadable without restart in a long-running agent.",
        "Skill discovery (which skill applies?) is itself a retrieval problem."
      ],
      "solution": "A directory (often `skills/*.py` or `skills/*.md`) where the agent can write new modules. A loader (importlib in Python, dynamic import in JS) makes them callable. A critic gates additions. Old skills are versioned, not overwritten silently.",
      "consequences": {
        "benefits": [
          "Compounding capability over time.",
          "Skills are reviewable and removable, unlike weights."
        ],
        "liabilities": [
          "Skill-name collisions and silent shadowing.",
          "Library quality decays without periodic review."
        ]
      },
      "constrains": "New skills enter the library only after passing the critic; they cannot mutate existing skills without quorum.",
      "known_uses": [
        {
          "system": "Sparrot",
          "note": "skills/*.py loaded via importlib on startup and after restart_self.",
          "status": "available"
        },
        {
          "system": "Voyager (Minecraft agent)",
          "note": "Skill library that grows through self-play.",
          "status": "available",
          "url": "https://voyager.minedojo.org/"
        }
      ],
      "related": [
        {
          "pattern": "inner-critic",
          "relation": "uses"
        },
        {
          "pattern": "code-execution",
          "relation": "composes-with"
        },
        {
          "pattern": "exploration-exploitation",
          "relation": "complements"
        },
        {
          "pattern": "agent-skills",
          "relation": "alternative-to"
        },
        {
          "pattern": "app-exploration-phase",
          "relation": "complements"
        },
        {
          "pattern": "wasm-skill-runtime",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Voyager: An Open-Ended Embodied Agent with Large Language Models",
          "authors": "Wang et al.",
          "year": 2023,
          "url": "https://arxiv.org/abs/2305.16291"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "skill-library",
        "self-modification"
      ]
    },
    {
      "id": "tool-discovery",
      "name": "Tool Discovery",
      "aliases": [
        "Capability Advertisement",
        "Dynamic Tool Loading"
      ],
      "category": "tool-use-environment",
      "intent": "Let the agent discover available tools at runtime rather than hardcoding the tool list at agent build time.",
      "context": "Tool palettes evolve; new tools land without redeploying the agent. MCP and similar protocols make discovery feasible.",
      "problem": "Hardcoded tool palettes force a redeploy for every new capability; dynamic environments need dynamic palettes.",
      "forces": [
        "Discovery latency adds to every cold start.",
        "Tool quality varies; not every advertised tool should be exposed.",
        "Versioning of advertised tools."
      ],
      "solution": "On startup (or periodically), the agent queries a tool registry (MCP server, internal directory). The registry returns advertised tools with typed schemas. The agent loads them into its palette. Optionally cached and refreshed.",
      "consequences": {
        "benefits": [
          "Capability expansion without agent redeploy.",
          "Multiple agents can share an evolving tool layer."
        ],
        "liabilities": [
          "Discovery failure modes (registry down).",
          "Trust: should the agent use any advertised tool?"
        ]
      },
      "constrains": "The agent's tool palette at any moment is exactly the discovered set; off-registry tools are forbidden.",
      "known_uses": [
        {
          "system": "MCP server discovery",
          "status": "available"
        },
        {
          "system": "OpenAI plugin manifests (deprecated)",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "mcp",
          "relation": "uses"
        },
        {
          "pattern": "tool-use",
          "relation": "specialises"
        },
        {
          "pattern": "awareness",
          "relation": "complements"
        },
        {
          "pattern": "toolformer",
          "relation": "alternative-to"
        },
        {
          "pattern": "tool-loadout",
          "relation": "complements"
        },
        {
          "pattern": "app-exploration-phase",
          "relation": "generalises"
        },
        {
          "pattern": "wasm-skill-runtime",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Model Context Protocol Specification",
          "url": "https://modelcontextprotocol.io/specification"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "discovery",
        "tool-use",
        "registry"
      ]
    },
    {
      "id": "tool-loadout",
      "name": "Tool Loadout",
      "aliases": [
        "Tool Subset Selection",
        "Per-Task Tool Filtering",
        "Tool Filter",
        "Limit Exposed Tools"
      ],
      "category": "tool-use-environment",
      "intent": "Select a small task-relevant subset of available tools per request rather than exposing the full registry to the model.",
      "context": "Agents with large tool registries (MCP, plugin marketplaces, internal tool catalogs) where exposing all tools degrades selection accuracy.",
      "problem": "Function-calling accuracy degrades past ~20 tools; a 100-tool registry is unusable without per-request filtering.",
      "forces": [
        "Filter quality (does the agent get the right tools?).",
        "Filter cost (one extra model call per request, or rule-based).",
        "Tool-discovery latency on each request."
      ],
      "solution": "Before the main loop, classify the request and select N relevant tools (rule-based: by routed lane; or model-based: a quick classifier picks tools). Expose only the selected subset to the agent's main inference call. Tools outside the subset are unavailable for this request.",
      "consequences": {
        "benefits": [
          "Function-calling accuracy holds up at scale.",
          "Token budget for tool definitions stays manageable."
        ],
        "liabilities": [
          "Filter mistakes hide capability the agent could have used.",
          "Filtering adds latency."
        ]
      },
      "constrains": "The agent's tool palette is exactly the filtered subset for the current request; tools outside the subset cannot be invoked.",
      "known_uses": [
        {
          "system": "Claude Code per-task allowed_tools",
          "status": "available"
        },
        {
          "system": "Cursor contextual tool selection",
          "status": "available"
        },
        {
          "system": "MCP server filtering",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "tool-discovery",
          "relation": "complements"
        },
        {
          "pattern": "routing",
          "relation": "uses"
        },
        {
          "pattern": "tool-explosion",
          "relation": "conflicts-with"
        },
        {
          "pattern": "agent-computer-interface",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Tool use with Claude",
          "year": 2025,
          "url": "https://docs.claude.com/en/docs/agents-and-tools/tool-use/overview"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "tool-use",
        "loadout",
        "filtering"
      ]
    },
    {
      "id": "tool-result-caching",
      "name": "Tool Result Caching",
      "aliases": [
        "Memoised Tools",
        "Idempotent Cache"
      ],
      "category": "tool-use-environment",
      "intent": "Cache the result of expensive deterministic tool calls keyed by their arguments so repeat calls within a session return immediately.",
      "context": "Agents that re-call the same tool with the same arguments multiple times within one task (lookups, computations, immutable reads).",
      "problem": "Repeat calls on the same arguments waste latency and money; the tool layer often has no awareness of caller behaviour.",
      "forces": [
        "Cache invalidation: when does the underlying data change?",
        "Per-user vs global caches differ on isolation guarantees.",
        "Cache hits hide tool latency the agent might benefit from learning about."
      ],
      "solution": "Wrap deterministic tools in a cache layered on `(tool_name, normalised_args)`. Set TTLs by tool type. On cache hit, return immediately without invoking the underlying tool. Per-user scoping for tools that read user data; global for read-only public data. Cache keys must include the auth subject (caller identity), not just args; args-only keys leak data when callers change.",
      "consequences": {
        "benefits": [
          "Latency drops on repeat calls.",
          "Cost reduction for paid APIs."
        ],
        "liabilities": [
          "Stale cache hits when underlying data changes.",
          "Non-deterministic tools cannot be cached safely."
        ]
      },
      "constrains": "Only tools declared deterministic may be cached; nondeterministic tools bypass the cache.",
      "known_uses": [
        {
          "system": "Most production agent platforms",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "tool-use",
          "relation": "specialises"
        },
        {
          "pattern": "session-isolation",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "Prompt caching",
          "year": 2025,
          "url": "https://docs.claude.com/en/docs/build-with-claude/prompt-caching"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "cache",
        "tool-use",
        "performance"
      ]
    },
    {
      "id": "tool-use",
      "name": "Tool Use",
      "aliases": [
        "Function Calling",
        "Tool Calling",
        "Action Use"
      ],
      "category": "tool-use-environment",
      "intent": "Let the LLM produce typed calls against an external toolkit instead of producing free-form text the surrounding system has to parse.",
      "context": "An agent must affect the outside world (read a record, edit state, render an artefact) and the model alone cannot do this safely or correctly.",
      "problem": "Free-form text is unparseable at the boundary; the model invents fields, mis-spells operations, or returns prose where the system needs structure.",
      "forces": [
        "The model is good at intent, weak at typed structure.",
        "The host system needs deterministic operations to act.",
        "Schema rigidity reduces the model's freedom; too much rigidity loses recall."
      ],
      "solution": "Define a typed tool palette. The model emits tool calls conforming to a JSON Schema; the host validates and executes; results return as structured tool results. The agent becomes a thin client of a deterministic toolkit.",
      "structure": "Model -> tool_call(name, args:JSON) -> Validator -> Executor -> tool_result(JSON) -> Model.",
      "consequences": {
        "benefits": [
          "Invalid calls are rejected at the schema layer rather than as runtime errors.",
          "The toolkit, not the model, is the locus of capability and audit.",
          "Tools can be tested and versioned independently of prompts."
        ],
        "liabilities": [
          "Tool palette design becomes the bottleneck; bad tools propagate to every call site.",
          "Models with weaker function-calling support drift; schema strictness must be tuned per model."
        ]
      },
      "constrains": "The model cannot affect state except through a registered tool with a typed signature.",
      "known_uses": [
        {
          "system": "ConvArch",
          "note": "Architecture-edit toolkit (add_node, connect, update_attribute) backed by JSON in PostgreSQL.",
          "status": "available"
        },
        {
          "system": "Bobbin (Stash2Go)",
          "note": "Per-screen api_tools and action_tools registered in a LangGraph ToolNode.",
          "status": "available"
        },
        {
          "system": "OpenAI Function Calling",
          "status": "available"
        },
        {
          "system": "Anthropic Tool Use",
          "status": "available"
        },
        {
          "system": "Claude Code",
          "status": "available",
          "url": "https://docs.claude.com/en/docs/claude-code/overview"
        },
        {
          "system": "Cursor",
          "status": "available",
          "url": "https://cursor.com/"
        },
        {
          "system": "Devin",
          "status": "available",
          "url": "https://devin.ai/"
        },
        {
          "system": "Manus",
          "status": "available",
          "url": "https://manus.im/"
        }
      ],
      "related": [
        {
          "pattern": "structured-output",
          "relation": "uses"
        },
        {
          "pattern": "react",
          "relation": "used-by"
        },
        {
          "pattern": "mcp",
          "relation": "specialises",
          "note": "MCP standardises the tool protocol across vendors."
        },
        {
          "pattern": "agentic-rag",
          "relation": "used-by"
        },
        {
          "pattern": "memgpt-paging",
          "relation": "used-by"
        },
        {
          "pattern": "browser-agent",
          "relation": "generalises"
        },
        {
          "pattern": "hallucinated-tools",
          "relation": "alternative-to"
        },
        {
          "pattern": "naive-rag-first",
          "relation": "alternative-to"
        },
        {
          "pattern": "code-execution",
          "relation": "generalises"
        },
        {
          "pattern": "tool-result-caching",
          "relation": "generalises"
        },
        {
          "pattern": "schema-free-output",
          "relation": "alternative-to"
        },
        {
          "pattern": "awareness",
          "relation": "complements"
        },
        {
          "pattern": "tool-discovery",
          "relation": "generalises"
        },
        {
          "pattern": "toolformer",
          "relation": "generalises"
        },
        {
          "pattern": "critic",
          "relation": "used-by"
        },
        {
          "pattern": "parallel-tool-calls",
          "relation": "used-by"
        },
        {
          "pattern": "agent-computer-interface",
          "relation": "generalises"
        },
        {
          "pattern": "code-as-action",
          "relation": "alternative-to"
        },
        {
          "pattern": "agent-as-tool-embedding",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "doc",
          "title": "OpenAI: Function calling",
          "url": "https://platform.openai.com/docs/guides/function-calling"
        },
        {
          "type": "doc",
          "title": "Anthropic: Tool use",
          "url": "https://docs.anthropic.com/claude/docs/tool-use"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "tool-use",
        "function-calling",
        "boundary"
      ],
      "applicability": {
        "use_when": [
          "The model must affect external state or query authoritative systems.",
          "Operations are typed and a JSON Schema can describe them.",
          "Audit and validation need to live outside the model."
        ],
        "do_not_use_when": [
          "The deliverable is free prose; structuring it as a tool call is overhead.",
          "The underlying API has no schema and cannot be wrapped cheaply.",
          "Calls are extremely high-frequency and per-call validation is the bottleneck."
        ]
      },
      "diagram": {
        "type": "sequence",
        "mermaid": "sequenceDiagram\n  participant Model\n  participant Validator\n  participant Executor\n  Model->>Validator: tool_call(name, args:JSON)\n  Validator->>Validator: validate against JSON Schema\n  alt valid\n    Validator->>Executor: dispatch\n    Executor-->>Model: tool_result(JSON)\n  else invalid\n    Validator-->>Model: typed error\n  end",
        "caption": "Tool Use makes the validator the boundary: invalid calls are rejected at the schema layer before any executor runs."
      },
      "example_scenario": "A customer-support agent receives 'cancel my order #4471.' Instead of writing free-form text the surrounding code has to parse, it emits a structured call: cancel_order({order_id: '4471'}). A validator checks the call against the API's schema; the executor runs it; the agent gets back {status: 'cancelled', refund_amount: 49.99}. The model never has to guess at field names or formatting.",
      "variants": [
        {
          "name": "Provider native function-calling",
          "summary": "The provider's API exposes a typed `tools` parameter; the model emits a `tool_call` message with name and JSON arguments. Validation happens server-side at the provider boundary.",
          "distinguishing_factor": "vendor SDK contract",
          "when_to_use": "Default for hosted models that support it (OpenAI, Anthropic, Google, Cohere). Lowest implementation cost."
        },
        {
          "name": "JSON-mode tool use",
          "summary": "The model is instructed to emit a JSON object whose schema includes a `tool` field; the host parses and validates against the local palette. No vendor-side tool API.",
          "distinguishing_factor": "schema enforced at host, not provider",
          "when_to_use": "Self-hosted or older models that lack native function-calling but reliably produce JSON."
        },
        {
          "name": "Code-as-action tool use",
          "summary": "The model emits a code snippet in a sandboxed interpreter; tool composition becomes function nesting and control flow. Each tool is a function the snippet may call.",
          "distinguishing_factor": "code is the call-encoding",
          "when_to_use": "Tasks that benefit from composition or branching across multiple tool calls per step.",
          "see_also": "code-as-action"
        }
      ]
    },
    {
      "id": "toolformer",
      "name": "Toolformer",
      "aliases": [
        "Self-Supervised Tool Learning"
      ],
      "category": "tool-use-environment",
      "intent": "Train the model to learn when and how to call tools through self-supervised data, without human annotation.",
      "context": "Tool use deployed at scale where prompt-based function-calling underperforms and human-labelled tool-use traces are unavailable.",
      "problem": "Prompt-based tool calling is brittle and capability-limited; supervised fine-tuning needs costly human-labelled traces.",
      "forces": [
        "Self-supervised data must distinguish helpful from unhelpful tool calls.",
        "The training-time tool surface diverges from runtime over time.",
        "Filtering noise dominates training cost."
      ],
      "solution": "Generate candidate tool calls during training. Insert each into a context. Score whether the resulting completion is improved (perplexity drop on the gold continuation). Keep helpful insertions as training data. Fine-tune the model to emit tool calls in those positions.",
      "consequences": {
        "benefits": [
          "No human-labelled tool-call data required.",
          "Model learns when not to call tools, not just when to."
        ],
        "liabilities": [
          "Training pipeline complexity.",
          "Tool surface drift between train and serve.",
          "Historical: superseded by RLHF-tuned tool-use in frontier models; not productionised at scale."
        ]
      },
      "constrains": "Tool use is bound to positions where self-supervised filtering judged the call helpful; ungrounded tool calls are not reinforced.",
      "known_uses": [
        {
          "system": "Toolformer paper baseline",
          "status": "available"
        },
        {
          "system": "Influences modern instruction-tuning of frontier models",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "tool-use",
          "relation": "specialises"
        },
        {
          "pattern": "agent-skills",
          "relation": "complements"
        },
        {
          "pattern": "tool-discovery",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Toolformer: Language Models Can Teach Themselves to Use Tools",
          "authors": "Schick, Dwivedi-Yu, Dessì, Raileanu, Lomeli, Zettlemoyer, Cancedda, Scialom",
          "year": 2023,
          "url": "https://arxiv.org/abs/2302.04761"
        }
      ],
      "status_in_practice": "deprecated",
      "tags": [
        "tool-use",
        "self-supervised",
        "training"
      ]
    },
    {
      "id": "translation-layer",
      "name": "Translation Layer",
      "aliases": [
        "Anti-Corruption Layer",
        "Adapter Pattern (Agentic)",
        "API Façade"
      ],
      "category": "tool-use-environment",
      "intent": "Insert a typed boundary between the agent's clean domain model and a messy or legacy external API.",
      "context": "The agent wants to reason in one shape (the domain it cares about); the data lives in another (vendor schemas, legacy APIs, third-party formats).",
      "problem": "Letting vendor shapes leak into the agent's context wastes tokens and ties the agent's reasoning to upstream churn.",
      "forces": [
        "The legacy shape is authoritative for storage but bad for reasoning.",
        "Translation must be reversible to write back without data loss.",
        "Round-tripping costs latency and complexity."
      ],
      "solution": "A translation module sits between the agent's tool palette and the upstream API. Inbound: vendor JSON is mapped into the domain shape. Outbound: domain edits become signed vendor calls. The agent sees one consistent shape regardless of how many backends sit behind it.",
      "consequences": {
        "benefits": [
          "Multiple backends can be swapped behind one tool surface.",
          "Domain evolution is decoupled from vendor schema changes."
        ],
        "liabilities": [
          "Mapping logic is its own maintenance burden.",
          "Lossy mappings silently degrade write fidelity if not flagged."
        ]
      },
      "constrains": "Tools see only the domain shape; the vendor shape never reaches the model.",
      "known_uses": [
        {
          "system": "Weft",
          "note": "WEFT JSON ↔ Ravelry OAuth-signed REST.",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "polymorphic-record",
          "relation": "complements"
        },
        {
          "pattern": "mcp",
          "relation": "composes-with"
        },
        {
          "pattern": "schema-extensibility",
          "relation": "complements"
        },
        {
          "pattern": "multilingual-voice-agent",
          "relation": "complements"
        },
        {
          "pattern": "code-switching-aware-agent",
          "relation": "alternative-to"
        }
      ],
      "references": [
        {
          "type": "book",
          "title": "Domain-Driven Design (Anti-Corruption Layer)",
          "authors": "Eric Evans",
          "year": 2003,
          "url": "https://www.domainlanguage.com/ddd/"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "translation",
        "anti-corruption",
        "ddd"
      ]
    },
    {
      "id": "wasm-skill-runtime",
      "name": "WebAssembly Skill Runtime",
      "aliases": [
        "Wasm Cognitive Skills",
        "Polyglot Skill Sandbox",
        "Capability-Sandboxed Tool Plane"
      ],
      "category": "tool-use-environment",
      "intent": "Package each agent skill as a WebAssembly module with a capability manifest, and run it inside a Wasm runtime that enforces those capabilities, so untrusted skills cannot weaken the host's sandbox.",
      "context": "An enterprise agent platform that must accept user-authored or partner-authored skills written in different languages (Rust, Python compiled, TypeScript, Go) while enforcing per-skill resource and IO limits.",
      "problem": "Plain-process tools share the host's privileges, language-specific sandboxes (e.g. Python sandbox) are escape-prone, and per-skill containers are heavy; the agent can't safely run third-party skills at request rate.",
      "forces": [
        "Skills authored by partners cannot be trusted with host privileges.",
        "Per-request container start-up is too slow and too expensive.",
        "Polyglot authoring is a real requirement; Python-only is restrictive.",
        "Capability declarations have to be checkable, not advisory."
      ],
      "solution": "Define a Wasm Component Model interface for skills: each skill compiles to a Wasm module and ships with a manifest declaring (filesystem paths, network hosts, env vars, syscalls) it needs. The host runtime instantiates a fresh sandbox per call with only those capabilities. Skills can be authored in any language compiling to Wasm. The host treats the manifest as the contract; missing-capability calls fail at the boundary.",
      "structure": "Host runtime { capability gate } -> Wasm sandbox(skill_module, manifest) -> deterministic IO -> result.",
      "consequences": {
        "benefits": [
          "Polyglot skill ecosystem with one runtime.",
          "Strong capability isolation; manifest is the audit surface.",
          "Wasm cold-start is fast enough to run per request."
        ],
        "liabilities": [
          "Wasm ecosystem maturity per language varies (Rust strong, Python heavier).",
          "Capability manifest design is the real engineering problem.",
          "Some workloads (GPU, large data) don't fit Wasm well."
        ]
      },
      "constrains": "A skill may not exercise any capability not declared in its manifest; manifest drift is detected at load time.",
      "known_uses": [
        {
          "system": "Aleph Alpha PhariaEngine",
          "note": "Cognitive Business Units (Skills) compile to Wasm and run inside the engine's sandboxed runtime.",
          "status": "available",
          "url": "https://github.com/Aleph-Alpha/pharia-engine"
        }
      ],
      "related": [
        {
          "pattern": "sandbox-isolation",
          "relation": "specialises"
        },
        {
          "pattern": "skill-library",
          "relation": "complements"
        },
        {
          "pattern": "tool-discovery",
          "relation": "complements"
        },
        {
          "pattern": "secrets-handling",
          "relation": "complements"
        },
        {
          "pattern": "code-execution",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "repo",
          "title": "Aleph-Alpha/pharia-engine — Serverless AI powered by WebAssembly",
          "url": "https://github.com/Aleph-Alpha/pharia-engine"
        }
      ],
      "status_in_practice": "experimental",
      "tags": [
        "tool-use",
        "sandbox",
        "germany-origin",
        "wasm",
        "aleph-alpha"
      ]
    },
    {
      "id": "best-of-n",
      "name": "Best-of-N Sampling",
      "aliases": [
        "BoN",
        "Reranking",
        "BoNBoN Alignment"
      ],
      "category": "verification-reflection",
      "intent": "Sample N candidate outputs and select the highest-ranked by a reward model or scorer.",
      "context": "Output quality matters and a reward model exists (or can be approximated) to score candidates.",
      "problem": "A single sample may be acceptable; the best of N is often markedly better at modest cost increase.",
      "forces": [
        "N candidates cost N inferences.",
        "Reward-model quality bounds achievable improvement.",
        "Diversity across candidates is needed; identical samples defeat the pattern."
      ],
      "solution": "Generate N candidates with non-zero temperature. Score each with a reward model or rule-based scorer. Return the top-1 (or top-K). BoNBoN alignment fine-tunes a model to mimic the BoN distribution directly, eliminating per-inference sampling cost.",
      "consequences": {
        "benefits": [
          "Quality lift without retraining the base model.",
          "Trade-off knob: increase N for more quality, fewer for less cost."
        ],
        "liabilities": [
          "Cost scales with N.",
          "Reward hacking: candidates can game a flawed scorer."
        ]
      },
      "constrains": "The chosen output must be from the candidate set; no synthesis across candidates.",
      "known_uses": [
        {
          "system": "RLHF training pipelines",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "self-consistency",
          "relation": "alternative-to"
        },
        {
          "pattern": "evaluator-optimizer",
          "relation": "alternative-to"
        },
        {
          "pattern": "parallelization",
          "relation": "specialises"
        },
        {
          "pattern": "test-time-compute-scaling",
          "relation": "specialises"
        },
        {
          "pattern": "process-reward-model",
          "relation": "used-by"
        },
        {
          "pattern": "rest-em",
          "relation": "used-by"
        },
        {
          "pattern": "automatic-workflow-search",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "BoNBoN Alignment for Large Language Models and the Sweetness of Best-of-n Sampling",
          "authors": "Gui, Gârbacea, Veitch",
          "year": 2024,
          "url": "https://arxiv.org/abs/2406.00832"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "sampling",
        "reward",
        "alignment"
      ],
      "variants": [
        {
          "name": "Independent samples",
          "summary": "Generate N candidates with non-zero temperature; pick the highest-scoring by a reward model.",
          "distinguishing_factor": "no inter-sample dependency",
          "when_to_use": "Default. Trivially parallel; simplest implementation."
        },
        {
          "name": "Self-consistency vote",
          "summary": "Generate N candidates; aggregate by majority vote on the final answer rather than a reward score.",
          "distinguishing_factor": "majority vote, no reward model",
          "when_to_use": "No reward model is available but the task has a discrete answer (math, multiple choice).",
          "see_also": "self-consistency"
        },
        {
          "name": "BoNBoN distilled",
          "summary": "Fine-tune a model to mimic the BoN distribution directly, eliminating the per-inference sampling cost.",
          "distinguishing_factor": "amortised by training",
          "when_to_use": "BoN runtime cost is the bottleneck and you control the model."
        }
      ]
    },
    {
      "id": "confidence-reporting",
      "name": "Confidence Reporting",
      "aliases": [
        "Uncertainty Surfacing",
        "Calibrated Output"
      ],
      "category": "verification-reflection",
      "intent": "Surface the agent's uncertainty about its answer alongside the answer itself.",
      "context": "Decisions downstream of the agent depend on whether the answer is reliable; users and systems both benefit from a confidence signal.",
      "problem": "Models output answers with uniform tone regardless of internal certainty; downstream code cannot distinguish 'I know' from 'I am guessing'.",
      "forces": [
        "Confidence signals are themselves miscalibrated by the model.",
        "Surfacing uncertainty erodes user trust if overdone.",
        "Sample-based confidence (self-consistency) costs N calls."
      ],
      "solution": "Produce a confidence label (high/medium/low or numeric) alongside each answer. Derive from sample variance (self-consistency), evaluator score, retrieval recall, or rubric score. Render in UI; route low-confidence to fallback or human review.",
      "consequences": {
        "benefits": [
          "Downstream code can branch on confidence.",
          "Users learn when to verify."
        ],
        "liabilities": [
          "Calibration is empirical and drifts.",
          "False confidence remains the failure mode."
        ]
      },
      "constrains": "Outputs without a confidence label are not consumable by confidence-aware downstream code.",
      "known_uses": [
        {
          "system": "Sparrot empirical-introspection variance signal",
          "status": "available"
        },
        {
          "system": "OpenAI logprobs-derived confidence",
          "status": "available",
          "url": "https://platform.openai.com/docs/api-reference/chat"
        }
      ],
      "related": [
        {
          "pattern": "self-consistency",
          "relation": "uses"
        },
        {
          "pattern": "disambiguation",
          "relation": "complements"
        },
        {
          "pattern": "fallback-chain",
          "relation": "complements"
        },
        {
          "pattern": "attention-manipulation-explainability",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Language Models (Mostly) Know What They Know",
          "authors": "Kadavath et al.",
          "year": 2022,
          "url": "https://arxiv.org/abs/2207.05221"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "uncertainty",
        "calibration"
      ]
    },
    {
      "id": "critic",
      "name": "Tool-Augmented Self-Correction",
      "aliases": [
        "Tool-Interactive Self-Correction",
        "CRITIC"
      ],
      "category": "verification-reflection",
      "intent": "Self-correct LLM outputs by interactively critiquing them with external tools (search, code execution, calculator).",
      "context": "Generation tasks where errors can be detected and corrected by grounded checks (factual claims by search, code by execution, math by calculator).",
      "problem": "Self-critique without external tools recycles the model's blind spots; tools provide grounded ground-truth signals.",
      "forces": [
        "Tool selection per critique step.",
        "Critique cost adds to generation cost.",
        "Tools may themselves be wrong or limited."
      ],
      "solution": "After draft generation, the model emits a critique that names suspected errors and queries tools to verify. Tool results inform the revised output. Iterate until tools find no more issues or budget exhausted.",
      "consequences": {
        "benefits": [
          "Grounded self-correction beats ungrounded reflection.",
          "Tool invocations during critique are auditable."
        ],
        "liabilities": [
          "Latency and cost per turn.",
          "Tool selection itself is a learning problem."
        ]
      },
      "constrains": "The critic may revise outputs only when an external tool corroborates a defect; ungrounded edits are forbidden.",
      "known_uses": [
        {
          "system": "CRITIC paper baselines",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "reflection",
          "relation": "specialises"
        },
        {
          "pattern": "chain-of-verification",
          "relation": "alternative-to"
        },
        {
          "pattern": "tool-use",
          "relation": "uses"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "CRITIC: Large Language Models Can Self-Correct with Tool-Interactive Critiquing",
          "authors": "Gou, Shao, Gong, Shen, Yang, Duan, Chen",
          "year": 2023,
          "url": "https://arxiv.org/abs/2305.11738"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "reflection",
        "tool-grounded",
        "self-correction"
      ]
    },
    {
      "id": "deterministic-llm-sandwich",
      "name": "Deterministic-LLM Sandwich",
      "aliases": [
        "Verification-and-Grounding Loop",
        "Bracketed LLM Call",
        "Verify LLM Output",
        "Pre/Post Validation"
      ],
      "category": "verification-reflection",
      "intent": "Bracket every LLM call with deterministic checks on both sides.",
      "context": "A correctness boundary where wrong outputs cause real damage (a wrong stitch count, a malformed migration, a mis-priced order).",
      "problem": "Trusting the LLM's output unconditionally accepts hallucination at the most expensive moment; banning the LLM entirely loses its strengths.",
      "forces": [
        "Bracketing adds latency per call.",
        "Pre-checks must be cheap to be worth running.",
        "Post-checks must catch what the model gets wrong, not what is merely surprising."
      ],
      "solution": "Three layers. Pre: deterministic check decides whether the LLM should run at all (e.g. AST parse must succeed). LLM: produces a candidate output with structured-output schema and frozen rubric. Post: deterministic re-validation (parse, type-check, run tests). If post fails, the original is returned unchanged.",
      "structure": "Pre(input) -> {pass, fail} ; if pass: LLM(input) -> candidate ; Post(candidate) -> {accept, reject}.",
      "consequences": {
        "benefits": [
          "Confidence at the correctness boundary; the model cannot land an unsafe artefact.",
          "Bug fixes go into the deterministic layer where they are testable."
        ],
        "liabilities": [
          "Building the deterministic checks is itself the bulk of the work.",
          "Over-strict post-checks reject valid outputs."
        ]
      },
      "constrains": "An LLM-produced artefact lands only after passing the post-check; otherwise the prior state is preserved.",
      "known_uses": [
        {
          "system": "Knitting-DSL Pipeline (Stash2Go)",
          "note": "deterministicReview.js -> scopedLlmFixer.js -> parse and revalidate.",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "frozen-rubric-reflection",
          "relation": "uses"
        },
        {
          "pattern": "structured-output",
          "relation": "uses"
        },
        {
          "pattern": "code-execution",
          "relation": "composes-with",
          "note": "Post-check often runs code (parse/test) to validate output."
        },
        {
          "pattern": "frozen-rubric-reflection",
          "relation": "composes-with"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "Marco Nissen, Working with the models",
          "year": 2026,
          "url": "https://substack.com/@marconissen"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "verification",
        "boundary",
        "sandwich"
      ]
    },
    {
      "id": "evaluator-optimizer",
      "name": "Evaluator-Optimizer",
      "aliases": [
        "Generator-Critic Loop",
        "LLM-as-Judge Refinement"
      ],
      "category": "verification-reflection",
      "intent": "One LLM generates; another evaluates and feeds back; loop until criteria are met.",
      "context": "Tasks with measurable evaluation criteria where iterative refinement beats single-pass generation.",
      "problem": "Single-shot generation tops out below what an evaluator-corrected loop achieves.",
      "forces": [
        "The evaluator must be calibrated; a bad judge teaches bad lessons.",
        "Loop budget caps cost.",
        "Generator and evaluator can collude (especially if same model, same prompt family)."
      ],
      "solution": "Generator produces a candidate. Evaluator scores it against criteria with feedback. Generator revises with the feedback. Loop until evaluator passes or max iterations.",
      "consequences": {
        "benefits": [
          "Quality climbs predictably with iterations.",
          "Evaluator can be reused as an offline regression suite."
        ],
        "liabilities": [
          "Cost = (generator + evaluator) x iterations.",
          "Convergence is not guaranteed."
        ]
      },
      "constrains": "Generator outputs are accepted only after the evaluator passes; an unbounded loop is forbidden by the iteration cap.",
      "known_uses": [
        {
          "system": "Anthropic Building Effective Agents (Workflow #5)",
          "status": "available"
        },
        {
          "system": "Cursor auto-fix loops",
          "status": "available"
        },
        {
          "system": "Cline auto-iterate",
          "status": "available"
        },
        {
          "system": "Aider lint-then-fix loop",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "reflection",
          "relation": "generalises"
        },
        {
          "pattern": "best-of-n",
          "relation": "alternative-to"
        },
        {
          "pattern": "planner-executor-observer",
          "relation": "composes-with"
        },
        {
          "pattern": "llm-as-judge",
          "relation": "uses"
        },
        {
          "pattern": "same-model-self-critique",
          "relation": "conflicts-with"
        },
        {
          "pattern": "self-refine",
          "relation": "alternative-to"
        },
        {
          "pattern": "crag",
          "relation": "used-by"
        },
        {
          "pattern": "dynamic-expert-recruitment",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "Anthropic: Building Effective Agents",
          "year": 2024,
          "url": "https://www.anthropic.com/research/building-effective-agents"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "evaluator",
        "loop",
        "judge"
      ]
    },
    {
      "id": "frozen-rubric-reflection",
      "name": "Frozen Rubric Reflection",
      "aliases": [
        "Scoped Self-Review",
        "Closed-Set Critic"
      ],
      "category": "verification-reflection",
      "intent": "Constrain reflection to a fixed, hand-authored rubric of criteria so the reviewer cannot invent new ones each run.",
      "context": "Open-ended reflection drifts pattern over pattern; the reviewer's criteria change with the wind, making outputs hard to compare across runs or users.",
      "problem": "Free-form 'review the prior output' prompts produce inconsistent reviews because the model invents categories on each call.",
      "forces": [
        "Authoring a good rubric is non-trivial up-front work.",
        "Rubric drift over time is a separate problem from per-call drift.",
        "Some defects fall outside the rubric and go unflagged."
      ],
      "solution": "A fixed rubric file (or schema) lists exactly the categories the reviewer may flag. The reviewer prompt includes the rubric and a JSON Schema enforcing it. Temperature is zero. Output validates against the schema; new finding categories are rejected.",
      "consequences": {
        "benefits": [
          "Consistent reviews across runs and users.",
          "Rubric is the single load-bearing artefact; iteration is in one place."
        ],
        "liabilities": [
          "Hard ceiling on what the reviewer can catch.",
          "Rubric authorship is its own engineering discipline."
        ]
      },
      "constrains": "The reviewer cannot output finding categories outside the rubric; the JSON schema rejects them.",
      "known_uses": [
        {
          "system": "Knitting-DSL Pipeline (Stash2Go)",
          "note": "Six-item rubric in scopedLlmReviewer.js: duplicate NOTEs, finishing order, construction voice, prose omissions, prose inventions, pattern name sanity.",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "reflection",
          "relation": "specialises"
        },
        {
          "pattern": "structured-output",
          "relation": "uses"
        },
        {
          "pattern": "deterministic-llm-sandwich",
          "relation": "composes-with"
        },
        {
          "pattern": "deterministic-llm-sandwich",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "Marco Nissen, Working with the models",
          "year": 2026,
          "url": "https://substack.com/@marconissen"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "reflection",
        "rubric",
        "structured-output"
      ],
      "applicability": {
        "use_when": [
          "Review criteria should be stable across runs so verdicts compare.",
          "Auditors need an explicit list of categories the model checked.",
          "Reflection drift across calls is producing inconsistent reviews."
        ],
        "do_not_use_when": [
          "Defects of interest do not fit any predefined rubric category.",
          "The domain shifts faster than the rubric can be re-authored.",
          "Exploratory critique is the goal; a rubric narrows it too much."
        ]
      },
      "diagram": {
        "type": "sequence",
        "mermaid": "sequenceDiagram\n  participant Author\n  participant Reviewer\n  participant Rubric\n  Author->>Reviewer: draft\n  Rubric-->>Reviewer: fixed criteria (read-only)\n  Reviewer->>Reviewer: score draft against each criterion\n  Reviewer-->>Author: structured feedback per criterion",
        "caption": "Frozen Rubric Reflection makes the rubric read-only at the tool layer; the reviewer cannot invent new criteria mid-call."
      },
      "example_scenario": "A code-review agent reviews every pull request against a fixed checklist: tests added? naming consistent? error handling? security risk? Without the fixed list, the reviewer would invent new criteria each call and reviews would not be comparable across PRs. Reviewers can only score against the listed criteria — they cannot make new ones up mid-review.",
      "variants": [
        {
          "name": "Yes/no per criterion",
          "summary": "Reviewer answers a fixed list of yes/no questions; the verdict is pass/fail by configurable threshold.",
          "distinguishing_factor": "binary criteria",
          "when_to_use": "Criteria are clearly defined and edge cases are rare."
        },
        {
          "name": "Likert-scored per criterion",
          "summary": "Reviewer rates each criterion on a 1-5 scale; verdict is computed from weighted scores.",
          "distinguishing_factor": "graded scoring",
          "when_to_use": "Quality is a continuum and a binary pass/fail loses signal."
        },
        {
          "name": "Versioned rubric",
          "summary": "Rubric carries a version id; reviews record which version they ran against, so trend analysis controls for rubric changes.",
          "distinguishing_factor": "rubric is itself versioned",
          "when_to_use": "The rubric will evolve and historical comparisons matter."
        }
      ]
    },
    {
      "id": "inner-critic",
      "name": "Self-Modification Diff Gate",
      "aliases": [
        "Diff Reviewer",
        "Self-Mod Gate",
        "Inner Critic"
      ],
      "category": "verification-reflection",
      "intent": "Gate the agent's edits to its own code or rules through a separate critic persona that reviews the diff before it lands.",
      "context": "A self-modifying agent can edit its own source, prompts, or rules; without a gate, recursive self-editing can drift into incoherence or unsafe behaviour.",
      "problem": "Self-edits applied directly bypass review; the agent can corrupt its own future behaviour irreversibly.",
      "forces": [
        "Critic and modifier may share blind spots if they share a model.",
        "Strict critics block legitimate improvements.",
        "Lax critics defeat the gate."
      ],
      "solution": "Every self-edit goes through a critic step: a separate prompt (and optionally a separate model) reviews the proposed diff against criteria (safety, charter compliance, test passing). Edits land only on critic approval. Rejected edits are logged for later human review. The critic must run on a frozen checkpoint (separate process or sandbox) so a malformed self-edit cannot corrupt the critic before it votes; recursion guard is required when the critic itself is in the edit scope.",
      "consequences": {
        "benefits": [
          "Recursive self-improvement becomes survivable in practice.",
          "Audit trail of what was rejected is itself learning signal."
        ],
        "liabilities": [
          "Critic prompt is a load-bearing artefact; bad critics are worse than no critic.",
          "Two-step pipeline doubles per-edit latency."
        ]
      },
      "constrains": "No write to self-modifiable files succeeds without a passing critic review.",
      "known_uses": [
        {
          "system": "Sparrot",
          "note": "src/sparrot/*.py edits gated by critic.py.",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "skill-library",
          "relation": "used-by"
        },
        {
          "pattern": "constitutional-charter",
          "relation": "uses"
        },
        {
          "pattern": "inner-committee",
          "relation": "generalises"
        },
        {
          "pattern": "quorum-on-mutation",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "blog",
          "title": "Marco Nissen, Working with the models",
          "year": 2026,
          "url": "https://substack.com/@marconissen"
        }
      ],
      "status_in_practice": "experimental",
      "tags": [
        "self-modification",
        "critic",
        "safety"
      ]
    },
    {
      "id": "process-reward-model",
      "name": "Process Reward Model",
      "aliases": [
        "PRM",
        "Step-Level Verifier"
      ],
      "category": "verification-reflection",
      "intent": "Train a verifier that scores each reasoning step rather than only the final answer.",
      "context": "Multi-step reasoning tasks where final-answer scoring conflates good reasoning with lucky right answers.",
      "problem": "Outcome reward models reinforce shortcut reasoning that lands on the right answer through wrong steps; step-level supervision gives finer-grained signal.",
      "forces": [
        "Step-level annotation is expensive (humans must label each step).",
        "Step boundaries vary across tasks.",
        "PRM and outcome reward sometimes conflict on what counts as 'correct'."
      ],
      "solution": "Collect step-level labels (correct / neutral / incorrect / hallucination) for chain-of-thought traces. Train a classifier to predict step labels. At inference, score every step; reject candidates whose intermediate steps have low scores. Powers test-time search and fine-tuning of the generator.",
      "consequences": {
        "benefits": [
          "Catches wrong-reasoning-right-answer cases.",
          "Enables tree-search and best-of-N with finer signal."
        ],
        "liabilities": [
          "Annotation cost.",
          "PRM calibration shifts with model capability."
        ]
      },
      "constrains": "Final answers are accepted only when intermediate steps pass the PRM threshold.",
      "known_uses": [
        {
          "system": "OpenAI 'Let's Verify Step by Step' baseline",
          "status": "available"
        },
        {
          "system": "DeepMind reasoning evaluators",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "best-of-n",
          "relation": "uses"
        },
        {
          "pattern": "test-time-compute-scaling",
          "relation": "specialises"
        },
        {
          "pattern": "lats",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Let's Verify Step by Step",
          "authors": "Lightman, Kosaraju, Burda, Edwards, Baker, Lee, Leike, Schulman, Sutskever, Cobbe",
          "year": 2023,
          "url": "https://arxiv.org/abs/2305.20050"
        }
      ],
      "status_in_practice": "emerging",
      "tags": [
        "verification",
        "reward",
        "step-level"
      ]
    },
    {
      "id": "reflection",
      "name": "Reflection",
      "aliases": [
        "Self-Critique",
        "Single-Pass Self-Review"
      ],
      "category": "verification-reflection",
      "intent": "Have the model review its own output and produce a revised version in one or more passes.",
      "context": "First-pass model outputs contain mistakes a second look would catch; the cost of an extra pass is acceptable.",
      "problem": "One-shot generation underuses the model; a second pass focused on critique often fixes errors at modest cost.",
      "forces": [
        "Same-model self-critique misses correlated blind spots.",
        "Free-form review drifts; the model invents new criteria each time.",
        "Termination: when does the loop stop?"
      ],
      "solution": "After producing an output, the model is prompted (often as a critic persona) to find issues. The original output and critique go back into a revision step. Repeat until a stop condition (no new issues, max iterations).",
      "consequences": {
        "benefits": [
          "Catches surface errors cheaply.",
          "Pairs naturally with structured outputs."
        ],
        "liabilities": [
          "Diminishing returns after one or two passes.",
          "Self-reinforced confidence on wrong answers (Reflexion replication studies)."
        ]
      },
      "constrains": "The reviewer may only critique against criteria fixed by the surrounding system; free-form criteria invention is forbidden when the pattern is used at a correctness boundary.",
      "known_uses": [
        {
          "system": "Knitting-DSL Pipeline (Stash2Go)",
          "note": "scopedLlmReviewer.js runs a frozen 6-item rubric.",
          "status": "available"
        },
        {
          "system": "Self-Refine paper",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "frozen-rubric-reflection",
          "relation": "generalises"
        },
        {
          "pattern": "evaluator-optimizer",
          "relation": "specialises"
        },
        {
          "pattern": "reflexion",
          "relation": "generalises"
        },
        {
          "pattern": "agentic-rag",
          "relation": "used-by"
        },
        {
          "pattern": "chain-of-verification",
          "relation": "generalises"
        },
        {
          "pattern": "self-refine",
          "relation": "generalises"
        },
        {
          "pattern": "same-model-self-critique",
          "relation": "alternative-to"
        },
        {
          "pattern": "critic",
          "relation": "generalises"
        },
        {
          "pattern": "self-rag",
          "relation": "used-by"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Self-Refine: Iterative Refinement with Self-Feedback",
          "authors": "Madaan et al.",
          "year": 2023,
          "url": "https://arxiv.org/abs/2303.17651"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "reflection",
        "self-critique"
      ],
      "variants": [
        {
          "name": "Single-model in-prompt critique",
          "summary": "The same model writes the draft and then critiques it in a follow-up turn. Often a starting point but tends to recycle the model's blind spots.",
          "distinguishing_factor": "one model in two roles",
          "when_to_use": "Quick wins on obvious format errors; not a substitute for an independent critic.",
          "see_also": "same-model-self-critique"
        },
        {
          "name": "Separate critic model",
          "summary": "A different model (often smaller or differently fine-tuned) reviews the author's draft. Reduces correlated blind spots.",
          "distinguishing_factor": "two distinct models",
          "when_to_use": "Quality matters more than cost; a critic specialised on the domain is available."
        },
        {
          "name": "Tool-grounded critique",
          "summary": "The reviewer uses external tools (search, calculator, code execution) to ground its critique in evidence rather than only re-reading the draft.",
          "distinguishing_factor": "grounded by tools",
          "when_to_use": "Critique can be objectively checked against ground truth (factuality, code correctness, math).",
          "see_also": "critic"
        }
      ]
    },
    {
      "id": "reflexion",
      "name": "Reflexion",
      "aliases": [
        "Cross-Episode Lesson Writing",
        "Verbal Reinforcement Learning"
      ],
      "category": "verification-reflection",
      "intent": "Have the agent write linguistic lessons from past failures and consult them in future episodes.",
      "context": "The same agent attempts similar tasks repeatedly; without memory across attempts, mistakes recur.",
      "problem": "Stateless agents repeat the same errors; full RL fine-tuning is too expensive for most settings.",
      "forces": [
        "Lesson quality is bounded by the model's self-critique ability.",
        "Lesson retrieval (which lesson applies?) is a search problem.",
        "Lesson rot: outdated lessons may misguide once the world changes."
      ],
      "solution": "After each episode, the agent reflects on success/failure and writes a verbal lesson. Lessons are stored in long-term memory keyed by task type. Future episodes retrieve relevant lessons and prepend them to context.",
      "consequences": {
        "benefits": [
          "Improvement without fine-tuning weights.",
          "Lessons are human-readable and editable."
        ],
        "liabilities": [
          "Single-agent reflexion repeats blind spots because the same model writes and reads the lessons.",
          "Lesson stores grow; without curation they become noise."
        ]
      },
      "constrains": "Lessons are appended, not overwritten; old lessons are explicitly retired rather than silently deleted.",
      "known_uses": [
        {
          "system": "Sparrot",
          "note": "Reflection cycle: chunks -> insights -> rule proposals.",
          "status": "available"
        },
        {
          "system": "Bobbin (Stash2Go)",
          "note": "Per-user lesson schema not yet built.",
          "status": "pure-future"
        }
      ],
      "related": [
        {
          "pattern": "episodic-summaries",
          "relation": "complements"
        },
        {
          "pattern": "reflection",
          "relation": "specialises"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
          "authors": "Shinn, Cassano, Berman, Gopinath, Narasimhan, Yao",
          "year": 2023,
          "url": "https://arxiv.org/abs/2303.11366"
        }
      ],
      "status_in_practice": "experimental",
      "tags": [
        "memory",
        "reflection",
        "learning"
      ]
    },
    {
      "id": "self-consistency",
      "name": "Self-Consistency",
      "aliases": [
        "Sample-and-Vote",
        "Empirical Introspection",
        "Marginalised Reasoning"
      ],
      "category": "verification-reflection",
      "intent": "Sample the same question multiple times at non-zero temperature and aggregate by majority or judge to mitigate hallucination.",
      "context": "Reasoning-heavy questions where the model is mostly right but sometimes invents a different chain.",
      "problem": "A single sample at zero temperature gives the most likely chain; sampling and voting often outperforms it because the right answer is the one most chains converge on.",
      "forces": [
        "N samples cost N times more.",
        "Aggregation logic depends on whether the answer is a class, a number, or free text.",
        "Variance is itself signal: a high-variance question is one the model is uncertain on."
      ],
      "solution": "Run the same prompt N times with non-zero temperature. Extract the answer from each. Aggregate: majority vote for discrete answers, median for numeric, judge for free-form. Variance across samples is logged as a confidence signal.",
      "consequences": {
        "benefits": [
          "Higher accuracy on reasoning benchmarks at moderate cost.",
          "Variance is a free uncertainty estimate."
        ],
        "liabilities": [
          "Linear cost scaling.",
          "Free-form aggregation needs a judge model."
        ]
      },
      "constrains": "The final answer is the aggregate, not any single sample; individual samples have no authority.",
      "known_uses": [
        {
          "system": "Sparrot",
          "note": "introspect(question, samples, temperature) tool.",
          "status": "available"
        },
        {
          "system": "Chain-of-Thought + Self-Consistency benchmarks",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "parallelization",
          "relation": "specialises"
        },
        {
          "pattern": "best-of-n",
          "relation": "alternative-to"
        },
        {
          "pattern": "debate",
          "relation": "complements"
        },
        {
          "pattern": "confidence-reporting",
          "relation": "used-by"
        },
        {
          "pattern": "test-time-compute-scaling",
          "relation": "specialises"
        },
        {
          "pattern": "lats",
          "relation": "complements"
        },
        {
          "pattern": "map-reduce",
          "relation": "alternative-to"
        },
        {
          "pattern": "chain-of-thought",
          "relation": "complements"
        },
        {
          "pattern": "chain-of-verification",
          "relation": "complements"
        },
        {
          "pattern": "star-bootstrapping",
          "relation": "complements"
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models",
          "authors": "Wang, Wei, Schuurmans, Le, Chi, Narang, Chowdhery, Zhou",
          "year": 2022,
          "url": "https://arxiv.org/abs/2203.11171"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "sampling",
        "voting",
        "uncertainty"
      ]
    },
    {
      "id": "self-refine",
      "name": "Self-Refine",
      "aliases": [
        "Iterative Self-Feedback"
      ],
      "category": "verification-reflection",
      "intent": "Iterate generate → feedback (same model) → refine until a stop criterion fires, with no separate critic model.",
      "context": "Generation tasks where the same model can produce useful self-feedback against an explicit improvement target.",
      "problem": "One-shot generation under-uses the model; calling for explicit critique-and-revise from the same model raises quality at modest cost.",
      "forces": [
        "Same-model critique inherits the model's blind spots.",
        "Termination criterion is its own design.",
        "Cost grows linearly with iterations."
      ],
      "solution": "Three roles, one model. (1) Generate: produce initial output. (2) Feedback: same model returns concrete improvement points against a fixed target. (3) Refine: same model rewrites using the feedback. Repeat until the model says 'no more issues' or max iterations.",
      "consequences": {
        "benefits": [
          "Quality improvement on tasks with measurable targets.",
          "Same-model loop is simple to deploy."
        ],
        "liabilities": [
          "Reinforces same-model blind spots (Reflexion replication studies).",
          "Diminishing returns after 2-3 iterations."
        ]
      },
      "constrains": "Feedback must conform to the chosen target; revisions must address the most recent feedback.",
      "known_uses": [
        {
          "system": "Self-Refine paper benchmarks (math, code, dialog)",
          "status": "available"
        }
      ],
      "related": [
        {
          "pattern": "reflection",
          "relation": "specialises"
        },
        {
          "pattern": "evaluator-optimizer",
          "relation": "alternative-to"
        },
        {
          "pattern": "same-model-self-critique",
          "relation": "conflicts-with",
          "note": "Self-Refine is the well-engineered version of the failure mode same-model-self-critique describes."
        }
      ],
      "references": [
        {
          "type": "paper",
          "title": "Self-Refine: Iterative Refinement with Self-Feedback",
          "authors": "Madaan, Tandon, Gupta, Hallinan, Gao, Wiegreffe, Alon, Dziri, Prabhumoye, Yang, Welleck, Majumder, Gupta, Yazdanbakhsh, Clark",
          "year": 2023,
          "url": "https://arxiv.org/abs/2303.17651"
        }
      ],
      "status_in_practice": "mature",
      "tags": [
        "reflection",
        "iterative",
        "self-feedback"
      ]
    }
  ]
}
