{
  "schema_version": "1.0",
  "surface_type": "guide_answer",
  "guide_id": "rag-vs-long-context-cost-tradeoffs",
  "question": "How should I compare RAG against long-context prompts for cost and margin?",
  "canonical_url": "https://www.unitcostai.com/guides/rag-vs-long-context-cost-tradeoffs",
  "related_tool_url": "https://www.unitcostai.com/tools/rag-vs-long-context",
  "formula": "architecture_delta = cost_long_context - cost_rag; request_input_tokens_delta = long_context_input_tokens - (base_prompt_tokens + retrieved_chunks * tokens_per_chunk)",
  "assumptions": [
    "Compare both architectures on the same task quality bar, not raw prompt size alone",
    "Long-context removes retrieval, reranking, vector-query, and embedding-refresh stack terms",
    "Embedding refresh is a fixed monthly term and should not be multiplied by active users"
  ],
  "example": "If RAG sends 2,980 request input tokens and the long-context alternative sends 3,400, compare whether the extra prompt cost is smaller than the retrieval stack it replaces."
}
