Citations & Retrieval Skill

Implement document-based citations and RAG patterns for grounded, verifiable AI responses.

When to Use This Skill

Document Q&A with source attribution
RAG (Retrieval-Augmented Generation) systems
Grounding responses in provided documents
Building trustworthy AI applications
Research and analysis with citations

Core Concepts

Citation Types

| Type | Use Case | Format |
|------|----------|--------|
| `char_location` | Text documents | Character ranges |
| `page_location` | PDFs | Page numbers |
| `content_block_location` | Custom content | Block indexes |

Basic Citations

Enable Citations

import anthropic

client = anthropic.Anthropic()

# Documents travel as content blocks inside the user message; the Messages API
# has no top-level ``documents`` argument, so passing one raises a TypeError.
response = client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "document",
                    "source": {
                        "type": "text",
                        "media_type": "text/plain",
                        "data": "The company was founded in 2020. Revenue reached $10M in 2023.",
                    },
                    "title": "Company Overview",
                    "citations": {"enabled": True},  # Enable citations!
                },
                {
                    "type": "text",
                    "text": "When was the company founded and what was the revenue?",
                },
            ],
        }
    ],
)

# Extract citations from response
for block in response.content:
    if block.type == "text":
        # ``citations`` is None on text blocks that cite nothing.
        for citation in block.citations or []:
            print(f"Cited: {citation.document_title}")
            print(f"Location: chars {citation.start_char_index}-{citation.end_char_index}")

Custom Content Blocks

Fine-grained control over citation granularity

# Each entry under "content" is its own citable block, so the way the text is
# split here decides how fine-grained the returned citations can be.
documents = [
    {
        "type": "document",
        "source": {
            "type": "content",
            "content": [
                {"type": "text", "text": section_text}
                for section_text in (
                    "Section 1: Introduction...",
                    "Section 2: Methods...",
                    "Section 3: Results...",
                )
            ],
        },
        "title": "Research Paper",
        "citations": {"enabled": True},
    }
]

RAG Implementation

Basic RAG Pipeline

from sentence_transformers import SentenceTransformer import numpy as np

1. Embed documents

# Shared sentence-embedding model, created once at module level and reused by
# every function that calls embedder.encode(...).
embedder = SentenceTransformer('all-MiniLM-L6-v2')

def embed_documents(documents):
    """Chunk every document and embed each resulting chunk.

    Returns a ``(chunks, embeddings)`` pair: the flat list of chunks across
    all documents, and a NumPy array built from the encoder outputs in the
    same order, so ``embeddings[i]`` belongs to ``chunks[i]``.
    """
    all_chunks = []
    all_vectors = []
    for document in documents:
        # Split first, then encode that document's pieces in a single call.
        pieces = chunk_document(document, chunk_size=512)
        all_chunks.extend(pieces)
        vectors = embedder.encode(pieces)
        all_vectors.extend(vectors)
    stacked = np.array(all_vectors)
    return all_chunks, stacked

2. Retrieve relevant chunks

def retrieve(query, chunks, embeddings, top_k=5):
    """Return up to ``top_k`` chunks ranked by similarity to ``query``.

    Similarity is the dot product between each row of ``embeddings`` and the
    encoded query; results are ordered best match first.

    Args:
        query: Query text to encode with the module-level ``embedder``.
        chunks: Sequence of chunks, aligned index-for-index with ``embeddings``.
        embeddings: Array with one embedding row per chunk.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of at most ``top_k`` chunks; empty when ``top_k`` is not
        positive or there are no chunks.
    """
    if top_k <= 0 or len(chunks) == 0:
        # ``[-0:]`` slices the *whole* array, so top_k=0 used to return every
        # chunk; an empty corpus used to fail inside np.dot on a shape mismatch.
        return []

    query_embedding = embedder.encode([query])[0]
    similarities = np.dot(embeddings, query_embedding)
    # argsort is ascending: keep the last top_k indices, then reverse them so
    # the highest-scoring chunk comes first.
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    return [chunks[i] for i in top_indices]

3. Generate with retrieved context

def rag_query(query, chunks, embeddings):
    """Answer ``query`` from the most relevant chunks, with citations enabled.

    The chunks returned by ``retrieve`` are each wrapped in a plain-text
    document block titled ``Source N`` and sent ahead of the question in a
    single user message.

    Args:
        query: The user's question.
        chunks: Candidate chunks, aligned with ``embeddings``.
        embeddings: Array with one embedding row per chunk.

    Returns:
        The message object returned by ``client.messages.create``.
    """
    relevant_chunks = retrieve(query, chunks, embeddings)

    document_blocks = [
        {
            "type": "document",
            "source": {"type": "text", "media_type": "text/plain", "data": chunk},
            "title": f"Source {i+1}",
            "citations": {"enabled": True},
        }
        for i, chunk in enumerate(relevant_chunks)
    ]

    # Documents are content blocks of the user message; ``messages.create``
    # accepts no top-level ``documents`` keyword.
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": [*document_blocks, {"type": "text", "text": query}],
            }
        ],
    )
    return response

Contextual Retrieval (49-67% Better)

Add context to each chunk before embedding

def add_chunk_context(chunk, full_document, model="claude-3-5-haiku-20241022"):
    """Prepend model-generated situating context to a chunk before embedding.

    The model is shown the whole document plus the chunk and asked for a short
    description of where the chunk fits; that description is placed in front
    of the chunk so the embedded text carries document-level context.

    Args:
        chunk: The chunk text to contextualize.
        full_document: Full text of the document the chunk was taken from.
        model: Model id used to generate the context. A small, fast model is
            sufficient. NOTE(review): the previous hard-coded id
            ("claude-haiku-4-20250514") does not appear in the published
            model list — confirm this default against the current list.

    Returns:
        ``"<context>\\n\\n<chunk>"`` as a single string.
    """
    # The opening tag was previously the HTML entity "&#x3C;document>", which
    # left the closing </document> tag unmatched in the prompt.
    context_prompt = f"""<document>
{full_document}
</document>

Please provide a short, succinct context for this chunk that will help with retrieval:

<chunk>
{chunk}
</chunk>

Context:"""

    response = client.messages.create(
        model=model,  # Fast, cheap
        max_tokens=100,
        messages=[{"role": "user", "content": context_prompt}],
    )

    context = response.content[0].text
    return f"{context}\n\n{chunk}"

Apply to all chunks

# Contextualize every chunk against the same full document (one add_chunk_context call per chunk).
contextual_chunks = [
    add_chunk_context(piece, full_doc)
    for piece in chunks
]

Citation Formatting

Format as Numbered References

def format_with_citations(response):
    """Format response with numbered inline citations.

    Each text block is followed by a `` [n]`` marker for every citation
    attached to it. A citation is numbered the first time its
    ``(document_title, start_char_index)`` pair is seen and reuses that number
    afterwards. A ``## References`` section listing the document title for
    each number is appended at the end.

    Args:
        response: Object whose ``content`` is an iterable of blocks; text
            blocks expose ``type``, ``text`` and ``citations``.

    Returns:
        The formatted text including the references section.
    """
    parts = []
    citations = []
    citation_map = {}

    for block in response.content:
        if block.type != "text":
            continue

        markers = []
        # ``citations`` is None on text blocks that cite nothing.
        for citation in block.citations or []:
            key = (citation.document_title, citation.start_char_index)
            if key not in citation_map:
                citations.append(citation)
                citation_map[key] = len(citations)
            markers.append(f" [{citation_map[key]}]")

        parts.append(block.text + "".join(markers))

    # Add references section
    parts.append("\n\n## References\n")
    parts.extend(
        f"[{i}] {citation.document_title}\n"
        for i, citation in enumerate(citations, 1)
    )

    return "".join(parts)

Academic Citation Formats

def format_apa(author, year, title, source):
    """Build an APA-style reference: ``Author (Year). Title. Source.``"""
    template = "{} ({}). {}. {}."
    return template.format(author, year, title, source)

def format_mla(author, title, source, year):
    """Build an MLA-style reference: ``Author. "Title." Source, Year.``"""
    template = '{}. "{}." {}, {}.'
    return template.format(author, title, source, year)

def format_chicago(author, title, source, year):
    """Build a Chicago-style reference: ``Author. Title. Source, Year.``"""
    template = "{}. {}. {}, {}."
    return template.format(author, title, source, year)

Multi-Document Q&A

def multi_doc_qa(question, documents):
    """Answer questions across multiple documents with citations.

    Args:
        question: The question to answer.
        documents: Iterable of dicts with a required ``"content"`` key and an
            optional ``"title"`` key; untitled documents are labelled
            ``Document N`` by position.

    Returns:
        The message object returned by ``client.messages.create``.

    Raises:
        KeyError: If a document has no ``"content"`` key.
    """
    doc_inputs = [
        {
            "type": "document",
            "source": {
                "type": "text",
                "media_type": "text/plain",
                "data": doc["content"],
            },
            "title": doc.get("title", f"Document {i+1}"),
            "citations": {"enabled": True},
        }
        for i, doc in enumerate(documents)
    ]

    # Documents are content blocks of the user message; ``messages.create``
    # accepts no top-level ``documents`` keyword.
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=2048,
        messages=[{
            "role": "user",
            "content": [
                *doc_inputs,
                {
                    "type": "text",
                    "text": f"Answer this question based on the provided documents. Cite your sources.\n\nQuestion: {question}",
                },
            ],
        }],
    )

    return response

Prompt Caching for RAG

Cache static documents for repeated queries

# The document block carries the cache marker and sits inside the user
# message; ``messages.create`` accepts no top-level ``documents`` keyword.
response = client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "document",
                    "source": {"type": "text", "media_type": "text/plain", "data": large_document},
                    "title": "Knowledge Base",
                    "citations": {"enabled": True},
                    "cache_control": {"type": "ephemeral"},  # Cache this document!
                },
                {"type": "text", "text": query},
            ],
        }
    ],
)

Error Handling and Validation

Validate Citation Integrity

def validate_citations(response, documents):
    """Ensure all citations reference provided documents.

    Args:
        response: Object whose ``content`` is an iterable of blocks; text
            blocks expose ``type`` and ``citations``.
        documents: Iterable of dicts; each document's ``"title"`` (or None
            when absent) is treated as a valid citation target.

    Returns:
        True when every cited title is among the provided titles.

    Raises:
        ValueError: If any citation names a title that was not provided.
    """
    cited_titles = {
        citation.document_title
        for block in response.content
        if block.type == "text"
        # ``citations`` is None on text blocks that cite nothing.
        for citation in block.citations or []
    }

    provided_titles = {doc.get("title") for doc in documents}

    # Check for invalid citations
    invalid = cited_titles - provided_titles
    if invalid:
        raise ValueError(f"Citations reference unknown documents: {invalid}")

    return True

def extract_citation_spans(response):
    """Extract the cited passage and location for each citation.

    Args:
        response: Object whose ``content`` is an iterable of blocks; text
            blocks expose ``type`` and ``citations``.

    Returns:
        A list of dicts with keys ``"text"`` (the cited passage),
        ``"document"`` (document title), ``"start"`` and ``"end"`` (character
        offsets as reported on the citation), in response order.
    """
    citation_data = []
    for block in response.content:
        if block.type != "text":
            continue
        # ``citations`` is None on text blocks that cite nothing.
        for citation in block.citations or []:
            citation_data.append({
                # The char indexes are offsets into the *source document*, so
                # slicing the response text with them returns unrelated text;
                # the quoted passage is delivered as ``cited_text``.
                "text": citation.cited_text,
                "document": citation.document_title,
                "start": citation.start_char_index,
                "end": citation.end_char_index,
            })

    return citation_data

Best Practices

DO:

Enable citations for all document-based queries
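Use contextual retrieval for better accuracy (Anthropic reports 49% fewer failed retrievals when contextual embeddings are combined with contextual BM25, and 67% fewer once reranking is added)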
Cache static documents with cache_control
Provide clear document titles for attribution
Chunk documents appropriately (512-1024 tokens)
Validate citation integrity before using responses
Format citations consistently (APA, MLA, Chicago)
Test citation extraction in production systems

DON'T:

Rely on citations without enabling them
Use very small chunks (<100 tokens)
Ignore citation verification in production
Skip document preprocessing
Mix citation formats in the same document
Assume all LLM responses are cited by default
Deploy without citation validation tests

Troubleshooting

No Citations Returned

Ensure citations are enabled

# Citations are opt-in per document: without the "citations" entry below the
# response carries no citation data for this document.
documents = [
    {
        "type": "document",
        "source": {
            "type": "text",
            "media_type": "text/plain",
            "data": content,
        },
        "citations": {"enabled": True},  # Must be explicit!
    }
]

Citations Point to Wrong Text

Verify character indexes match actual text

# start_char_index / end_char_index are offsets into the source document that
# was sent, not into the response text, so slicing block.text with them shows
# unrelated text. The quoted passage itself is available as cited_text.
text = block.text  # the generated statement the citation is attached to
cited_text = citation.cited_text
print(f"Cited text: {cited_text}")
print(f"Expected: {expected_text}")

Large Document Performance

Use chunking for large documents

def chunk_with_overlap(text, chunk_size=1024, overlap=256):
    """Split ``text`` into windows of ``chunk_size`` that overlap by ``overlap``.

    Consecutive windows start ``chunk_size - overlap`` positions apart.
    Chunking stops at the first window that reaches the end of ``text``, so no
    trailing chunk is emitted that merely repeats the tail of the previous one.

    Args:
        text: Sliceable sequence to split (sizes are in elements of ``text``,
            i.e. characters for a ``str``).
        chunk_size: Maximum length of each chunk; must be positive.
        overlap: Number of elements shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunks in order; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        # overlap == chunk_size made the step zero (range() error) and a larger
        # overlap made it negative, which silently produced no chunks at all.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    total = len(text)
    chunks = []
    for start in range(0, total, step):
        chunks.append(text[start:start + chunk_size])
        if start + chunk_size >= total:
            # This window already covers the end of the text.
            break
    return chunks

Pass chunks individually for better retrieval

# Called with the defaults from chunk_with_overlap's signature
# (chunk_size=1024, overlap=256); presumably large_text is a str, in which
# case those sizes are counted in characters — confirm against the caller.
large_chunks = chunk_with_overlap(large_text)

Integration Example

#!/usr/bin/env python3 """Complete RAG + Citations example"""

import anthropic from sentence_transformers import SentenceTransformer import numpy as np

def create_rag_system():
    """Initialize RAG system with citations.

    Builds an in-memory index over two sample documents (one chunk per
    document, prefixed with its title) and returns a closure that answers a
    question from the single best-matching chunk.

    Returns:
        A ``query(question)`` function that returns the message object from
        ``client.messages.create``.
    """
    client = anthropic.Anthropic()
    embedder = SentenceTransformer('all-MiniLM-L6-v2')

    # Sample documents
    documents = [
        {
            "title": "Python Guide",
            "content": "Python 3.11 introduced exception groups..."
        },
        {
            "title": "Web Standards",
            "content": "HTTP/2 introduced multiplexing capabilities..."
        }
    ]

    # Embed documents
    chunks = []
    embeddings = []
    for doc in documents:
        # Add document title as context
        chunk = f"[{doc['title']}]\n{doc['content']}"
        chunks.append(chunk)
        embeddings.append(embedder.encode(chunk))

    embeddings = np.array(embeddings)

    # Query function
    def query(question):
        """Answer ``question`` from the highest-scoring chunk, with citations."""
        # Retrieve relevant chunks
        query_emb = embedder.encode(question)
        similarities = np.dot(embeddings, query_emb)
        top_idx = np.argmax(similarities)
        relevant_chunk = chunks[top_idx]

        # Get cited answer. The document is a content block of the user
        # message; ``messages.create`` accepts no top-level ``documents``
        # keyword.
        response = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=1024,
            messages=[{
                "role": "user",
                "content": [
                    {
                        "type": "document",
                        "source": {
                            "type": "text",
                            "media_type": "text/plain",
                            "data": relevant_chunk
                        },
                        "title": documents[top_idx]["title"],
                        "citations": {"enabled": True}
                    },
                    {"type": "text", "text": question}
                ]
            }]
        )

        return response

    return query

# Script entry point. The dunder underscores had been lost ("name" / "main"),
# which would raise NameError instead of guarding the script body.
if __name__ == "__main__":
    query_fn = create_rag_system()
    response = query_fn("What is Python 3.11?")

    # Display with citations
    for block in response.content:
        if block.type == "text":
            print(f"Answer: {block.text}")
            # ``citations`` is None on text blocks that cite nothing.
            for citation in block.citations or []:
                print(f"  - Cited from: {citation.document_title}")

Performance Tips

Batch queries for throughput (10-20 concurrent requests)
Cache frequent documents with prompt caching
Use Haiku for context generation (faster, cheaper)
Chunk strategically (sentence/paragraph boundaries)
Monitor token usage for citation overhead (~5-10%)

Limitations

Citations only from provided documents
Character index citations require exact text matching
PDF support requires structured parsing
Citation extraction costs tokens (~5-10% overhead)
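Batch processing is supported for cited responses (citations work with the Message Batches API), but every request in a batch must include its own documents with citations enabled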

citations-retrieval

Safety Notice

Copy this and send it to your AI assistant to learn