pdf-harvester

Extract and ingest PDF documents into RAG with proper text extraction, table handling, and metadata.

Safety Notice

This listing is imported from the skills.sh public index metadata. Review the upstream SKILL.md and repository scripts before running.

Copy this and send it to your AI assistant to learn the skill:

Install skill "pdf-harvester" with this command: npx skills add mindmorass/reflex/mindmorass-reflex-pdf-harvester

PDF Harvester Skill

Extract and ingest PDF documents into RAG with proper text extraction, table handling, and metadata.

Overview

PDFs are common for research papers, reports, manuals, and ebooks. This skill covers:

  • Text extraction with layout preservation

  • Table extraction and conversion to markdown

  • Academic paper patterns (abstract, sections, citations)

  • OCR for scanned documents

  • Multi-page chunking strategies

Prerequisites

Core extraction

pip install pdfplumber pymupdf

For OCR (scanned documents)

pip install pytesseract pdf2image

Also needed: brew install tesseract poppler (macOS)

For academic papers

pip install arxiv # If fetching from arXiv
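The arxiv package is listed only as an optional dependency and is not used in the code below. A minimal sketch of fetching a paper PDF with it before harvesting, assuming the arxiv Python client; the paper ID is the same illustrative one used in the URL example later:

import arxiv

# Hypothetical helper: download one arXiv paper locally before harvesting it.
client = arxiv.Client()
search = arxiv.Search(id_list=["2301.00001"])  # illustrative ID
paper = next(client.results(search))
local_path = paper.download_pdf(filename="paper.pdf")
print(local_path)  # pass this path to harvest_pdf()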

Extraction Methods

Method 1: pdfplumber (Recommended)

Best for structured PDFs with tables.

#!/usr/bin/env python3
"""PDF extraction using pdfplumber."""

import re
from pathlib import Path
from typing import Dict, List, Optional

import pdfplumber


def extract_pdf_text(
    pdf_path: str,
    extract_tables: bool = True
) -> Dict:
    """
    Extract text and tables from PDF.

    Args:
        pdf_path: Path to PDF file
        extract_tables: Whether to extract tables separately

    Returns:
        Dict with pages, tables, and metadata
    """
    result = {
        "pages": [],
        "tables": [],
        "metadata": {},
        "total_pages": 0
    }

    with pdfplumber.open(pdf_path) as pdf:
        result["total_pages"] = len(pdf.pages)
        result["metadata"] = pdf.metadata or {}

        for page_num, page in enumerate(pdf.pages, 1):
            # Extract text
            text = page.extract_text() or ""

            result["pages"].append({
                "page_number": page_num,
                "text": text,
                "width": page.width,
                "height": page.height
            })

            # Extract tables
            if extract_tables:
                tables = page.extract_tables()
                for table_num, table in enumerate(tables, 1):
                    if table and len(table) > 0:
                        result["tables"].append({
                            "page_number": page_num,
                            "table_number": table_num,
                            "data": table,
                            "markdown": table_to_markdown(table)
                        })

    return result


def table_to_markdown(table: List[List]) -> str:
    """Convert table data to markdown format."""
    if not table or len(table) == 0:
        return ""

    # Clean cells: collapse embedded newlines and strip whitespace
    def clean_cell(cell):
        if cell is None:
            return ""
        return str(cell).replace("\n", " ").strip()

    # Header row
    headers = [clean_cell(c) for c in table[0]]
    md = "| " + " | ".join(headers) + " |\n"
    md += "| " + " | ".join(["---"] * len(headers)) + " |\n"

    # Data rows
    for row in table[1:]:
        cells = [clean_cell(c) for c in row]
        # Pad if necessary
        while len(cells) < len(headers):
            cells.append("")
        md += "| " + " | ".join(cells[:len(headers)]) + " |\n"

    return md
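A quick usage sketch of the pdfplumber extractor (the path is illustrative):

extracted = extract_pdf_text("/path/to/report.pdf")
print(extracted["total_pages"], "pages,", len(extracted["tables"]), "tables")

# Preview the first extracted table as markdown
if extracted["tables"]:
    print(extracted["tables"][0]["markdown"])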

Method 2: PyMuPDF (fitz)

Faster, better for large PDFs.

#!/usr/bin/env python3
"""PDF extraction using PyMuPDF."""

from typing import Dict, List

import fitz  # PyMuPDF


def extract_with_pymupdf(pdf_path: str) -> Dict:
    """
    Extract text using PyMuPDF.

    Faster than pdfplumber, good for large documents.
    """
    doc = fitz.open(pdf_path)

    result = {
        "pages": [],
        "metadata": doc.metadata,
        "total_pages": len(doc)
    }

    for page_num, page in enumerate(doc, 1):
        # Get text with layout preservation
        text = page.get_text("text")

        # Get text blocks for better structure
        blocks = page.get_text("dict")["blocks"]

        result["pages"].append({
            "page_number": page_num,
            "text": text,
            "blocks": len(blocks)
        })

    doc.close()
    return result


def extract_with_structure(pdf_path: str) -> Dict:
    """Extract with heading detection."""
    doc = fitz.open(pdf_path)

    pages = []
    for page_num, page in enumerate(doc, 1):
        blocks = page.get_text("dict")["blocks"]

        structured_content = []
        for block in blocks:
            if block["type"] == 0:  # Text block
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        text = span["text"].strip()
                        font_size = span["size"]
                        is_bold = "bold" in span["font"].lower()

                        # Detect headings by font size or weight
                        if font_size > 14 or is_bold:
                            structured_content.append({
                                "type": "heading",
                                "text": text,
                                "size": font_size
                            })
                        else:
                            structured_content.append({
                                "type": "paragraph",
                                "text": text
                            })

        pages.append({
            "page_number": page_num,
            "content": structured_content
        })

    doc.close()
    return {"pages": pages, "total_pages": len(pages)}

Method 3: OCR for Scanned PDFs

#!/usr/bin/env python3
"""OCR extraction for scanned PDFs."""

from typing import Dict, List

import pytesseract
from pdf2image import convert_from_path


def extract_with_ocr(
    pdf_path: str,
    language: str = "eng",
    dpi: int = 300
) -> Dict:
    """
    Extract text from scanned PDF using OCR.

    Args:
        pdf_path: Path to PDF
        language: Tesseract language code
        dpi: Resolution for conversion
    """
    # Convert PDF pages to images
    images = convert_from_path(pdf_path, dpi=dpi)

    pages = []
    for page_num, image in enumerate(images, 1):
        # Run OCR
        text = pytesseract.image_to_string(image, lang=language)

        pages.append({
            "page_number": page_num,
            "text": text,
            "ocr": True
        })

    return {
        "pages": pages,
        "total_pages": len(pages),
        "ocr_used": True
    }


def is_scanned_pdf(pdf_path: str) -> bool:
    """Detect if PDF is scanned (image-based)."""
    import fitz

    doc = fitz.open(pdf_path)

    # Check the first few pages (index pages explicitly; fitz documents
    # do not support slice indexing)
    for page_num in range(min(3, len(doc))):
        text = doc[page_num].get_text().strip()
        if len(text) > 100:  # Has extractable text
            doc.close()
            return False

    doc.close()
    return True
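These two helpers combine into the same OCR fallback the pipeline uses later. A minimal sketch (extract_with_pymupdf comes from Method 2; the path is illustrative):

pdf_path = "/path/to/maybe_scanned.pdf"

if is_scanned_pdf(pdf_path):
    extracted = extract_with_ocr(pdf_path)
else:
    extracted = extract_with_pymupdf(pdf_path)

print("OCR used:", extracted.get("ocr_used", False))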

Chunking Strategies

Strategy 1: Page-Based

Simple chunking by page boundaries.

def chunk_by_pages(
    extracted: Dict,
    pages_per_chunk: int = 1
) -> List[Dict]:
    """Chunk PDF by page boundaries."""
    chunks = []
    pages = extracted["pages"]

    for i in range(0, len(pages), pages_per_chunk):
        page_group = pages[i:i + pages_per_chunk]

        text = "\n\n".join(p["text"] for p in page_group)

        chunks.append({
            "content": text,
            "page_start": page_group[0]["page_number"],
            "page_end": page_group[-1]["page_number"],
            "chunk_index": len(chunks)
        })

    return chunks

Strategy 2: Section-Based

Chunk by document sections/headings.

def chunk_by_sections(
    extracted: Dict,
    heading_patterns: List[str] = None
) -> List[Dict]:
    """Chunk PDF by section headings."""
    if heading_patterns is None:
        heading_patterns = [
            r'^#+\s',                # Markdown headings
            r'^\d+\.\s+[A-Z]',       # Numbered sections
            r'^[A-Z][A-Z\s]+$',      # ALL CAPS headings
            r'^(Abstract|Introduction|Conclusion|References)',
        ]

    full_text = "\n\n".join(p["text"] for p in extracted["pages"])

    # Find section boundaries
    sections = []
    current_section = {"title": "Introduction", "content": "", "start_pos": 0}

    lines = full_text.split("\n")

    for line in lines:
        is_heading = any(
            re.match(pattern, line.strip())
            for pattern in heading_patterns
        )

        if is_heading and current_section["content"].strip():
            sections.append(current_section)
            current_section = {
                "title": line.strip(),
                "content": "",
                "start_pos": len(sections)
            }
        else:
            current_section["content"] += line + "\n"

    # Don't forget the last section
    if current_section["content"].strip():
        sections.append(current_section)

    return [
        {
            "content": s["content"].strip(),
            "section": s["title"],
            "chunk_index": i
        }
        for i, s in enumerate(sections)
    ]

Strategy 3: Semantic Paragraphs

Chunk by paragraph with size limits.

def chunk_by_paragraphs(
    extracted: Dict,
    max_chunk_size: int = 500,  # words
    overlap: int = 50
) -> List[Dict]:
    """Chunk by paragraphs with overlap.

    Note: overlap is approximated by carrying the previous chunk's last
    paragraph forward rather than counting exactly `overlap` words.
    """
    full_text = "\n\n".join(p["text"] for p in extracted["pages"])

    # Split into paragraphs
    paragraphs = [p.strip() for p in full_text.split("\n\n") if p.strip()]

    chunks = []
    current_chunk = []
    current_size = 0

    for para in paragraphs:
        para_size = len(para.split())

        if current_size + para_size > max_chunk_size and current_chunk:
            # Save current chunk
            chunks.append({
                "content": "\n\n".join(current_chunk),
                "chunk_index": len(chunks),
                "word_count": current_size
            })

            # Start new chunk with overlap
            overlap_text = current_chunk[-1] if current_chunk else ""
            current_chunk = [overlap_text] if overlap_text else []
            current_size = len(overlap_text.split()) if overlap_text else 0

        current_chunk.append(para)
        current_size += para_size

    # Last chunk
    if current_chunk:
        chunks.append({
            "content": "\n\n".join(current_chunk),
            "chunk_index": len(chunks),
            "word_count": current_size
        })

    return chunks

Academic Paper Pattern

Special handling for research papers.

def extract_academic_paper(pdf_path: str) -> Dict:
    """
    Extract academic paper with structure detection.

    Identifies: title, authors, abstract, sections, references
    """
    extracted = extract_pdf_text(pdf_path)
    full_text = "\n\n".join(p["text"] for p in extracted["pages"])

    paper = {
        "title": "",
        "authors": [],
        "abstract": "",
        "sections": [],
        "references": [],
        "tables": extracted["tables"]
    }

    # Title is usually the first reasonably long line
    lines = full_text.split("\n")
    for line in lines[:10]:
        if len(line) > 20 and len(line) < 200:
            paper["title"] = line.strip()
            break

    # Abstract
    abstract_match = re.search(
        r'Abstract[:\s]*\n?(.*?)(?=\n(?:1\.?\s+)?Introduction|\n\n[A-Z])',
        full_text,
        re.DOTALL | re.IGNORECASE
    )
    if abstract_match:
        paper["abstract"] = abstract_match.group(1).strip()

    # Sections (numbered headings on their own line)
    section_pattern = r'\n(\d+\.?\s+[A-Z][^\n]+)\n'
    section_matches = re.finditer(section_pattern, full_text)

    section_positions = [(m.group(1), m.start()) for m in section_matches]

    for i, (title, start) in enumerate(section_positions):
        end = section_positions[i + 1][1] if i + 1 < len(section_positions) else len(full_text)
        content = full_text[start:end]

        paper["sections"].append({
            "title": title.strip(),
            "content": content.strip()
        })

    # References section
    ref_match = re.search(
        r'(?:References|Bibliography)\s*\n(.*?)$',
        full_text,
        re.DOTALL | re.IGNORECASE
    )
    if ref_match:
        paper["references_text"] = ref_match.group(1).strip()

    return paper

Full Harvesting Pipeline

#!/usr/bin/env python3
"""Complete PDF harvesting pipeline."""

import hashlib
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional


async def harvest_pdf(
    pdf_path: str,
    collection: str,
    chunk_strategy: str = "paragraphs",  # pages, sections, paragraphs
    is_academic: bool = False,
    use_ocr: bool = False
) -> Dict:
    """
    Harvest a PDF document into RAG.

    Args:
        pdf_path: Path to PDF file
        collection: Target RAG collection
        chunk_strategy: How to chunk the document
        is_academic: Use academic paper extraction
        use_ocr: Force OCR extraction
    """
    path = Path(pdf_path)

    # Check if OCR needed
    if use_ocr or is_scanned_pdf(pdf_path):
        extracted = extract_with_ocr(pdf_path)
    else:
        extracted = extract_pdf_text(pdf_path)

    # Get document metadata
    doc_metadata = {
        "source_type": "pdf",
        "source_path": str(path.absolute()),
        "filename": path.name,
        "total_pages": extracted["total_pages"],
        "harvested_at": datetime.now().isoformat(),
        "pdf_metadata": extracted.get("metadata", {})
    }

    # Academic paper special handling
    if is_academic:
        paper = extract_academic_paper(pdf_path)
        doc_metadata["title"] = paper["title"]
        doc_metadata["abstract"] = paper["abstract"]
        doc_metadata["is_academic"] = True

    # Chunk based on strategy
    if chunk_strategy == "pages":
        chunks = chunk_by_pages(extracted)
    elif chunk_strategy == "sections":
        chunks = chunk_by_sections(extracted)
    else:
        chunks = chunk_by_paragraphs(extracted)

    # Generate document ID from content hash
    content_hash = hashlib.md5(
        "".join(p["text"] for p in extracted["pages"]).encode()
    ).hexdigest()[:12]
    doc_id = f"pdf_{content_hash}"

    # Ingest chunks
    ingested = 0
    for chunk in chunks:
        chunk_metadata = {
            **doc_metadata,
            "chunk_index": chunk["chunk_index"],
            "total_chunks": len(chunks),
        }

        # Add page info if available
        if "page_start" in chunk:
            chunk_metadata["page_start"] = chunk["page_start"]
            chunk_metadata["page_end"] = chunk["page_end"]

        # Add section info if available
        if "section" in chunk:
            chunk_metadata["section"] = chunk["section"]

        await ingest(
            content=chunk["content"],
            collection=collection,
            metadata=chunk_metadata,
            doc_id=f"{doc_id}_chunk_{chunk['chunk_index']}"
        )
        ingested += 1

    # Ingest tables separately
    for table in extracted.get("tables", []):
        table_metadata = {
            **doc_metadata,
            "content_type": "table",
            "page_number": table["page_number"],
            "table_number": table["table_number"]
        }

        await ingest(
            content=table["markdown"],
            collection=collection,
            metadata=table_metadata,
            doc_id=f"{doc_id}_table_{table['page_number']}_{table['table_number']}"
        )

    return {
        "status": "success",
        "filename": path.name,
        "pages": extracted["total_pages"],
        "chunks": ingested,
        "tables": len(extracted.get("tables", [])),
        "collection": collection,
        "doc_id": doc_id
    }

async def harvest_pdf_url(
    url: str,
    collection: str,
    **kwargs
) -> Dict:
    """Download and harvest a PDF from URL."""
    import tempfile

    import httpx

    # Download PDF
    async with httpx.AsyncClient() as client:
        response = await client.get(url, follow_redirects=True)
        response.raise_for_status()

    # Save to temp file
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
        f.write(response.content)
        temp_path = f.name

    try:
        result = await harvest_pdf(temp_path, collection, **kwargs)
        result["source_url"] = url
        return result
    finally:
        Path(temp_path).unlink()  # Clean up
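The pipeline calls an async ingest() function that is assumed to be provided by the host RAG environment; it is not defined in this skill. A hypothetical stub showing the expected signature (the body is a placeholder, not a real backend):

async def ingest(
    content: str,
    collection: str,
    metadata: Dict,
    doc_id: str
) -> None:
    """Placeholder: forward one chunk to your vector store / RAG backend."""
    # Replace with your embedding + upsert call for the target collection.
    print(f"[{collection}] {doc_id}: {len(content)} chars, {len(metadata)} metadata keys")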

Metadata Schema

PDF chunk metadata

source_type: pdf
source_path: /path/to/document.pdf
source_url: https://...   # if downloaded
filename: document.pdf
total_pages: 45
page_start: 5
page_end: 7
section: "3. Methodology"
chunk_index: 12
total_chunks: 28
harvested_at: "2024-01-01T12:00:00Z"
is_academic: true
title: "Paper Title"
abstract: "Paper abstract..."
content_type: text | table

Usage Examples

Local PDF

result = await harvest_pdf(
    pdf_path="/path/to/document.pdf",
    collection="research_papers",
    chunk_strategy="sections",
    is_academic=True
)

PDF from URL

result = await harvest_pdf_url(
    url="https://arxiv.org/pdf/2301.00001.pdf",
    collection="ml_papers",
    is_academic=True
)

Scanned document

result = await harvest_pdf(
    pdf_path="/path/to/scanned.pdf",
    collection="legacy_docs",
    use_ocr=True
)

Refinement Notes

Track improvements as you use this skill.

  • Text extraction tested

  • Table extraction working

  • OCR fallback tested

  • Academic paper pattern validated

  • Chunking strategies compared

  • Large PDF handling optimized

Source Transparency

This detail page is rendered from real SKILL.md content. Trust labels are metadata-based hints, not a safety guarantee.

Related Skills

Related by shared tags or category signals.

  • ffmpeg-patterns (General). No summary provided by upstream source. Repository Source · Needs Review.

  • site-crawler (General). No summary provided by upstream source. Repository Source · Needs Review.

  • ai-video-generation (General). No summary provided by upstream source. Repository Source · Needs Review.