PDF Processor
Instructions
When processing PDF files, follow these steps based on your specific needs:
1. Identify Processing Type
Determine what you need to do with the PDF:
- Extract text content
- Fill form fields
- Extract images or tables
- Merge or split PDFs
- Add annotations or watermarks
- Convert to other formats
2. Text Extraction
Basic Text Extraction
import PyPDF2 import pdfplumber
Method 1: Using PyPDF2
def extract_text_pypdf2(file_path):
    """Extract the text of every page of a PDF using PyPDF2.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order; no separator is
        inserted between pages.
    """
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` keeps the join safe should a page yield None or an empty
        # result instead of text (e.g. an image-only page).
        return "".join(page.extract_text() or "" for page in reader.pages)
Method 2: Using pdfplumber (better for tables)
def extract_text_pdfplumber(file_path):
    """Extract the text of every page of a PDF using pdfplumber.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order; pages for which
        no text is returned contribute an empty string.
    """
    with pdfplumber.open(file_path) as pdf:
        page_texts = [page.extract_text() or "" for page in pdf.pages]
        return "".join(page_texts)
Advanced Text Extraction
- Preserve formatting and layout
- Handle multi-column documents
- Extract text from specific regions
- Process scanned PDFs with OCR
3. Form Processing
Form Field Detection
def detect_form_fields(file_path):
    """List the interactive form fields of a PDF.

    Args:
        file_path: Path of the PDF file to inspect.

    Returns:
        A dict mapping each field name to a dict with the keys 'type',
        'value' and 'required'. Empty when the document has no fields.
    """
    reader = PyPDF2.PdfReader(file_path)
    # Query the field tree once; a falsy result means "no form fields".
    form_fields = reader.get_fields() or {}

    # Bit position 2 (value 2) of a PDF field's /Ff flags marks it Required.
    required_bit = 2

    fields = {}
    for field_name, field in form_fields.items():
        # NOTE(review): assumes `field.flags` exposes the /Ff integer (or
        # None when absent) — confirm against the installed PyPDF2 version.
        flags = getattr(field, 'flags', None) or 0
        fields[field_name] = {
            'type': field.field_type,
            'value': field.value,
            # Prefer an explicit `required` attribute when one exists;
            # otherwise derive it from the field flags.
            'required': getattr(field, 'required', bool(flags & required_bit)),
        }
    return fields
def fill_form_fields(file_path, output_path, field_data):
    """Write a copy of a PDF form with the given field values filled in.

    Args:
        file_path: Path of the source PDF form.
        output_path: Path the filled PDF is written to.
        field_data: Dict mapping field names to the values to set. Names
            that match no field on a page are left untouched.
    """
    reader = PyPDF2.PdfReader(file_path)
    writer = PyPDF2.PdfWriter()

    for page in reader.pages:
        writer.add_page(page)

    if field_data:
        # Field values are set per page through the writer's form-update
        # API rather than by assigning to the reader-side field objects.
        for page in writer.pages:
            # Form widgets live in a page's annotations; pages without any
            # have nothing to update.
            if "/Annots" in page:
                writer.update_page_form_field_values(page, field_data)

    with open(output_path, 'wb') as output_file:
        writer.write(output_file)
Common Form Types
- Application forms
- Invoices and receipts
- Survey forms
- Legal documents
- Medical forms
4. Content Analysis
Structure Analysis
def analyze_pdf_structure(file_path):
    """Summarise the page-level structure of a PDF.

    Args:
        file_path: Path of the PDF file to analyse.

    Returns:
        A dict with the keys 'pages' (page count), 'has_images',
        'has_tables', 'has_forms', 'text_density' (one value per page that
        has text: characters divided by page area) and 'sections' (dicts
        with 'page' and 'title' for lines that look like headers).
        'has_forms' is initialised to False and never updated here.
    """
    with pdfplumber.open(file_path) as pdf:
        summary = {
            'pages': len(pdf.pages),
            'has_images': False,
            'has_tables': False,
            'has_forms': False,
            'text_density': [],
            'sections': []
        }

        for page_number, page in enumerate(pdf.pages, start=1):
            if page.images:
                summary['has_images'] = True

            if page.extract_tables():
                summary['has_tables'] = True

            page_text = page.extract_text()
            if not page_text:
                # No text: nothing to measure and no headers to look for.
                continue

            page_area = page.width * page.height
            summary['text_density'].append(len(page_text) / page_area)

            # Basic heuristic: a short, all-uppercase line is a section header.
            summary['sections'].extend(
                {'page': page_number, 'title': candidate.strip()}
                for candidate in page_text.split('\n')
                if candidate.isupper() and len(candidate) < 50
            )

    return summary
Table Extraction
def extract_tables(file_path):
    """Collect every table pdfplumber finds in a PDF.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A list of dicts, one per table in page order, with the keys 'page'
        (1-based page number), 'data' (the table as returned by
        pdfplumber), 'rows' (its length) and 'columns' (length of its first
        row, or 0 for an empty table).
    """
    collected = []
    with pdfplumber.open(file_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            collected.extend(
                {
                    'page': page_number,
                    'data': grid,
                    'rows': len(grid),
                    'columns': len(grid[0]) if grid else 0
                }
                for grid in page.extract_tables()
            )
    return collected
5. PDF Manipulation
Merge PDFs
from PyPDF2 import PdfMerger
def merge_pdfs(file_paths, output_path):
    """Concatenate several PDFs into one file.

    Args:
        file_paths: Iterable of paths of the PDFs to merge, in output order.
        output_path: Path the merged PDF is written to.
    """
    merger = PdfMerger()
    try:
        for path in file_paths:
            merger.append(path)
        merger.write(output_path)
    finally:
        # Close the merger even when an input cannot be appended or the
        # output cannot be written.
        merger.close()
Split PDF
def split_pdf(file_path, output_dir):
    """Write each page of a PDF to its own single-page file.

    Files are named ``page_<n>.pdf`` (1-based) inside ``output_dir``, which
    is created when it does not exist yet.

    Args:
        file_path: Path of the PDF file to split.
        output_dir: Directory receiving the per-page files; an empty string
            means the current working directory.
    """
    import os  # local import so the snippet works on its own

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    reader = PyPDF2.PdfReader(file_path)
    for page_number, page in enumerate(reader.pages, start=1):
        writer = PyPDF2.PdfWriter()
        writer.add_page(page)
        # os.path.join avoids the stray leading "/" that plain string
        # concatenation produces for an empty output_dir.
        output_path = os.path.join(output_dir, f"page_{page_number}.pdf")
        with open(output_path, 'wb') as output_file:
            writer.write(output_file)
Add Watermark
def add_watermark(input_path, output_path, watermark_text):
    """Copy a PDF page by page to a new file (watermarking placeholder).

    The pages are copied unchanged; `watermark_text` is accepted but not
    used yet because the actual watermark step is still to be written.

    Args:
        input_path: Path of the source PDF.
        output_path: Path the resulting PDF is written to.
        watermark_text: Text intended for the watermark (currently unused).
    """
    source = PyPDF2.PdfReader(input_path)
    result = PyPDF2.PdfWriter()

    for source_page in source.pages:
        result.add_page(source_page)
        # Add watermark logic here
        # This requires additional libraries like reportlab

    with open(output_path, 'wb') as out_stream:
        result.write(out_stream)
6. OCR for Scanned PDFs
Using Tesseract OCR
import pytesseract from PIL import Image import fitz # PyMuPDF
def ocr_pdf(file_path, dpi=72):
    """Run Tesseract OCR over every page of a PDF.

    Each page is rendered to an RGB image and passed to pytesseract.

    Args:
        file_path: Path of the PDF file to OCR.
        dpi: Rendering resolution. The default of 72 corresponds to a zoom
            factor of 1 (no scaling); higher values give the OCR engine
            more pixels per character at the cost of time and memory.

    Returns:
        The recognised text of all pages concatenated in page order.
    """
    doc = fitz.open(file_path)
    try:
        # PDF page units are 1/72 inch, so dpi / 72 is the zoom factor.
        zoom = dpi / 72
        matrix = fitz.Matrix(zoom, zoom)

        page_texts = []
        for page in doc:
            # alpha=False guarantees the 3-byte-per-pixel layout that the
            # "RGB" mode below expects.
            pix = page.get_pixmap(matrix=matrix, alpha=False)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            page_texts.append(pytesseract.image_to_string(img))
        return "".join(page_texts)
    finally:
        # Release the document even when rendering or OCR fails.
        doc.close()
7. Error Handling
Common Issues
- Password-protected PDFs
- Corrupted files
- Unsupported formats
- Memory issues with large files
- Encoding problems
Error Handling Pattern
import logging
def process_pdf_safely(file_path, processing_func):
    """Run a PDF processing function with basic checks and error logging.

    Args:
        file_path: Path of the PDF file to process.
        processing_func: Callable invoked as ``processing_func(file_path)``.

    Returns:
        Whatever ``processing_func`` returns.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        Exception: Any error raised by ``processing_func`` is logged and
            re-raised unchanged.
    """
    import os  # local import: this snippet otherwise only imports logging

    try:
        # Check if file exists
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # Large files are only reported, not rejected.
        file_size = os.path.getsize(file_path)
        if file_size > 100 * 1024 * 1024:  # 100MB limit
            logging.warning("Large file detected: %d bytes", file_size)

        return processing_func(file_path)

    except Exception as e:
        # logging.exception records the traceback alongside the message.
        logging.exception("Error processing PDF %s: %s", file_path, e)
        raise
8. Performance Optimization
For Large Files
- Process pages in chunks
- Use generators for memory efficiency
- Implement progress tracking
- Consider parallel processing
Batch Processing
import concurrent.futures import os
def batch_process_pdfs(directory, processing_func, max_workers=4):
    """Apply a processing function to every PDF file in a directory.

    Files are matched by a case-insensitive ``.pdf`` extension and are
    processed concurrently on a thread pool.

    Args:
        directory: Directory to scan (not recursive).
        processing_func: Callable invoked as ``processing_func(file_path)``.
        max_workers: Maximum number of worker threads.

    Returns:
        A dict mapping each PDF file name to the value returned by
        ``processing_func``, or to the string ``"Error: <message>"`` when
        the call raised. Keys are in sorted file-name order.
    """
    # Sort for a deterministic order; match ".PDF" as well as ".pdf"; skip
    # directories that merely carry a PDF-looking name.
    pdf_files = sorted(
        name
        for name in os.listdir(directory)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(directory, name))
    )

    results = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            (name, executor.submit(processing_func, os.path.join(directory, name)))
            for name in pdf_files
        ]

        for pdf_file, future in futures:
            try:
                results[pdf_file] = future.result()
            except Exception as e:
                # One failing file must not abort the rest of the batch.
                results[pdf_file] = f"Error: {str(e)}"

    return results
Usage Examples
Example 1: Extract Text from Invoice
- Load the PDF invoice
- Extract all text content
- Parse for invoice number, date, amount
- Save extracted data to structured format
Example 2: Fill Application Form
- Load the application form PDF
- Detect all form fields
- Fill fields with provided data
- Save filled form as new PDF
Example 3: Extract Tables from Report
- Open multi-page report PDF
- Extract all tables from each page
- Convert tables to CSV or Excel
- Preserve table structure and formatting
Required Libraries
Install necessary Python packages:
pip install PyPDF2 pdfplumber PyMuPDF pytesseract pillow
Tips
- Always check if PDF is password-protected first
- Use different libraries based on your needs (speed vs accuracy)
- For scanned documents, OCR quality depends on image resolution
- Consider the PDF version when working with older files
- Test with sample pages before processing entire documents
- Handle encoding issues for non-English text