Document Inventory Skill

Overview

This skill scans document collections (PDFs, Word documents, text files) and creates a structured inventory with metadata, automatic categorization, and collection statistics. It is an essential first step before building knowledge bases.

Quick Start

import sqlite3
from contextlib import closing
from pathlib import Path

# Scan directory
# Collect basic metadata for every PDF file below the target directory.
documents = [
    {
        'filename': filepath.name,
        'size': filepath.stat().st_size,
        'path': str(filepath),
    }
    for filepath in Path("/path/to/docs").rglob("*.pdf")
    if filepath.is_file()  # skip directories that merely end in ".pdf"
]

# Store in database
# closing() guarantees the connection is released; the inner `with conn`
# commits the transaction on success and rolls it back on error.
with closing(sqlite3.connect("inventory.db")) as conn:
    with conn:
        conn.execute(
            "CREATE TABLE IF NOT EXISTS docs (name TEXT, size INTEGER, path TEXT)"
        )
        conn.executemany(
            "INSERT INTO docs VALUES (?, ?, ?)",
            ((doc['filename'], doc['size'], doc['path']) for doc in documents),
        )

print(f"Inventoried {len(documents)} documents")

When to Use

Auditing large document libraries before processing
Understanding the scope of a document collection
Categorizing documents by type, source, or content
Preparing inventories for knowledge base creation
Generating reports on document collections
Identifying duplicates or organizing files

Features

Recursive scanning - Process nested directories
Metadata extraction - Size, dates, page counts
Auto-categorization - Pattern-based classification
Statistics generation - Collection summaries
SQLite storage - Queryable inventory database
Multiple formats - PDF, DOCX, TXT, and more

Implementation

Core Inventory Builder

#!/usr/bin/env python3
"""Document inventory builder."""

import logging
import os
import sqlite3
from datetime import datetime
from pathlib import Path

# Configure root logging once at import so scan progress is visible by default.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module (``__name__``), not a bare
# ``name`` variable, which would raise NameError at import time.
logger = logging.getLogger(__name__)

class DocumentInventory:
    """Build and manage document inventories.

    An inventory is a SQLite database with a single ``documents`` table
    holding one row per scanned file (keyed by its unique ``filepath``).

    The instance owns one SQLite connection (``self.conn``).  Call
    :meth:`close` when finished, or use the instance as a context manager::

        with DocumentInventory("inventory.db") as inventory:
            inventory.scan_directory("/path/to/docs")
    """

    # Maps a lower-cased file suffix to the human-readable type stored in the
    # ``file_type`` column.  Only files with one of these suffixes are scanned.
    SUPPORTED_EXTENSIONS = {
        '.pdf': 'PDF',
        '.docx': 'Word',
        '.doc': 'Word',
        '.txt': 'Text',
        '.md': 'Markdown',
        '.xlsx': 'Excel',
        '.xls': 'Excel',
        '.pptx': 'PowerPoint',
        '.ppt': 'PowerPoint',
    }

    # Industry standard patterns: upper-case substring -> category.
    # Checked in insertion order before _PATH_CATEGORIES; first match wins.
    # NOTE: this is plain substring matching, so short codes can also match
    # inside longer words (e.g. 'ABS' inside 'ABSTRACT').
    _ORG_PATTERNS = {
        'API': 'API',
        'ISO': 'ISO',
        'ASME': 'ASME',
        'DNV': 'DNV',
        'NORSOK': 'NORSOK',
        'BSI': 'BSI',
        'ASTM': 'ASTM',
        'AWS': 'AWS',
        'ABS': 'ABS',
        'AISC': 'AISC',
        'IEEE': 'IEEE',
    }

    # Path-based categorization: upper-case substring -> category.
    # Consulted only when no industry standard pattern matched.
    _PATH_CATEGORIES = {
        'STANDARD': 'Standards',
        'SPEC': 'Specifications',
        'MANUAL': 'Manuals',
        'GUIDE': 'Guides',
        'REPORT': 'Reports',
        'DRAWING': 'Drawings',
        'PROCEDURE': 'Procedures',
    }

    def __init__(self, db_path):
        """Open (or create) the inventory database at *db_path*.

        Args:
            db_path: Path of the SQLite database file; ``":memory:"`` also
                works because it is passed straight to ``sqlite3.connect``.
        """
        self.db_path = db_path
        # timeout=30: wait up to 30 seconds for a lock held by another
        # connection instead of failing immediately with "database is locked".
        self.conn = sqlite3.connect(db_path, timeout=30)
        self._setup_tables()

    def __enter__(self):
        """Return the inventory itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection on leaving the ``with`` block.

        Exceptions raised inside the block are not suppressed.
        """
        self.close()

    def close(self):
        """Close the underlying SQLite connection.

        Uncommitted changes are discarded by SQLite, so commit first if
        rows were added outside :meth:`scan_directory`.
        """
        self.conn.close()

    def _setup_tables(self):
        """Create the ``documents`` table and its indexes if they are missing."""
        cursor = self.conn.cursor()

        cursor.execute('''
            CREATE TABLE IF NOT EXISTS documents (
                id INTEGER PRIMARY KEY,
                filename TEXT NOT NULL,
                filepath TEXT UNIQUE NOT NULL,
                extension TEXT,
                file_type TEXT,
                category TEXT,
                file_size INTEGER,
                created_date TEXT,
                modified_date TEXT,
                parent_dir TEXT,
                depth INTEGER,
                scanned_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')

        cursor.execute('''
            CREATE INDEX IF NOT EXISTS idx_category ON documents(category)
        ''')

        cursor.execute('''
            CREATE INDEX IF NOT EXISTS idx_extension ON documents(extension)
        ''')

        self.conn.commit()

    def scan_directory(self, root_path):
        """Scan *root_path* recursively and add supported files to the inventory.

        Args:
            root_path: Directory to scan; it is resolved to an absolute path.

        Returns:
            The number of documents actually stored.  Files that could not
            be added (see :meth:`_add_document`) are logged and not counted.
        """
        root = Path(root_path).resolve()
        logger.info("Scanning: %s", root)

        count = 0
        for filepath in root.rglob('*'):
            if not filepath.is_file():
                continue
            if filepath.suffix.lower() not in self.SUPPORTED_EXTENSIONS:
                continue
            if not self._add_document(filepath, root):
                continue

            count += 1
            # Commit in batches of 500 so a very large scan does not keep
            # one huge open transaction.
            if count % 500 == 0:
                logger.info("Scanned %d documents...", count)
                self.conn.commit()

        self.conn.commit()
        logger.info("Scan complete: %d documents found", count)
        return count

    def _add_document(self, filepath, root):
        """Insert or refresh the inventory row for one file.

        The row is written with ``INSERT OR REPLACE``, so re-scanning a file
        replaces its previous row.  The caller is responsible for committing.

        Args:
            filepath: ``Path`` of the file to record.
            root: Scan root; ``depth`` is the number of directories between
                *root* and the file (0 for files directly in *root*).

        Returns:
            ``True`` if the row was written, ``False`` if an error occurred
            (the error is logged and the scan can continue).
        """
        try:
            stat = filepath.stat()
            ext = filepath.suffix.lower()

            self.conn.execute('''
                INSERT OR REPLACE INTO documents
                (filename, filepath, extension, file_type, category,
                 file_size, created_date, modified_date, parent_dir, depth)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                filepath.name,
                str(filepath),
                ext,
                self.SUPPORTED_EXTENSIONS.get(ext, 'Unknown'),
                self._categorize(filepath, root),
                stat.st_size,
                # NOTE: st_ctime is platform-dependent (on most Unix systems
                # it is the last metadata change, not the creation time), so
                # created_date is only an approximation there.
                datetime.fromtimestamp(stat.st_ctime).isoformat(),
                datetime.fromtimestamp(stat.st_mtime).isoformat(),
                str(filepath.parent),
                len(filepath.relative_to(root).parts) - 1,
            ))
        except Exception as e:
            # Deliberately best-effort: one unreadable file must not abort
            # the whole scan, so log it and report the failure to the caller.
            logger.warning("Error adding %s: %s", filepath, e)
            return False
        return True

    def _categorize(self, filepath, root=None):
        """Auto-categorize a document from patterns in its path.

        Industry standard patterns are tried first, then the path-based
        categories; matching is a case-insensitive substring test.

        Args:
            filepath: ``Path`` of the document.
            root: Optional scan root.  When given, only the part of the path
                from the root directory's own name downwards is inspected,
                so directories *above* the scan root (for example a home
                directory that happens to contain "iso") cannot influence
                the category.  When omitted, the whole path is inspected.

        Returns:
            The category name, or ``'Unknown'`` when nothing matches.
        """
        scoped = filepath
        if root is not None:
            try:
                # Relative to the root's parent keeps the root directory name
                # itself, so scanning ".../API" still categorizes as API.
                scoped = filepath.relative_to(Path(root).parent)
            except ValueError:
                # filepath is not located under root: inspect the full path.
                scoped = filepath
        # The path always ends with the file name, so a single test on the
        # path string covers the file name as well.
        path_str = str(scoped).upper()

        for patterns in (self._ORG_PATTERNS, self._PATH_CATEGORIES):
            for pattern, category in patterns.items():
                if pattern in path_str:
                    return category

        return 'Unknown'

    def get_statistics(self):
        """Get inventory statistics.

        Returns:
            A dict with the keys:

            * ``total_documents`` - number of rows in the inventory.
            * ``total_size_mb`` - summed file size in MiB, rounded to 2 places.
            * ``by_type`` - ``{file_type: {'count': n, 'size_mb': x}}``.
            * ``by_category`` - ``{category: count}``.
            * ``by_extension`` - ``{extension: count}``.

            The three grouped dicts are ordered by descending count.
        """
        cursor = self.conn.cursor()

        stats = {}

        # Total count
        cursor.execute('SELECT COUNT(*) FROM documents')
        stats['total_documents'] = cursor.fetchone()[0]

        # Total size (SUM over an empty table is NULL, hence the "or 0")
        cursor.execute('SELECT SUM(file_size) FROM documents')
        total_bytes = cursor.fetchone()[0] or 0
        stats['total_size_mb'] = round(total_bytes / (1024 * 1024), 2)

        # By file type
        cursor.execute('''
            SELECT file_type, COUNT(*), SUM(file_size)
            FROM documents
            GROUP BY file_type
            ORDER BY COUNT(*) DESC
        ''')
        stats['by_type'] = {
            file_type: {
                'count': doc_count,
                'size_mb': round((size_bytes or 0) / 1024 / 1024, 2),
            }
            for file_type, doc_count, size_bytes in cursor.fetchall()
        }

        # By category
        cursor.execute('''
            SELECT category, COUNT(*)
            FROM documents
            GROUP BY category
            ORDER BY COUNT(*) DESC
        ''')
        stats['by_category'] = dict(cursor.fetchall())

        # By extension
        cursor.execute('''
            SELECT extension, COUNT(*)
            FROM documents
            GROUP BY extension
            ORDER BY COUNT(*) DESC
        ''')
        stats['by_extension'] = dict(cursor.fetchall())

        return stats

    def search(self, query, category=None, file_type=None, limit=50):
        """Search the inventory by file name, category and/or file type.

        Args:
            query: Text that must occur in the file name.  It is wrapped in
                ``%`` and matched with SQL ``LIKE``, so ``%`` and ``_`` inside
                the query act as wildcards.  A falsy query disables the
                file-name filter.
            category: If given, only rows with exactly this category.
            file_type: If given, only rows with exactly this file type.
            limit: Maximum number of rows to return.

        Returns:
            A list of ``(filename, filepath, category, file_size)`` tuples
            ordered by file name.
        """
        cursor = self.conn.cursor()

        # "WHERE 1=1" lets every optional filter be appended as "AND ...".
        sql = 'SELECT filename, filepath, category, file_size FROM documents WHERE 1=1'
        params = []

        if query:
            sql += ' AND filename LIKE ?'
            params.append(f'%{query}%')

        if category:
            sql += ' AND category = ?'
            params.append(category)

        if file_type:
            sql += ' AND file_type = ?'
            params.append(file_type)

        sql += ' ORDER BY filename LIMIT ?'
        params.append(limit)

        cursor.execute(sql, params)
        return cursor.fetchall()

    def export_csv(self, output_path):
        """Export the whole ``documents`` table to a CSV file.

        The first CSV row holds the column names.

        Args:
            output_path: Destination file; it is overwritten if it exists.
        """
        import csv

        cursor = self.conn.cursor()
        cursor.execute('SELECT * FROM documents')

        columns = [desc[0] for desc in cursor.description]

        # newline='' is required by the csv module to avoid blank lines on
        # platforms that translate line endings.
        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(columns)
            writer.writerows(cursor.fetchall())

        logger.info("Exported to %s", output_path)

CLI Interface

#!/usr/bin/env python3
"""Document Inventory CLI."""

import argparse
import json


def main(argv=None):
    """Run the document inventory command-line interface.

    Sub-commands: ``scan``, ``stats``, ``search`` and ``export``.  When no
    sub-command is given the help text is printed.

    Args:
        argv: Argument list to parse, without the program name.  ``None``
            (the default) makes argparse read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description='Document Inventory Tool')
    subparsers = parser.add_subparsers(dest='command', help='Commands')

    # Scan command
    scan_parser = subparsers.add_parser('scan', help='Scan directory')
    scan_parser.add_argument('path', help='Directory to scan')
    scan_parser.add_argument('--db', default='inventory.db', help='Database path')

    # Stats command
    stats_parser = subparsers.add_parser('stats', help='Show statistics')
    stats_parser.add_argument('--db', default='inventory.db', help='Database path')
    stats_parser.add_argument('--json', action='store_true', help='Output as JSON')

    # Search command
    search_parser = subparsers.add_parser('search', help='Search inventory')
    search_parser.add_argument('query', help='Search query')
    search_parser.add_argument('--db', default='inventory.db', help='Database path')
    search_parser.add_argument('--category', help='Filter by category')
    search_parser.add_argument('--type', help='Filter by file type')

    # Export command
    export_parser = subparsers.add_parser('export', help='Export to CSV')
    export_parser.add_argument('output', help='Output CSV path')
    export_parser.add_argument('--db', default='inventory.db', help='Database path')

    args = parser.parse_args(argv)

    # The sub-command is optional, so "no command" must be handled before a
    # database is opened (opening would create an empty inventory file).
    if args.command is None:
        parser.print_help()
        return

    inventory = DocumentInventory(args.db)
    try:
        if args.command == 'scan':
            count = inventory.scan_directory(args.path)
            print(f"\nScanned {count} documents")

            stats = inventory.get_statistics()
            print(f"Total size: {stats['total_size_mb']} MB")
            print("\nBy category:")
            # Show only the ten largest categories (already sorted by count).
            for cat, cat_count in list(stats['by_category'].items())[:10]:
                print(f"  {cat}: {cat_count}")

        elif args.command == 'stats':
            stats = inventory.get_statistics()

            if args.json:
                print(json.dumps(stats, indent=2))
            else:
                print(f"Total Documents: {stats['total_documents']}")
                print(f"Total Size: {stats['total_size_mb']} MB")
                print("\nBy Type:")
                for t, data in stats['by_type'].items():
                    print(f"  {t}: {data['count']} ({data['size_mb']} MB)")
                print("\nBy Category:")
                for cat, cat_count in list(stats['by_category'].items())[:15]:
                    print(f"  {cat}: {cat_count}")

        elif args.command == 'search':
            results = inventory.search(
                args.query,
                category=args.category,
                file_type=args.type,
            )

            print(f"Found {len(results)} results:\n")
            for filename, filepath, category, size in results:
                size_kb = size / 1024
                print(f"  [{category:10}] {filename} ({size_kb:.1f} KB)")

        elif args.command == 'export':
            inventory.export_csv(args.output)
    finally:
        # Release the SQLite connection even if a command fails.
        inventory.conn.close()


if __name__ == '__main__':
    main()

Report Generator

def generate_report(db_path, output_path):
    """Generate an HTML inventory report.

    Reads the collection statistics from the inventory database and writes
    a self-contained HTML page with the totals, a per-file-type table and a
    per-category table.

    Args:
        db_path: Path of the inventory SQLite database.
        output_path: Destination HTML file; it is overwritten if it exists.
    """
    from html import escape

    inventory = DocumentInventory(db_path)
    try:
        stats = inventory.get_statistics()
    finally:
        # Only the statistics are needed, so release the connection early.
        inventory.conn.close()

    # Table rows are built up front; values taken from the database are
    # HTML-escaped so unexpected characters cannot break the markup.
    type_rows = ''.join(
        f"<tr><td>{escape(str(t))}</td><td>{d['count']}</td><td>{d['size_mb']}</td></tr>"
        for t, d in stats['by_type'].items()
    )
    category_rows = ''.join(
        f"<tr><td>{escape(str(c))}</td><td>{n}</td></tr>"
        for c, n in stats['by_category'].items()
    )
    generated = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # Doubled braces ({{ }}) are literal braces for the CSS inside the f-string.
    report_html = f"""
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Document Inventory Report</title>
    <style>
        body {{ font-family: Arial, sans-serif; margin: 40px; }}
        h1 {{ color: #333; }}
        .stat-box {{ background: #f5f5f5; padding: 20px; margin: 10px 0; border-radius: 8px; }}
        .stat-value {{ font-size: 2em; font-weight: bold; color: #2196F3; }}
        table {{ border-collapse: collapse; width: 100%; margin: 20px 0; }}
        th, td {{ border: 1px solid #ddd; padding: 12px; text-align: left; }}
        th {{ background: #2196F3; color: white; }}
        tr:nth-child(even) {{ background: #f9f9f9; }}
    </style>
</head>
<body>
    <h1>Document Inventory Report</h1>
    <p>Generated: {generated}</p>

    <div class="stat-box">
        <div class="stat-value">{stats['total_documents']:,}</div>
        <div>Total Documents</div>
    </div>

    <div class="stat-box">
        <div class="stat-value">{stats['total_size_mb']:,.1f} MB</div>
        <div>Total Size</div>
    </div>

    <h2>By File Type</h2>
    <table>
        <tr><th>Type</th><th>Count</th><th>Size (MB)</th></tr>
        {type_rows}
    </table>

    <h2>By Category</h2>
    <table>
        <tr><th>Category</th><th>Count</th></tr>
        {category_rows}
    </table>
</body>
</html>
"""

    # Explicit UTF-8 matches the declared charset regardless of the
    # platform's default encoding.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(report_html)

    print(f"Report generated: {output_path}")

Custom Categorization

Extend with Your Patterns

# Add custom patterns for your domain
# Upper-case substring of the file name -> category.  Entries are checked in
# insertion order and the first match wins.
CUSTOM_PATTERNS = {
    'SPEC': 'Specifications',
    'DWG': 'Drawings',
    'REV': 'Revisions',
    'APPROVED': 'Approved',
    'DRAFT': 'Draft',
    'SUPERSEDED': 'Superseded',
}


def categorize_custom(filepath):
    """Categorize a document using ``CUSTOM_PATTERNS``.

    Only the file name (not the directory part) is inspected, and the
    comparison is case-insensitive.

    Args:
        filepath: ``Path`` of the document.

    Returns:
        The category of the first matching pattern, or ``'Uncategorized'``.
    """
    name = filepath.name.upper()
    for pattern, category in CUSTOM_PATTERNS.items():
        if pattern in name:
            return category
    return 'Uncategorized'

Multi-Level Categories

def categorize_hierarchical(filepath):
    """Create a two-level ``"primary/secondary"`` category for a document.

    Both levels are derived from case-insensitive substrings of the file
    name (the directory part is ignored).

    Args:
        filepath: ``Path`` of the document.

    Returns:
        A string such as ``"API Standards/Design"``; the fallbacks are
        ``"General"`` for the primary and ``"Other"`` for the secondary level.
    """
    name = filepath.name.upper()

    # Primary category
    primary = 'General'
    if 'API' in name:
        primary = 'API Standards'
    elif 'ISO' in name:
        primary = 'ISO Standards'

    # Secondary category
    secondary = 'Other'
    if 'DESIGN' in name:
        secondary = 'Design'
    elif 'SAFETY' in name:
        secondary = 'Safety'
    elif 'QUALITY' in name:
        secondary = 'Quality'

    return f"{primary}/{secondary}"

Execution Checklist

Identify target directory for scanning
Create SQLite database for inventory
Run initial scan and review results
Customize categorization patterns if needed
Generate statistics report
Export to CSV for review
Generate HTML report for stakeholders
Plan next steps (knowledge base creation)

Error Handling

Common Errors

Error: PermissionError

Cause: Insufficient permissions to read files
Solution: Run with appropriate permissions or skip protected files

Error: sqlite3.OperationalError (database is locked)

Cause: Concurrent access without timeout
Solution: Use timeout=30 when connecting

Error: UnicodeDecodeError in filenames

Cause: Non-UTF8 characters in file paths
Solution: Use errors='replace' when processing paths

Error: OSError (too many open files)

Cause: Not closing file handles properly
Solution: Use context managers and batch commits

Error: Slow scanning on network drives

Cause: Network latency for each file access
Solution: Copy to local drive or use async scanning

Metrics

Metric: Typical Value

Scan speed (local): ~1000 files/second

Scan speed (network): ~100 files/second

Database size: ~1KB per 10 documents

Memory usage: ~50MB for 100K documents

Report generation: <1 second

Best Practices

Scan before processing - Always inventory first
Use SQLite timeout - timeout=30 for concurrent access
Batch commits - Commit every 500 files
Handle errors gracefully - Log and continue on failures
Export for review - Generate CSV/HTML for stakeholders
Update incrementally - Use INSERT OR REPLACE

Example Usage

Scan directory

python inventory.py scan /path/to/documents --db inventory.db

View statistics

python inventory.py stats --db inventory.db

Search

python inventory.py search "API" --category "API"

Export to CSV

python inventory.py export inventory.csv --db inventory.db

Related Skills

knowledge-base-builder - Build searchable database after inventory
pdf-text-extractor - Extract text from inventoried PDFs
semantic-search-setup - Add AI search capabilities

Dependencies

No external dependencies - uses Python standard library

Optional: pandas for advanced data manipulation

pip install pandas

Version History

1.1.0 (2026-01-02): Added Quick Start, Execution Checklist, Error Handling, Metrics sections; updated frontmatter with version, category, related_skills
1.0.0 (2024-10-15): Initial release with SQLite storage, auto-categorization, CLI interface

document-inventory

Safety Notice

Copy this and send it to your AI assistant to learn

Scan directory

Store in database

Add custom patterns for your domain

Scan directory

View statistics

Search

Export to CSV

No external dependencies - uses Python standard library

Optional: pandas for advanced data manipulation

Source Transparency

Related Skills

echarts

pandoc

mkdocs