collection-migration

Collection Migration Skill

Safely move, rename, merge, and manage RAG collections.

Overview

As projects evolve, you may need to:

Rename collections (project renamed)
Merge collections (consolidating knowledge)
Split collections (grew too large)
Archive collections (project ended)
Clone collections (forking a project)

This skill provides safe procedures for each operation.

Prerequisites

pip install qdrant-client

Safety Principles

Always back up first - Export before any destructive operation
Verify after migration - Run validation checks
Preserve metadata - Don't lose document provenance
Atomic operations - Complete fully or roll back

Operation 1: Export Collection

Use case: Backup or transfer to another environment

#!/usr/bin/env python3 """Export a collection to JSON."""

import json from datetime import datetime from qdrant_client import QdrantClient

def export_collection(
    collection_name: str,
    output_path: str = None,
    qdrant_url: str = "http://localhost:6333",
) -> str:
    """Export every point of a collection to a JSON file.

    The collection is read page by page with ``scroll`` until the server
    reports that no further page exists, so the export is complete
    regardless of how many points the collection holds.

    Args:
        collection_name: Name of collection to export.
        output_path: Output file path
            (default: ``{collection}_{timestamp}.json`` in the working directory).
        qdrant_url: Qdrant server URL.

    Returns:
        Path to exported file.
    """
    client = QdrantClient(url=qdrant_url)

    # Follow the next-page offset returned by scroll(); a single call only
    # returns the first page, which would silently truncate the backup.
    page_size = 1000
    points = []
    offset = None
    while True:
        page, offset = client.scroll(
            collection_name=collection_name,
            limit=page_size,
            offset=offset,
            with_payload=True,
            with_vectors=True,
        )
        points.extend(page)
        if offset is None:
            break

    # One timestamp for both the metadata and the default file name.
    exported_at = datetime.now()

    documents = []
    for p in points:
        payload = p.payload or {}  # points may be stored without a payload
        documents.append(
            {
                "id": str(p.id),
                "content": payload.get("content", ""),
                "metadata": {k: v for k, v in payload.items() if k != "content"},
                "vector": p.vector,
            }
        )

    export_data = {
        "collection_name": collection_name,
        "exported_at": exported_at.isoformat(),
        "document_count": len(documents),
        "documents": documents,
    }

    if output_path is None:
        timestamp = exported_at.strftime("%Y%m%d_%H%M%S")
        output_path = f"{collection_name}_{timestamp}.json"

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(export_data, f, indent=2)

    print(f"✅ Exported {len(documents)} documents to {output_path}")
    return output_path

# Command-line entry point: export_collection.py <collection_name> [output_path]
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print("Usage: python export_collection.py <collection_name> [output_path]")
        sys.exit(1)

    collection = sys.argv[1]
    # The output path is optional; None lets export_collection pick a name.
    output = sys.argv[2] if len(sys.argv) > 2 else None
    export_collection(collection, output)

Operation 2: Import Collection

Use case: Restore from backup or import shared collection

#!/usr/bin/env python3 """Import a collection from JSON export."""

import json from qdrant_client import QdrantClient from qdrant_client.models import Distance, VectorParams, PointStruct

def _restore_point_id(raw_id):
    """Turn an exported ID string back into a stable Qdrant point ID."""
    import uuid

    text = str(raw_id)
    if text.isdecimal():
        return int(text)  # integer IDs were stringified by the export
    try:
        return str(uuid.UUID(text))  # UUID IDs are kept as they were
    except ValueError:
        # Anything else is mapped to a UUID that is identical on every run,
        # unlike the built-in hash(), which is salted per process for strings.
        return str(uuid.uuid5(uuid.NAMESPACE_URL, text))


def import_collection(
    input_path: str,
    new_name: str = None,
    qdrant_url: str = "http://localhost:6333",
    skip_vectors: bool = False,
) -> str:
    """Import a collection from a JSON export file.

    All points are prepared before the collection is created, and the new
    collection is removed again if inserting fails, so a failed import does
    not leave a half-filled collection behind.

    Args:
        input_path: Path to exported JSON file.
        new_name: New collection name (default: use original name).
        qdrant_url: Qdrant server URL.
        skip_vectors: If True, the exported vectors are ignored. Embeddings
            are not regenerated here, so every document is then skipped.

    Returns:
        Name of imported collection.

    Raises:
        ValueError: If the target collection already exists.
    """
    with open(input_path, encoding="utf-8") as f:
        data = json.load(f)

    collection_name = new_name or data["collection_name"]
    client = QdrantClient(url=qdrant_url)

    # Refuse to write into an existing collection.
    existing = [c.name for c in client.get_collections().collections]
    if collection_name in existing:
        raise ValueError(f"Collection '{collection_name}' already exists. Use different name or delete first.")

    documents = data["documents"]

    # Take the vector size from the first document that actually has a vector.
    first_vector = next((d["vector"] for d in documents if d.get("vector")), None)
    vector_size = len(first_vector) if first_vector else 384  # Default for all-MiniLM-L6-v2

    points = []
    skipped = 0
    for doc in documents:
        if skip_vectors or not doc.get("vector"):
            skipped += 1  # Would need to regenerate embeddings
            continue

        payload = dict(doc.get("metadata") or {})
        payload["content"] = doc["content"]
        payload["imported_from"] = data["collection_name"]
        payload["imported_at"] = data["exported_at"]

        points.append(PointStruct(
            id=_restore_point_id(doc["id"]),
            vector=doc["vector"],
            payload=payload,
        ))

    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
    )

    batch_size = 100
    try:
        for start in range(0, len(points), batch_size):
            client.upsert(
                collection_name=collection_name,
                points=points[start:start + batch_size],
            )
    except Exception:
        # The collection was created by this call, so dropping it is safe.
        client.delete_collection(collection_name)
        raise

    print(f"✅ Imported {len(points)} documents into '{collection_name}'")
    if skipped:
        print(f"   Skipped {skipped} documents without usable vectors")
    return collection_name

# Command-line entry point: import_collection.py <input_path> [new_name]
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print("Usage: python import_collection.py <input_path> [new_name]")
        sys.exit(1)

    input_path = sys.argv[1]
    # The new name is optional; None keeps the name stored in the export.
    new_name = sys.argv[2] if len(sys.argv) > 2 else None
    import_collection(input_path, new_name)

Operation 3: Rename Collection

Use case: Project renamed, need to update collection name

def rename_collection(
    old_name: str,
    new_name: str,
    qdrant_url: str = "http://localhost:6333",
):
    """Rename a collection (export + import + verify + delete).

    The old collection is deleted only after the new one holds the same
    number of points. On a mismatch the new collection is removed again and
    the old collection and the export file are left untouched.

    Args:
        old_name: Current collection name.
        new_name: Name the collection should have afterwards.
        qdrant_url: Qdrant server URL.

    Raises:
        ValueError: If both names are equal (or, from the import step, if
            ``new_name`` already exists).
        RuntimeError: If the copy does not contain every point of the original.
    """
    if old_name == new_name:
        raise ValueError(f"Cannot rename '{old_name}' to itself.")

    # Export first (backup)
    export_path = export_collection(old_name, qdrant_url=qdrant_url)

    # Import with new name
    import_collection(export_path, new_name=new_name, qdrant_url=qdrant_url)

    # Verify the copy before anything is destroyed.
    client = QdrantClient(url=qdrant_url)
    old_count = client.count(collection_name=old_name, exact=True).count
    new_count = client.count(collection_name=new_name, exact=True).count
    if new_count != old_count:
        # import_collection refuses existing names, so new_name was created
        # by this call and can be rolled back safely.
        client.delete_collection(new_name)
        raise RuntimeError(
            f"Rename aborted: '{new_name}' has {new_count} points but "
            f"'{old_name}' has {old_count}. Backup kept at {export_path}."
        )

    # Delete old collection
    client.delete_collection(old_name)

    print(f"✅ Renamed '{old_name}' to '{new_name}'")

Operation 4: Merge Collections

Use case: Consolidating multiple projects, combining research

def merge_collections(
    source_collections: list,
    target_collection: str,
    qdrant_url: str = "http://localhost:6333",
    deduplicate: bool = True,
):
    """Merge multiple collections into one.

    Sources are read page by page and written in batches. Point IDs in the
    target are derived deterministically from the source name and the
    original ID, so running the same merge again overwrites the same points
    instead of adding copies.

    Args:
        source_collections: List of collection names to merge.
        target_collection: Name for merged collection (created if missing).
        qdrant_url: Qdrant server URL.
        deduplicate: If True, skip documents whose content was already seen
            in this run. Points already present in the target are not
            considered.

    Raises:
        ValueError: If no sources are given, the target is one of the
            sources, or the collections do not share one vector size.
    """
    # Validate the arguments before touching the server.
    if not source_collections:
        raise ValueError("source_collections must contain at least one collection name")
    if target_collection in source_collections:
        raise ValueError(f"Target '{target_collection}' must not be one of the source collections")

    import hashlib
    import uuid

    from qdrant_client.models import Distance, PointStruct, VectorParams

    client = QdrantClient(url=qdrant_url)

    # Determine vector size from first source and check the others agree,
    # so a mismatch is reported before any point is written.
    vector_size = client.get_collection(source_collections[0]).config.params.vectors.size
    for name in source_collections[1:]:
        size = client.get_collection(name).config.params.vectors.size
        if size != vector_size:
            raise ValueError(
                f"Vector size mismatch: '{name}' uses {size}, "
                f"'{source_collections[0]}' uses {vector_size}"
            )

    # Create or get target collection
    existing = [c.name for c in client.get_collections().collections]
    if target_collection in existing:
        target_size = client.get_collection(target_collection).config.params.vectors.size
        if target_size != vector_size:
            raise ValueError(
                f"Vector size mismatch: target '{target_collection}' uses "
                f"{target_size}, sources use {vector_size}"
            )
    else:
        client.create_collection(
            collection_name=target_collection,
            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
        )

    batch_size = 100
    seen_digests = set()
    total_added = 0
    total_skipped = 0

    for source_name in source_collections:
        print(f"Merging '{source_name}'...")

        offset = None
        while True:
            # scroll() returns one page plus the offset of the next page.
            page, offset = client.scroll(
                collection_name=source_name,
                limit=batch_size,
                offset=offset,
                with_payload=True,
                with_vectors=True,
            )

            batch = []
            for p in page:
                payload = dict(p.payload or {})

                # Deduplication
                if deduplicate:
                    content = payload.get("content", "")
                    text = content if isinstance(content, str) else repr(content)
                    # A SHA-256 digest avoids the false duplicates that a
                    # 64-bit hash() collision would cause.
                    digest = hashlib.sha256(text.encode("utf-8", "surrogatepass")).digest()
                    if digest in seen_digests:
                        total_skipped += 1
                        continue
                    seen_digests.add(digest)

                # Track source in payload
                payload["merged_from"] = source_name

                batch.append(PointStruct(
                    id=str(uuid.uuid5(uuid.NAMESPACE_URL, f"{source_name}/{p.id}")),
                    vector=p.vector,
                    payload=payload,
                ))

            if batch:
                client.upsert(collection_name=target_collection, points=batch)
                total_added += len(batch)

            if offset is None:
                break

    print(f"✅ Merged {total_added} documents into '{target_collection}'")
    if deduplicate:
        print(f"   Skipped {total_skipped} duplicates")

Operation 5: Archive Collection

Use case: Project ended, keep data but mark as inactive

from pathlib import Path

def archive_collection(
    collection_name: str,
    qdrant_url: str = "http://localhost:6333",
):
    """Archive a collection (export + verify + delete with marker file).

    The export is moved into ``archives/`` and its ``document_count`` is
    compared with the live point count. The collection is deleted only when
    both numbers match, so an incomplete export never replaces the data.

    Args:
        collection_name: Name of the collection to archive.
        qdrant_url: Qdrant server URL.

    Raises:
        RuntimeError: If the export does not contain every point; the
            collection is left in place in that case.
    """
    import json
    from datetime import datetime

    # Export
    export_path = export_collection(collection_name, qdrant_url=qdrant_url)

    # Move to archives
    archive_dir = Path("archives")
    archive_dir.mkdir(exist_ok=True)

    archive_path = archive_dir / Path(export_path).name
    Path(export_path).rename(archive_path)

    # Verify the archive before the destructive step.
    client = QdrantClient(url=qdrant_url)
    with open(archive_path, encoding="utf-8") as f:
        exported_count = json.load(f)["document_count"]
    live_count = client.count(collection_name=collection_name, exact=True).count
    if exported_count != live_count:
        raise RuntimeError(
            f"Archive aborted: export holds {exported_count} documents but "
            f"'{collection_name}' has {live_count}. Collection was not deleted."
        )

    # Delete from database
    client.delete_collection(collection_name)

    # Create marker file
    marker_path = archive_dir / f"{collection_name}.archived"
    with open(marker_path, "w", encoding="utf-8") as f:
        f.write(f"Archived: {datetime.now().isoformat()}\n")
        f.write(f"Export: {archive_path}\n")

    print(f"✅ Archived '{collection_name}' to {archive_path}")

def restore_archive(
    collection_name: str,
    qdrant_url: str = "http://localhost:6333",
):
    """Restore an archived collection from its most recent export.

    Only files named ``{collection_name}_{YYYYMMDD}_{HHMMSS}.json`` inside
    ``archives/`` are considered, so an archive of another collection whose
    name merely starts with ``collection_name`` is never picked up.

    Args:
        collection_name: Name of the archived collection.
        qdrant_url: Qdrant server URL.

    Raises:
        FileNotFoundError: If no archive exists for ``collection_name``.
    """
    import glob
    import re

    archive_dir = Path("archives")

    # Find the export file. The glob narrows the candidates (with wildcard
    # characters in the name escaped); the regex then requires that nothing
    # but the timestamp follows the collection name.
    name_pattern = re.compile(rf"{re.escape(collection_name)}_\d{{8}}_\d{{6}}\.json")
    exports = [
        candidate
        for candidate in archive_dir.glob(f"{glob.escape(collection_name)}_*.json")
        if name_pattern.fullmatch(candidate.name)
    ]
    if not exports:
        raise FileNotFoundError(f"No archive found for '{collection_name}'")

    # Use most recent: the timestamp suffix sorts chronologically as text.
    export_path = max(exports)

    # Import
    import_collection(str(export_path), new_name=collection_name, qdrant_url=qdrant_url)

    # Remove marker
    marker = archive_dir / f"{collection_name}.archived"
    if marker.exists():
        marker.unlink()

    print(f"✅ Restored '{collection_name}' from archive")

Refinement Notes

Add notes as you use these migration tools.

Export/import tested
Merge with deduplication verified
Archive/restore workflow complete

collection-migration

Safety Notice

Copy this and send it to your AI assistant to learn

Source Transparency

Related Skills

ffmpeg-patterns

site-crawler

ai-video-generation