Data Design

Overview

Principles for designing data structures, schemas, and data flows that are efficient, maintainable, and scalable.

Data Modeling

Entity-Relationship Diagrams

┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ User │ │ Order │ │ Product │ ├─────────────┤ ├─────────────┤ ├─────────────┤ │ id (PK) │──┐ │ id (PK) │ ┌──│ id (PK) │ │ email │ │ │ user_id(FK) │←───┘ │ name │ │ name │ └───→│ status │ │ price │ │ created_at │ │ total │ │ stock │ └─────────────┘ │ created_at │ └─────────────┘ └─────────────┘ │ │ │ ┌──────┴──────┐ │ ↓ ↓ │ ┌─────────────┐ │ │ OrderItem │ │ ├─────────────┤ │ │ id (PK) │ │ │ order_id(FK)│ │ │ product_id │─────────────────────┘ │ quantity │ │ price │ └─────────────┘

Relationship Types

Type Description Example

1:1 One to one User ↔ Profile

1:N One to many User → Orders

M:N Many to many Students ↔ Courses

-- 1:1 (profile extends user) CREATE TABLE users ( id SERIAL PRIMARY KEY, email VARCHAR(255) UNIQUE );

CREATE TABLE profiles ( user_id INTEGER PRIMARY KEY REFERENCES users(id), bio TEXT, avatar_url VARCHAR(255) );

-- 1:N (user has many orders) CREATE TABLE orders ( id SERIAL PRIMARY KEY, user_id INTEGER REFERENCES users(id), total DECIMAL(10,2) );

-- M:N (students ↔ courses via junction table) CREATE TABLE enrollments ( student_id INTEGER REFERENCES students(id), course_id INTEGER REFERENCES courses(id), enrolled_at TIMESTAMP DEFAULT NOW(), PRIMARY KEY (student_id, course_id) );

Normalization

Normal Forms

Form Rule Example Violation

1NF Atomic values, no repeating groups tags: "a,b,c"

2NF 1NF + no partial dependencies Non-key depends on part of composite key

3NF 2NF + no transitive dependencies zip → city in orders table

BCNF Every determinant is a candidate key Rare edge cases

-- ❌ Violates 1NF (non-atomic) CREATE TABLE products ( id SERIAL PRIMARY KEY, name VARCHAR(255), tags VARCHAR(255) -- "electronics,sale,featured" );

-- ✅ 1NF compliant CREATE TABLE products ( id SERIAL PRIMARY KEY, name VARCHAR(255) );

CREATE TABLE product_tags ( product_id INTEGER REFERENCES products(id), tag VARCHAR(50), PRIMARY KEY (product_id, tag) );

-- ❌ Violates 3NF (transitive dependency) CREATE TABLE orders ( id SERIAL PRIMARY KEY, customer_zip VARCHAR(10), customer_city VARCHAR(100) -- Depends on zip, not order );

-- ✅ 3NF compliant CREATE TABLE customers ( id SERIAL PRIMARY KEY, zip VARCHAR(10), city VARCHAR(100) );

CREATE TABLE orders ( id SERIAL PRIMARY KEY, customer_id INTEGER REFERENCES customers(id) );

Denormalization

When to Denormalize

Normalize for: ✅ Write-heavy workloads ✅ Data integrity requirements ✅ Storage efficiency ✅ Flexibility in queries

Denormalize for: ✅ Read-heavy workloads ✅ Complex joins hurting performance ✅ Reporting/analytics ✅ Known access patterns

Denormalization Patterns

-- Computed columns CREATE TABLE orders ( id SERIAL PRIMARY KEY, items JSONB, item_count INTEGER GENERATED ALWAYS AS (jsonb_array_length(items)) STORED, total DECIMAL(10,2) );

-- Duplicated data for read performance CREATE TABLE posts ( id SERIAL PRIMARY KEY, author_id INTEGER REFERENCES users(id), author_name VARCHAR(100), -- Duplicated from users author_avatar VARCHAR(255), -- Duplicated from users content TEXT );

-- Materialized view for complex queries CREATE MATERIALIZED VIEW monthly_sales AS SELECT DATE_TRUNC('month', created_at) as month, product_id, SUM(quantity) as units_sold, SUM(total) as revenue FROM order_items GROUP BY 1, 2;

-- Refresh periodically REFRESH MATERIALIZED VIEW CONCURRENTLY monthly_sales;

Schema Design Patterns

Soft Deletes

CREATE TABLE users ( id SERIAL PRIMARY KEY, email VARCHAR(255), deleted_at TIMESTAMP NULL, -- Partial unique index CONSTRAINT unique_active_email UNIQUE (email) WHERE deleted_at IS NULL );

-- Query active users only SELECT * FROM users WHERE deleted_at IS NULL;

Audit Trail

CREATE TABLE audit_log ( id SERIAL PRIMARY KEY, table_name VARCHAR(100), record_id INTEGER, action VARCHAR(10), -- INSERT, UPDATE, DELETE old_data JSONB, new_data JSONB, changed_by INTEGER REFERENCES users(id), changed_at TIMESTAMP DEFAULT NOW() );

-- Trigger for automatic auditing CREATE OR REPLACE FUNCTION audit_trigger() RETURNS TRIGGER AS $$ BEGIN INSERT INTO audit_log (table_name, record_id, action, old_data, new_data, changed_by) VALUES ( TG_TABLE_NAME, COALESCE(NEW.id, OLD.id), TG_OP, CASE WHEN TG_OP != 'INSERT' THEN to_jsonb(OLD) END, CASE WHEN TG_OP != 'DELETE' THEN to_jsonb(NEW) END, current_setting('app.user_id', true)::INTEGER ); RETURN COALESCE(NEW, OLD); END; $$ LANGUAGE plpgsql;

Multi-Tenancy

-- Row-level security CREATE TABLE organizations ( id SERIAL PRIMARY KEY, name VARCHAR(255) );

CREATE TABLE projects ( id SERIAL PRIMARY KEY, org_id INTEGER REFERENCES organizations(id), name VARCHAR(255) );

-- Enable RLS ALTER TABLE projects ENABLE ROW LEVEL SECURITY;

CREATE POLICY org_isolation ON projects USING (org_id = current_setting('app.org_id')::INTEGER);

-- Set org context per request SET app.org_id = 123; SELECT * FROM projects; -- Only sees org 123's projects

Versioning / History

-- Version table pattern CREATE TABLE documents ( id SERIAL PRIMARY KEY, current_version_id INTEGER );

CREATE TABLE document_versions ( id SERIAL PRIMARY KEY, document_id INTEGER REFERENCES documents(id), version INTEGER, content TEXT, created_at TIMESTAMP DEFAULT NOW(), created_by INTEGER REFERENCES users(id), UNIQUE (document_id, version) );

-- Temporal tables (PostgreSQL) CREATE TABLE products ( id SERIAL PRIMARY KEY, name VARCHAR(255), price DECIMAL(10,2), valid_from TIMESTAMP DEFAULT NOW(), valid_to TIMESTAMP DEFAULT 'infinity' );

-- Query historical state SELECT * FROM products WHERE valid_from <= '2024-01-01' AND valid_to > '2024-01-01';

NoSQL Schema Design

Document Store (MongoDB)

// Embedded vs Referenced

// ✅ Embed when: data is accessed together, 1:few relationship { _id: ObjectId("..."), title: "Blog Post", author: { name: "John", email: "john@example.com" }, comments: [ { user: "Jane", text: "Great post!", date: ISODate("...") } ] }

// ✅ Reference when: data is accessed independently, 1:many or M:N { _id: ObjectId("..."), title: "Blog Post", authorId: ObjectId("..."), // Reference to users collection commentIds: [ObjectId("..."), ObjectId("...")] }

// ❌ Anti-pattern: Unbounded arrays { _id: ObjectId("..."), logs: [...] // Can grow to millions, hits 16MB limit }

// ✅ Better: Bucket pattern { _id: ObjectId("..."), sensorId: "sensor-123", date: ISODate("2024-01-15"), readings: [...] // Max ~1000 per document }

Key-Value Store (Redis)

Naming conventions

user:123 # User object user:123:sessions # User's sessions (set) user:123:orders # User's orders (list) order:456 # Order object orders:pending # Queue of pending orders (list) products:category:electronics # Products in category (set)

Expiration patterns

session:{token} # Expires after 30 min rate_limit:ip:1.2.3.4 # Expires after 1 min cache:api:/users/123 # Expires after 5 min

Data Pipeline Design

ETL vs ELT

ETL (Extract, Transform, Load): Source → Transform (external) → Data Warehouse Use: Traditional, when transformation is complex

ELT (Extract, Load, Transform): Source → Data Lake/Warehouse → Transform (in-place) Use: Modern, leverages warehouse compute power

Event Sourcing

// Events are the source of truth interface Event { id: string; aggregateId: string; type: string; payload: unknown; timestamp: Date; version: number; }

// Event store class EventStore { async append(aggregateId: string, events: Event[]) { await db.events.insertMany(events); }

async getEvents(aggregateId: string): Promise<Event[]> { return db.events .find({ aggregateId }) .sort({ version: 1 }) .toArray(); } }

// Rebuild state from events function rebuildAccount(events: Event[]): Account { return events.reduce((account, event) => { switch (event.type) { case 'AccountOpened': return { balance: 0, ...event.payload }; case 'MoneyDeposited': return { ...account, balance: account.balance + event.payload.amount }; case 'MoneyWithdrawn': return { ...account, balance: account.balance - event.payload.amount }; default: return account; } }, {} as Account); }

Data Governance

Data Quality Dimensions

Dimension Description Example Check

Accuracy Correct values Email format validation

Completeness No missing data Required fields present

Consistency Same across systems User ID matches in all tables

Timeliness Up to date Last updated within 24h

Uniqueness No duplicates Unique email per user

Schema Evolution

-- Safe migrations

-- ✅ Adding nullable column ALTER TABLE users ADD COLUMN phone VARCHAR(20) NULL;

-- ✅ Adding column with default ALTER TABLE users ADD COLUMN status VARCHAR(20) DEFAULT 'active';

-- ⚠️ Making column non-null (multi-step) -- Step 1: Add with default ALTER TABLE users ADD COLUMN verified BOOLEAN DEFAULT false; -- Step 2: Backfill data UPDATE users SET verified = true WHERE email_verified_at IS NOT NULL; -- Step 3: Add constraint ALTER TABLE users ALTER COLUMN verified SET NOT NULL;

-- ❌ Dangerous: Renaming column -- Instead: Add new, migrate data, remove old (over multiple deploys)

Related Skills

[[database]] - Database implementation
[[architecture-patterns]] - Data architecture patterns
[[api-design]] - Data in APIs

data-design

Safety Notice

Copy this and send it to your AI assistant to learn

Naming conventions

Expiration patterns

Source Transparency

Related Skills

saas-platforms

architecture-patterns

frontend

project-management