Geographic Clustering
Grid-based clustering with medoid finding and risk scoring from event density.
When to Use This Skill
-
Clustering thousands of geo-located events for visualization
-
Need O(n) performance instead of O(n²) distance-based clustering
-
Want actual data points as map markers (medoids, not synthetic centroids)
-
Calculating risk scores from cluster characteristics
Core Concepts
Grid-based clustering is much faster than distance-based algorithms:
-
O(n) grouping by grid cell vs O(n²) for distance-based
-
Medoid finding gives actual data point as center (better for map markers)
-
Multi-factor risk scoring combines density, sentiment, and recency
Implementation
TypeScript
interface ProcessedEvent { id: string; lat?: number; lon?: number; sentiment: number; seenDate: Date; sourceCountry: string; }
interface Cluster { id: string; events: ProcessedEvent[]; centroidLat: number; centroidLon: number; countryCode: string; }
interface RiskPrediction { id: string; lat: number; lon: number; riskScore: number; sentiment: number; eventCount: number; topSignals: string[]; summary: string; countryCode: string; }
// Grid cell size in degrees (~100km at equator) const GRID_SIZE = 1.0;
/**
- Generate grid cell key for coordinates
*/
function getGridKey(lat: number, lon: number): string {
const gridLat = Math.floor(lat / GRID_SIZE) * GRID_SIZE;
const gridLon = Math.floor(lon / GRID_SIZE) * GRID_SIZE;
return
${gridLat},${gridLon}; }
/**
- Cluster events by geographic grid - O(n) complexity */ function clusterEvents( events: ProcessedEvent[], minClusterSize: number = 3 ): Cluster[] { const grid = new Map<string, ProcessedEvent[]>();
// O(n) grouping by grid cell for (const event of events) { if (event.lat === undefined || event.lon === undefined) continue;
const key = getGridKey(event.lat, event.lon);
if (!grid.has(key)) {
grid.set(key, []);
}
grid.get(key)!.push(event);
}
// Convert grid cells to clusters const clusters: Cluster[] = []; let clusterId = 1;
for (const [key, clusterEvents] of grid) { if (clusterEvents.length < minClusterSize) continue;
const centroidLat = clusterEvents.reduce((sum, e) => sum + (e.lat || 0), 0)
/ clusterEvents.length;
const centroidLon = clusterEvents.reduce((sum, e) => sum + (e.lon || 0), 0)
/ clusterEvents.length;
clusters.push({
id: `cluster-${clusterId++}`,
events: clusterEvents,
centroidLat,
centroidLon,
countryCode: getDominantCountry(clusterEvents),
});
}
return clusters; }
/**
- Find the medoid (most central actual point) in a cluster */ function findMedoid(events: ProcessedEvent[]): ProcessedEvent { if (events.length === 0) throw new Error('Cannot find medoid of empty cluster'); if (events.length === 1) return events[0];
const centroidLat = events.reduce((sum, e) => sum + (e.lat || 0), 0) / events.length; const centroidLon = events.reduce((sum, e) => sum + (e.lon || 0), 0) / events.length;
let medoid = events[0]; let minDist = Infinity;
for (const event of events) { const dist = Math.hypot( (event.lat || 0) - centroidLat, (event.lon || 0) - centroidLon ); if (dist < minDist) { minDist = dist; medoid = event; } }
return medoid; }
/**
- Get most common country code in cluster */ function getDominantCountry(events: ProcessedEvent[]): string { const counts: Record<string, number> = {};
for (const event of events) { const country = event.sourceCountry || 'XX'; counts[country] = (counts[country] || 0) + 1; }
let maxCount = 0; let dominant = 'XX';
for (const [country, count] of Object.entries(counts)) { if (count > maxCount) { maxCount = count; dominant = country; } }
return dominant; }
/**
- Calculate risk score for a cluster (0-100)
- Multi-factor: density + sentiment + recency */ function calculateRiskScore(cluster: Cluster): number { const eventCount = cluster.events.length; const avgSentiment = cluster.events.reduce((sum, e) => sum + e.sentiment, 0) / eventCount;
// Factor 1: Event density (0-40 points) const eventFactor = Math.min(eventCount / 300, 1) * 40;
// Factor 2: Sentiment (0-30 points) const sentimentFactor = Math.abs(Math.min(avgSentiment, 0)) * 30;
// Factor 3: Recency (0-30 points) const now = Date.now(); const avgAge = cluster.events.reduce((sum, e) => { return sum + (now - e.seenDate.getTime()); }, 0) / eventCount; const hoursOld = avgAge / (1000 * 60 * 60); const recencyFactor = Math.max(0, 30 - hoursOld);
const total = eventFactor + sentimentFactor + recencyFactor; return Math.round(Math.min(100, Math.max(0, total))); }
/**
- Convert clusters to risk predictions for API/UI */ function clustersToPredictions(clusters: Cluster[]): RiskPrediction[] { const predictions: RiskPrediction[] = [];
for (const cluster of clusters) { const medoid = findMedoid(cluster.events); const riskScore = calculateRiskScore(cluster); const avgSentiment = cluster.events.reduce((sum, e) => sum + e.sentiment, 0) / cluster.events.length;
const riskLevel = riskScore >= 80 ? 'Critical'
: riskScore >= 60 ? 'High'
: riskScore >= 40 ? 'Moderate'
: 'Low';
predictions.push({
id: cluster.id,
lat: medoid.lat || cluster.centroidLat,
lon: medoid.lon || cluster.centroidLon,
riskScore,
sentiment: avgSentiment,
eventCount: cluster.events.length,
topSignals: generateSignals(cluster, riskScore),
summary: `${riskLevel} risk in ${cluster.countryCode}: ${cluster.events.length} events`,
countryCode: cluster.countryCode,
});
}
return predictions.sort((a, b) => b.riskScore - a.riskScore); }
function generateSignals(cluster: Cluster, riskScore: number): string[] { const signals: string[] = []; const avgSentiment = cluster.events.reduce((sum, e) => sum + e.sentiment, 0) / cluster.events.length;
signals.push(${cluster.events.length} events);
if (avgSentiment < -0.5) signals.push('Very negative tone'); else if (avgSentiment < -0.2) signals.push('Negative tone');
if (riskScore >= 80) signals.push('Critical risk'); else if (riskScore >= 60) signals.push('High risk');
return signals.slice(0, 3); }
Usage Examples
Basic Clustering Pipeline
async function processEvents() { const events = await fetchEvents();
// Cluster geographically (min 3 events per cluster) const clusters = clusterEvents(events, 3);
// Convert to predictions for map const predictions = clustersToPredictions(clusters);
// Return top 50 for visualization return predictions.slice(0, 50); }
Best Practices
-
Grid-based O(n) - Much faster than distance-based for large datasets
-
Medoid vs Centroid - Use actual data point for map markers (won't be in ocean)
-
Multi-factor risk - Combine density, sentiment, and recency
-
Minimum cluster size - Filter noise with threshold (3+ events)
-
Configurable parameters - Make grid size and thresholds adjustable
Common Mistakes
-
Distance-based clustering on large datasets (O(n²) is too slow)
-
Synthetic centroids for map markers (may be in invalid locations)
-
Single-factor risk scoring (misses nuance)
-
No minimum cluster size (noise becomes clusters)
-
Hardcoded risk thresholds (should be configurable)
Related Patterns
-
deduplication - Deduplicate events before clustering
-
batch-processing - Process clusters efficiently
-
snapshot-aggregation - Aggregate cluster data for dashboards