Deepgram Core Workflow B: Streaming Transcription
Overview
Build real-time streaming transcription with the Deepgram WebSocket API. Covers live audio capture, WebSocket connection management, interim/final result handling, and speaker diarization in streaming mode.
Prerequisites
- Deepgram API key
- @deepgram/sdk npm package installed
- Microphone access (for live capture) or audio stream source
- WebSocket support in your runtime
Instructions
Step 1: WebSocket Streaming Connection
import { createClient, LiveTranscriptionEvents } from '@deepgram/sdk';
// Shared Deepgram client, built from the DEEPGRAM_API_KEY environment variable.
// The trailing `!` is a compile-time non-null assertion only: it performs no
// runtime check, so an unset variable is passed to createClient as `undefined`.
const deepgram = createClient(process.env.DEEPGRAM_API_KEY!);
/**
 * Opens a Deepgram live-transcription connection and forwards results to a callback.
 *
 * @param onTranscript - Invoked with every non-empty transcript and a flag telling
 *   whether the result is final. Also invoked with `'\n'` and `true` each time an
 *   utterance-end event arrives.
 * @returns A promise resolving to the live connection. Callers push audio with
 *   `send()` and end the session with `finish()`.
 */
async function startLiveTranscription(onTranscript: (text: string, isFinal: boolean) => void) {
  const connection = deepgram.listen.live({
    model: 'nova-2',
    language: 'en-US',
    smart_format: true,
    interim_results: true,
    utterance_end_ms: 1000, // 1 second, expressed in milliseconds
    vad_events: true,
    diarize: true,
  });

  connection.on(LiveTranscriptionEvents.Open, () => {
    console.log('Deepgram connection opened');
  });

  connection.on(LiveTranscriptionEvents.Transcript, (data) => {
    // Guard against a payload without a channel or alternatives instead of
    // throwing inside the event handler.
    const text = data.channel?.alternatives?.[0]?.transcript;
    if (text) {
      // `is_final` may be absent on a result; treat that as "not final".
      onTranscript(text, data.is_final ?? false);
    }
  });

  connection.on(LiveTranscriptionEvents.UtteranceEnd, () => {
    // End of utterance: emit a line break as a final result.
    onTranscript('\n', true);
  });

  connection.on(LiveTranscriptionEvents.Error, (err) => {
    console.error('Deepgram error:', err);
  });

  connection.on(LiveTranscriptionEvents.Close, () => {
    console.log('Deepgram connection closed');
  });

  return connection;
}
Step 2: Audio Stream from Microphone
import { Readable } from 'stream';
/**
 * Node.js: captures audio from the system microphone with Sox (`rec`), streams
 * it to a live transcription connection, and prints final transcripts to stdout.
 * Capture stops after 30 seconds, or earlier if the recorder fails or exits.
 */
async function captureAndTranscribe() {
  const connection = await startLiveTranscription((text, isFinal) => {
    if (isFinal) {
      process.stdout.write(text);
    }
  });

  // Using Sox for audio capture (install: apt-get install sox)
  const { spawn } = await import('child_process');
  const mic = spawn('rec', [
    '-q', // Quiet
    '-t', 'raw', // Raw format
    '-r', '16000', // 16 kHz sample rate
    '-e', 'signed', // Signed integer encoding
    '-b', '16', // 16-bit
    '-c', '1', // Mono
    '-', // Output to stdout
  ]);
  // NOTE(review): this sends headerless PCM; the live options presumably need
  // matching `encoding` / `sample_rate` settings for raw audio — confirm against
  // the Deepgram streaming docs.

  // Tear everything down exactly once, whichever of timer / error / exit fires first.
  let stopped = false;
  const stop = () => {
    if (stopped) return;
    stopped = true;
    clearTimeout(stopTimer);
    mic.kill();
    connection.finish();
  };

  mic.stdout.on('data', (chunk: Buffer) => {
    connection.send(chunk);
  });

  // Without an 'error' listener, a failed spawn (e.g. `rec` is not installed)
  // is raised as an unhandled 'error' event.
  mic.on('error', (err) => {
    console.error('Audio capture failed:', err);
    stop();
  });

  // If the recorder exits on its own, close the connection instead of leaving
  // it open until the timer fires.
  mic.on('close', stop);

  // Stop after 30 seconds
  const stopTimer = setTimeout(stop, 30000);
}
Step 3: Handle Interim and Final Results
/**
 * Accumulates streaming transcription results.
 *
 * Final results are appended to a running transcript, each followed by a
 * space. Interim results are held only until the next result replaces or
 * clears them.
 */
class TranscriptionManager {
  private finalTranscript = '';
  private interimTranscript = '';

  /**
   * Records one result.
   *
   * @param text - Transcript text of the result.
   * @param isFinal - When true the text is committed and the interim text is
   *   cleared; otherwise the text replaces the current interim text.
   */
  handleResult(text: string, isFinal: boolean) {
    if (!isFinal) {
      this.interimTranscript = text;
      return;
    }
    this.finalTranscript = `${this.finalTranscript}${text} `;
    this.interimTranscript = '';
  }

  /** Returns the committed transcript followed by the current interim text. */
  getDisplayText(): string {
    return `${this.finalTranscript}${this.interimTranscript}`;
  }

  /** Returns the committed transcript with surrounding whitespace trimmed. */
  getFinalTranscript(): string {
    const committed = this.finalTranscript;
    return committed.trim();
  }

  /** Clears both the committed and the interim text. */
  reset() {
    this.interimTranscript = '';
    this.finalTranscript = '';
  }
}
// Usage with WebSocket
// Top-level `await` requires this snippet to run as an ES module (or inside an
// async function). `updateUI` is not defined in this guide: supply your own
// rendering function.
const manager = new TranscriptionManager();
const connection = await startLiveTranscription((text, isFinal) => {
  manager.handleResult(text, isFinal);
  // Update UI with current display text
  updateUI(manager.getDisplayText());
});
Step 4: Speaker Diarization in Streaming
/** A run of consecutive words attributed to the same speaker. */
interface SpeakerSegment {
  speaker: number;
  text: string;
  startTime: number;
  endTime: number;
}

/**
 * Groups the words of one transcript result into per-speaker segments.
 *
 * Consecutive words with the same `speaker` value are merged into one segment
 * whose text is the words joined by single spaces (preferring `punctuated_word`
 * over `word`), spanning the first word's `start` to the last word's `end`.
 *
 * @param data - Transcript payload; words are read from
 *   `data.channel.alternatives[0].words`.
 * @returns Segments in word order; empty when the payload carries no words.
 */
function processDiarizedTranscript(data: any): SpeakerSegment[] {
  // Optional chaining keeps a payload without channel/alternatives from throwing.
  const words = data?.channel?.alternatives?.[0]?.words || [];
  const segments: SpeakerSegment[] = [];
  let currentSegment: SpeakerSegment | null = null;

  for (const word of words) {
    const token = word.punctuated_word || word.word;
    if (!currentSegment || currentSegment.speaker !== word.speaker) {
      // Speaker changed (or first word): close the previous segment, start a new one.
      if (currentSegment) segments.push(currentSegment);
      currentSegment = {
        speaker: word.speaker,
        text: token,
        startTime: word.start,
        endTime: word.end,
      };
    } else {
      currentSegment.text += ' ' + token;
      currentSegment.endTime = word.end;
    }
  }

  if (currentSegment) segments.push(currentSegment);
  return segments;
}

/**
 * Display with speaker labels: renders each segment as `[Speaker N]: text`,
 * one segment per line.
 */
function formatDiarizedOutput(segments: SpeakerSegment[]): string {
  return segments
    .map((s) => `[Speaker ${s.speaker}]: ${s.text}`)
    .join('\n');
}
Error Handling
| Issue | Cause | Solution |
|-------|-------|----------|
| WebSocket disconnects | Network instability | Implement auto-reconnect with backoff |
| No audio data | Microphone not captured | Check audio device permissions and format |
| High latency | Network congestion | Use `interim_results: true` for perceived speed |
| Missing speakers | Diarization not enabled | Set `diarize: true` in connection options |
Examples
Express SSE Streaming Endpoint
// Streams transcription results to the client as Server-Sent Events: each
// result is written as one `data:` event carrying `{ text, isFinal }` as JSON.
app.get('/api/transcribe-stream', async (req, res) => {
  res.setHeader('Content-Type', 'text/event-stream');

  try {
    // startLiveTranscription is async, so it must be awaited to obtain the
    // connection itself rather than a Promise (which has no finish()).
    const connection = await startLiveTranscription((text, isFinal) => {
      res.write(`data: ${JSON.stringify({ text, isFinal })}\n\n`);
    });

    // Release the Deepgram connection when the client disconnects.
    req.on('close', () => connection.finish());
  } catch (err) {
    // Report a start-up failure to the client instead of leaving the
    // handler's promise rejected.
    console.error('Failed to start transcription:', err);
    res.status(500).end();
  }
});
Resources
- Deepgram Streaming API
- Deepgram Node SDK
- Deepgram Models
Output
- Configuration files or code changes applied to the project
- Validation report confirming correct implementation
- Summary of changes made and their rationale