/**
 * Audio preprocessing pipeline for Azure Speech Services.
 *
 * Converts incoming audio to Azure-compatible format:
 * - 16kHz sample rate
 * - Mono channel
 * - 16-bit signed PCM
 * - Normalized to -3 dBFS
 * - Silence trimmed using energy-based VAD
 * - Max 30 seconds duration
 */

export interface ProcessedAudio {
  /** 16-bit PCM samples at 16kHz mono */
  samples: Int16Array;
  /** Sample rate (always 16000) */
  sampleRate: number;
  /** Number of channels (always 1) */
  channels: number;
  /** Duration in seconds */
  duration: number;
  /** Raw PCM Buffer ready for Azure API (WAV-headerless 16-bit mono 16kHz) */
  pcmBuffer: Buffer;
  /** RMS energy (pre-normalization) for quality assessment */
  rmsEnergy: number;
  /** Peak amplitude (pre-normalization) */
  peakAmplitude: number;
  /** Signal-to-noise ratio estimate in dB */
  snrEstimate: number;
}

interface WavHeader {
  /** WAVE format tag from the fmt chunk (1 = integer PCM) */
  audioFormat: number;
  numChannels: number;
  sampleRate: number;
  byteRate: number;
  blockAlign: number;
  bitsPerSample: number;
  /** Size of the data payload in bytes, clamped to the bytes actually present in the buffer */
  dataSize: number;
}

/** Maximum allowed audio duration in seconds */
const MAX_DURATION_SEC = 30;

/**
 * Extra seconds tolerated by the header-based duration check that runs before decoding.
 * NOTE(review): presumably leeway for leading/trailing silence that VAD removes later — confirm.
 */
const PRE_TRIM_DURATION_ALLOWANCE_SEC = 30;

/** Output sample rate in Hz */
const TARGET_SAMPLE_RATE = 16000;

/** Fallback for MAX_INPUT_BYTES when the environment variable is absent or unusable (5MB) */
const DEFAULT_MAX_INPUT_BYTES = 5 * 1024 * 1024;

/** Maximum raw WAV file size before processing (default 5MB). Prevents memory exhaustion. */
const MAX_INPUT_BYTES = resolveMaxInputBytes(process.env.VOICEPRINT_MAX_INPUT_BYTES);

/** Target normalization level in dBFS */
const TARGET_DBFS = -3;

/** Frame size for VAD in milliseconds */
const VAD_FRAME_MS = 30;

/** Energy threshold multiplier (× mean energy) for VAD */
const VAD_ENERGY_THRESHOLD = 1.5;

/** Minimum consecutive speech frames to consider as voice activity */
const VAD_MIN_SPEECH_FRAMES = 3;

/**
 * Resolve the input size limit from its raw environment value.
 *
 * A missing, non-numeric, or non-positive value falls back to the default.
 * Without this, a malformed value parses to NaN and `length > NaN` is always
 * false, which would silently disable the size guard.
 *
 * @param raw Raw environment variable value, if any.
 * @returns A positive byte limit.
 */
function resolveMaxInputBytes(raw: string | undefined): number {
  const parsed = Number.parseInt(raw ?? "", 10);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : DEFAULT_MAX_INPUT_BYTES;
}

/**
 * Parse a WAV file header and return header info + data offset.
 * Supports PCM formats with 8, 16, 24, or 32-bit samples.
 *
 * The returned `dataSize` is clamped to the bytes actually present after the
 * data chunk header, so truncated files and streaming WAVs that declare an
 * oversized data chunk cannot cause out-of-range reads or huge allocations.
 *
 * @param buffer Complete WAV file contents.
 * @returns The parsed format fields and the byte offset of the first sample.
 * @throws Error if the RIFF/WAVE container is malformed, the fmt or data chunk
 *   is missing, or the format is not integer PCM with a supported layout.
 */
function parseWavHeader(buffer: Buffer): { header: WavHeader; dataOffset: number } {
  if (buffer.length < 44) {
    throw new Error(`Audio too short to be valid WAV: ${buffer.length} bytes`);
  }

  const riffId = buffer.toString("ascii", 0, 4);
  if (riffId !== "RIFF") {
    throw new Error(`Invalid RIFF header: ${riffId}`);
  }

  const waveId = buffer.toString("ascii", 8, 12);
  if (waveId !== "WAVE") {
    throw new Error(`Invalid WAVE header: ${waveId}`);
  }

  let format: Omit<WavHeader, "dataSize"> | undefined;
  let dataSize = 0;
  let dataOffset = 0;

  // Walk the chunk list that follows the 12-byte RIFF/WAVE preamble.
  let offset = 12;
  while (offset + 8 <= buffer.length) {
    const chunkId = buffer.toString("ascii", offset, offset + 4);
    const chunkSize = buffer.readUInt32LE(offset + 4);
    const payloadOffset = offset + 8;

    if (chunkId === "fmt ") {
      // The fixed part of a PCM fmt chunk is 16 bytes; refuse to read past the buffer.
      if (chunkSize < 16 || payloadOffset + 16 > buffer.length) {
        throw new Error("Invalid or truncated fmt chunk in WAV file");
      }
      format = {
        audioFormat: buffer.readUInt16LE(payloadOffset),
        numChannels: buffer.readUInt16LE(payloadOffset + 2),
        sampleRate: buffer.readUInt32LE(payloadOffset + 4),
        byteRate: buffer.readUInt32LE(payloadOffset + 8),
        blockAlign: buffer.readUInt16LE(payloadOffset + 12),
        bitsPerSample: buffer.readUInt16LE(payloadOffset + 14),
      };
    } else if (chunkId === "data") {
      dataOffset = payloadOffset;
      // Never trust the declared size beyond what the buffer really holds.
      dataSize = Math.min(chunkSize, buffer.length - payloadOffset);
      break;
    }

    // Chunks are word-aligned: an odd-sized payload is followed by one pad byte.
    offset = payloadOffset + chunkSize + (chunkSize % 2);
  }

  if (!dataOffset || !dataSize) {
    throw new Error("No data chunk found in WAV file");
  }
  if (!format) {
    throw new Error("No fmt chunk found before data chunk in WAV file");
  }
  if (format.audioFormat !== 1) {
    throw new Error(`Unsupported audio format: ${format.audioFormat} (only PCM/1 supported)`);
  }
  if (format.numChannels < 1 || format.sampleRate < 1) {
    throw new Error(
      `Invalid WAV format: ${format.numChannels} channel(s) at ${format.sampleRate} Hz`,
    );
  }
  if (![8, 16, 24, 32].includes(format.bitsPerSample)) {
    throw new Error(`Unsupported bits per sample: ${format.bitsPerSample}`);
  }

  return { header: { ...format, dataSize }, dataOffset };
}

/**
 * Read raw PCM samples from a WAV data section into Float64Array,
 * normalizing to the range [-1.0, 1.0].
 *
 * Samples stay interleaved when the source has more than one channel.
 *
 * @param buffer Complete WAV file contents.
 * @param header Parsed header; `bitsPerSample` and `dataSize` are used.
 * @param dataOffset Byte offset of the first sample.
 * @returns One float per PCM sample.
 * @throws Error if `bitsPerSample` is not 8, 16, 24, or 32.
 */
function readPcmSamples(buffer: Buffer, header: WavHeader, dataOffset: number): Float64Array {
  const { bitsPerSample } = header;
  if (![8, 16, 24, 32].includes(bitsPerSample)) {
    throw new Error(`Unsupported bits per sample: ${bitsPerSample}`);
  }

  const bytesPerSample = bitsPerSample / 8;
  // Clamp defensively so a header that overstates the payload cannot read past the buffer.
  const availableBytes = Math.max(0, buffer.length - dataOffset);
  const sampleCount = Math.floor(Math.min(header.dataSize, availableBytes) / bytesPerSample);
  const samples = new Float64Array(sampleCount);

  switch (bitsPerSample) {
    case 8:
      for (let i = 0; i < sampleCount; i++) {
        // 8-bit PCM is unsigned, bias = 128
        samples[i] = (buffer.readUInt8(dataOffset + i) - 128) / 128;
      }
      break;
    case 16:
      for (let i = 0; i < sampleCount; i++) {
        samples[i] = buffer.readInt16LE(dataOffset + i * 2) / 32768;
      }
      break;
    case 24:
      for (let i = 0; i < sampleCount; i++) {
        // 24-bit signed, little-endian; readIntLE performs the sign extension.
        samples[i] = buffer.readIntLE(dataOffset + i * 3, 3) / 8388608;
      }
      break;
    default:
      for (let i = 0; i < sampleCount; i++) {
        samples[i] = buffer.readInt32LE(dataOffset + i * 4) / 2147483648;
      }
      break;
  }

  return samples;
}

/**
 * Convert multi-channel samples to mono by averaging channels.
 *
 * @param samples Interleaved samples.
 * @param numChannels Number of interleaved channels.
 * @returns The input itself when already mono, otherwise a new array with one
 *   averaged value per complete frame (a trailing partial frame is dropped).
 */
function convertToMono(samples: Float64Array, numChannels: number): Float64Array {
  if (numChannels === 1) return samples;

  const frameCount = Math.floor(samples.length / numChannels);
  const mono = new Float64Array(frameCount);
  for (let frame = 0; frame < frameCount; frame++) {
    const base = frame * numChannels;
    let sum = 0;
    for (let ch = 0; ch < numChannels; ch++) {
      sum += samples[base + ch];
    }
    mono[frame] = sum / numChannels;
  }
  return mono;
}

/**
 * Resample audio using linear interpolation.
 *
 * No low-pass filter is applied before interpolation, so content above half
 * the target rate is not removed when downsampling.
 *
 * @param samples Mono samples at `fromRate`.
 * @param fromRate Source sample rate in Hz.
 * @param toRate Target sample rate in Hz.
 * @returns The input itself when the rates match, otherwise a new array.
 * @throws Error if either rate is not a positive number.
 */
function resample(samples: Float64Array, fromRate: number, toRate: number): Float64Array {
  if (!(fromRate > 0) || !(toRate > 0)) {
    throw new Error(`Invalid sample rate for resampling: ${fromRate} -> ${toRate}`);
  }
  if (fromRate === toRate) return samples;

  const ratio = fromRate / toRate;
  const outputLength = Math.floor(samples.length / ratio);
  const output = new Float64Array(outputLength);
  const lastIndex = samples.length - 1;

  for (let i = 0; i < outputLength; i++) {
    const inputPos = i * ratio;
    const inputIndex = Math.floor(inputPos);
    if (inputIndex >= lastIndex) {
      // Past the final interpolation pair: hold the last sample.
      output[i] = samples[lastIndex];
    } else {
      const frac = inputPos - inputIndex;
      output[i] = samples[inputIndex] * (1 - frac) + samples[inputIndex + 1] * frac;
    }
  }
  return output;
}

/**
 * Normalize audio to target dBFS level.
 *
 * Applies a single gain so the absolute peak lands on the target level.
 *
 * @param samples Input samples.
 * @param targetDbfs Desired peak level in dBFS (e.g. -3).
 * @returns The input itself when it is silent or already within 0.1% of the
 *   target gain, otherwise a new scaled array.
 */
function normalize(samples: Float64Array, targetDbfs: number): Float64Array {
  let peak = 0;
  for (let i = 0; i < samples.length; i++) {
    const abs = Math.abs(samples[i]);
    if (abs > peak) peak = abs;
  }
  if (peak === 0) return samples;

  const targetAmplitude = 10 ** (targetDbfs / 20);
  const gain = targetAmplitude / peak;
  if (Math.abs(gain - 1) < 0.001) return samples;

  return samples.map((value) => value * gain);
}

/**
 * Calculate RMS energy of a frame of samples.
 *
 * @param frame Samples to measure.
 * @returns Root-mean-square amplitude; 0 for an empty frame.
 */
function rmsEnergy(frame: Float64Array): number {
  if (frame.length === 0) return 0;

  let sumSq = 0;
  for (let i = 0; i < frame.length; i++) {
    sumSq += frame[i] * frame[i];
  }
  return Math.sqrt(sumSq / frame.length);
}

/**
 * Simple energy-based Voice Activity Detection.
 * Trims leading and trailing silence, preserving internal pauses.
 *
 * A frame counts as speech when its RMS reaches VAD_ENERGY_THRESHOLD × the mean
 * frame RMS (with a 0.001 floor). The kept region runs from the start of the
 * first run of VAD_MIN_SPEECH_FRAMES speech frames to the end of the last one.
 *
 * @param samples Mono samples.
 * @param sampleRate Sample rate in Hz.
 * @returns A view over the speech region, or the input itself when it is
 *   shorter than two frames or no clear speech is found.
 */
function vadTrim(samples: Float64Array, sampleRate: number): Float64Array {
  const frameSize = Math.floor((sampleRate * VAD_FRAME_MS) / 1000);
  // A sub-sample frame size would make the frame count meaningless.
  if (frameSize < 1) return samples;

  const numFrames = Math.floor(samples.length / frameSize);
  if (numFrames < 2) return samples;

  // Calculate energy for each frame
  const frameEnergies = new Float64Array(numFrames);
  let energySum = 0;
  for (let f = 0; f < numFrames; f++) {
    const start = f * frameSize;
    const energy = rmsEnergy(samples.subarray(start, start + frameSize));
    frameEnergies[f] = energy;
    energySum += energy;
  }

  // Calculate threshold as mean energy × multiplier
  const meanEnergy = energySum / numFrames;
  const threshold = Math.max(meanEnergy * VAD_ENERGY_THRESHOLD, 0.001);

  // Label frames as speech or silence
  const isSpeech: boolean[] = Array.from(frameEnergies, (energy) => energy >= threshold);

  // Find first speech frame (with minimum consecutive speech requirement)
  let firstSpeech = -1;
  let consecutiveCount = 0;
  for (let f = 0; f < numFrames; f++) {
    consecutiveCount = isSpeech[f] ? consecutiveCount + 1 : 0;
    if (consecutiveCount >= VAD_MIN_SPEECH_FRAMES) {
      firstSpeech = f - VAD_MIN_SPEECH_FRAMES + 1;
      break;
    }
  }

  // Find last speech frame (from the end)
  let lastSpeech = -1;
  consecutiveCount = 0;
  for (let f = numFrames - 1; f >= 0; f--) {
    consecutiveCount = isSpeech[f] ? consecutiveCount + 1 : 0;
    if (consecutiveCount >= VAD_MIN_SPEECH_FRAMES) {
      lastSpeech = f + VAD_MIN_SPEECH_FRAMES - 1;
      break;
    }
  }

  if (firstSpeech === -1 || lastSpeech === -1 || firstSpeech >= lastSpeech) {
    // No clear speech detected, return original
    return samples;
  }

  const startSample = Math.max(0, firstSpeech * frameSize);
  const endSample = Math.min(samples.length, (lastSpeech + 1) * frameSize);
  return samples.subarray(startSample, endSample);
}

/**
 * Convert Float64Array samples to 16-bit signed Int16Array PCM.
 *
 * @param samples Float samples; values outside [-1.0, 1.0] are clamped.
 * @returns A new Int16Array scaled by 32767.
 */
function floatTo16BitPcm(samples: Float64Array): Int16Array {
  const pcm = new Int16Array(samples.length);
  for (let i = 0; i < samples.length; i++) {
    // Clamp to [-1.0, 1.0] and convert to 16-bit
    const clamped = Math.max(-1, Math.min(1, samples[i]));
    pcm[i] = Math.round(clamped * 32767);
  }
  return pcm;
}

/**
 * Audio quality metrics for confidence scoring.
 *
 * @param samples Mono samples (measured before normalization by the pipeline).
 * @returns RMS energy, absolute peak, and an SNR estimate clamped to [0, 40] dB.
 *   All three are 0 for empty input.
 */
function computeQualityMetrics(samples: Float64Array): {
  rmsEnergy: number;
  peakAmplitude: number;
  snrEstimate: number;
} {
  // Without this guard an empty input yields NaN RMS and an undefined noise floor.
  if (samples.length === 0) {
    return { rmsEnergy: 0, peakAmplitude: 0, snrEstimate: 0 };
  }

  let peak = 0;
  let sumSq = 0;
  for (let i = 0; i < samples.length; i++) {
    const abs = Math.abs(samples[i]);
    if (abs > peak) peak = abs;
    sumSq += samples[i] * samples[i];
  }
  const rms = Math.sqrt(sumSq / samples.length);

  // Simple SNR estimate: take the 10th-percentile absolute sample value as the noise floor.
  // Typed-array sort() is numeric by default, so no comparator is needed.
  const sortedMagnitudes = samples.map((value) => Math.abs(value)).sort();
  const noiseFloor = sortedMagnitudes[Math.floor(samples.length * 0.1)];
  const snr = noiseFloor > 0 ? 20 * Math.log10(rms / noiseFloor) : 40;

  return {
    rmsEnergy: rms,
    peakAmplitude: peak,
    snrEstimate: Math.min(40, Math.max(0, snr)),
  };
}

/**
 * Main audio preprocessing pipeline:
 * 1. Validate input size
 * 2. Parse WAV header
 * 3. Validate duration from header (reject too-long audio before decoding)
 * 4. Read PCM samples
 * 5. Convert to mono
 * 6. Resample to 16kHz
 * 7. Normalize to -3 dBFS
 * 8. VAD silence trimming
 * 9. Limit to 30 seconds
 * 10. Convert to 16-bit PCM
 *
 * @param inputBuffer Complete WAV file contents.
 * @returns The processed audio plus quality metrics measured before normalization.
 * @throws Error if the input is too large, is not a supported PCM WAV file, or
 *   is too long according to its header.
 */
export async function preprocessAudio(inputBuffer: Buffer): Promise<ProcessedAudio> {
  // Reject oversized input early to prevent memory exhaustion
  if (inputBuffer.length > MAX_INPUT_BYTES) {
    throw new Error(
      `Audio file too large: ${(inputBuffer.length / 1024 / 1024).toFixed(1)}MB. ` +
        `Maximum ${(MAX_INPUT_BYTES / 1024 / 1024).toFixed(0)}MB. ` +
        `Please upload a shorter audio clip (max ${MAX_DURATION_SEC} seconds).`,
    );
  }

  // Detect if it's a WAV by checking RIFF header
  const isWav = inputBuffer.length >= 4 && inputBuffer.toString("ascii", 0, 4) === "RIFF";
  if (!isWav) {
    throw new Error(
      "Unsupported audio format. Only WAV files are supported. " +
        "Please upload a WAV file (PCM encoded).",
    );
  }

  const { header, dataOffset } = parseWavHeader(inputBuffer);

  // Validate duration from header BEFORE allocating sample buffers.
  // This prevents loading multi-hour WAV files into memory.
  const totalSamples = Math.floor(
    header.dataSize / (header.bitsPerSample / 8) / header.numChannels,
  );
  const durationSec = totalSamples / header.sampleRate;
  if (durationSec > MAX_DURATION_SEC + PRE_TRIM_DURATION_ALLOWANCE_SEC) {
    throw new Error(
      `Audio too long: ${durationSec.toFixed(1)}s. Maximum ${MAX_DURATION_SEC}s for analysis. ` +
        `Please trim your audio before uploading.`,
    );
  }

  let samples = readPcmSamples(inputBuffer, header, dataOffset);

  // Convert to mono
  samples = convertToMono(samples, header.numChannels);

  // Store quality metrics before normalization
  const metrics = computeQualityMetrics(samples);

  // Resample to 16kHz
  samples = resample(samples, header.sampleRate, TARGET_SAMPLE_RATE);

  // Normalize to -3 dBFS
  samples = normalize(samples, TARGET_DBFS);

  // VAD silence trimming
  samples = vadTrim(samples, TARGET_SAMPLE_RATE);

  // Limit to 30 seconds (480,000 samples at 16kHz)
  const maxSamples = TARGET_SAMPLE_RATE * MAX_DURATION_SEC;
  if (samples.length > maxSamples) {
    samples = samples.subarray(0, maxSamples);
  }

  // Convert to 16-bit PCM
  const pcmSamples = floatTo16BitPcm(samples);

  // Create raw PCM buffer (no WAV header - Azure expects raw PCM).
  // This is a view over the same memory as `pcmSamples`, not a copy.
  const pcmBuffer = Buffer.from(pcmSamples.buffer, pcmSamples.byteOffset, pcmSamples.byteLength);

  return {
    samples: pcmSamples,
    sampleRate: TARGET_SAMPLE_RATE,
    channels: 1,
    duration: samples.length / TARGET_SAMPLE_RATE,
    pcmBuffer,
    rmsEnergy: metrics.rmsEnergy,
    peakAmplitude: metrics.peakAmplitude,
    snrEstimate: metrics.snrEstimate,
  };
}