/**
 * Audio preprocessing pipeline for Azure Speech Services.
 *
 * Converts incoming audio to an Azure-compatible format:
 * - 16 kHz sample rate
 * - Mono channel
 * - 16-bit signed PCM
 * - Normalized to -3 dBFS
 * - Silence trimmed using energy-based VAD
 * - Max 30 seconds duration
 */

/**
 * Output of the preprocessing pipeline: the converted audio plus a few
 * quality metrics measured on the signal before it was normalized.
 */
export interface ProcessedAudio {
  /** 16-bit PCM samples at 16kHz mono */
  samples: Int16Array;
  /** Sample rate (always 16000) */
  sampleRate: number;
  /** Number of channels (always 1) */
  channels: number;
  /** Duration in seconds */
  duration: number;
  /** Raw PCM Buffer ready for Azure API (WAV-headerless 16-bit mono 16kHz) */
  pcmBuffer: Buffer;
  /** RMS energy (pre-normalization) for quality assessment */
  rmsEnergy: number;
  /** Peak amplitude (pre-normalization) */
  peakAmplitude: number;
  /** Signal-to-noise ratio estimate in dB */
  snrEstimate: number;
}

/**
 * Fields extracted from a WAV file's `fmt ` chunk, plus the size of its
 * `data` chunk.
 */
interface WavHeader {
  /** Format tag from the `fmt ` chunk (1 = PCM) */
  audioFormat: number;
  /** Number of interleaved channels */
  numChannels: number;
  /** Sample rate in Hz */
  sampleRate: number;
  /** Byte rate as declared in the `fmt ` chunk */
  byteRate: number;
  /** Block alignment as declared in the `fmt ` chunk */
  blockAlign: number;
  /** Bits per sample */
  bitsPerSample: number;
  /** Size of the `data` chunk payload in bytes */
  dataSize: number;
}

/** Maximum allowed audio duration in seconds */
const MAX_DURATION_SEC = 30;

/** Input size cap used when no valid override is configured (5 MiB). */
const DEFAULT_MAX_INPUT_BYTES = 5 * 1024 * 1024;

/**
 * Turn the raw `VOICEPRINT_MAX_INPUT_BYTES` setting into a usable byte limit.
 *
 * A missing, empty, non-numeric, zero or negative value falls back to the
 * default. Without this, a malformed value parses to NaN and every
 * `length > NaN` comparison is false, which silently removes the size guard.
 *
 * @param raw - Raw environment value, or `undefined` when unset.
 * @returns A positive integer byte limit.
 */
function parseMaxInputBytes(raw: string | undefined): number {
  if (raw === undefined) return DEFAULT_MAX_INPUT_BYTES;
  const parsed = Number.parseInt(raw, 10);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : DEFAULT_MAX_INPUT_BYTES;
}

/** Maximum raw WAV file size before processing (default 5MB). Prevents memory exhaustion. */
const MAX_INPUT_BYTES = parseMaxInputBytes(process.env.VOICEPRINT_MAX_INPUT_BYTES);

/** Target normalization level in dBFS */
const TARGET_DBFS = -3;

/** Frame size for VAD in milliseconds */
const VAD_FRAME_MS = 30;

/** Energy threshold multiplier (× mean energy) for VAD */
const VAD_ENERGY_THRESHOLD = 1.5;

/** Minimum consecutive speech frames to consider as voice activity */
const VAD_MIN_SPEECH_FRAMES = 3;

/**
 * Parse a RIFF/WAVE container header and locate the sample payload.
 *
 * Walks the chunk list that follows the 12-byte RIFF/WAVE preamble, copies the
 * format fields out of the `fmt ` chunk and stops at the first `data` chunk.
 * Chunks with any other id are skipped, including the pad byte that follows
 * an odd-sized chunk. Bit depth is not validated here.
 *
 * The returned `dataSize` is clamped to the bytes actually present after the
 * `data` chunk header, so a truncated file or an over-declared size cannot
 * make later stages read or allocate beyond the buffer.
 *
 * @param buffer - Complete WAV file contents.
 * @returns The parsed format fields and the byte offset of the first sample.
 * @throws Error if the buffer is shorter than 44 bytes, lacks the RIFF/WAVE
 *   magic, has a `fmt ` chunk that is truncated or shorter than 16 bytes, has
 *   no non-empty `data` chunk, or declares a format other than 1 (PCM).
 */
function parseWavHeader(buffer: Buffer): { header: WavHeader; dataOffset: number } {
  if (buffer.length < 44) {
    throw new Error(`Audio too short to be valid WAV: ${buffer.length} bytes`);
  }

  const riffId = buffer.toString("ascii", 0, 4);
  if (riffId !== "RIFF") {
    throw new Error(`Invalid RIFF header: ${riffId}`);
  }

  const waveId = buffer.toString("ascii", 8, 12);
  if (waveId !== "WAVE") {
    throw new Error(`Invalid WAVE header: ${waveId}`);
  }

  let audioFormat = 0;
  let numChannels = 0;
  let sampleRate = 0;
  let byteRate = 0;
  let blockAlign = 0;
  let bitsPerSample = 0;
  let dataSize = 0;
  let dataOffset = 0;

  // Each chunk starts with an 8-byte header: 4-byte ASCII id + 4-byte LE size.
  let offset = 12;
  while (offset + 8 <= buffer.length) {
    const chunkId = buffer.toString("ascii", offset, offset + 4);
    const chunkSize = buffer.readUInt32LE(offset + 4);
    const bodyOffset = offset + 8;

    if (chunkId === "fmt ") {
      // The six fields read below occupy the first 16 bytes of the chunk body.
      if (chunkSize < 16 || bodyOffset + 16 > buffer.length) {
        throw new Error("Truncated or malformed fmt chunk in WAV file");
      }
      audioFormat = buffer.readUInt16LE(bodyOffset);
      numChannels = buffer.readUInt16LE(bodyOffset + 2);
      sampleRate = buffer.readUInt32LE(bodyOffset + 4);
      byteRate = buffer.readUInt32LE(bodyOffset + 8);
      blockAlign = buffer.readUInt16LE(bodyOffset + 12);
      bitsPerSample = buffer.readUInt16LE(bodyOffset + 14);
    } else if (chunkId === "data") {
      // Never report more payload than the buffer actually holds.
      dataSize = Math.min(chunkSize, buffer.length - bodyOffset);
      dataOffset = bodyOffset;
      break;
    }

    // Chunks are word-aligned: an odd-sized body is followed by one pad byte.
    offset = bodyOffset + chunkSize + (chunkSize % 2);
  }

  if (!dataOffset || !dataSize) {
    throw new Error("No data chunk found in WAV file");
  }

  if (audioFormat !== 1) {
    throw new Error(`Unsupported audio format: ${audioFormat} (only PCM/1 supported)`);
  }

  return {
    header: { audioFormat, numChannels, sampleRate, byteRate, blockAlign, bitsPerSample, dataSize },
    dataOffset,
  };
}

/**
 * Decode the integer PCM payload of a WAV file into floating-point samples
 * scaled to roughly [-1.0, 1.0].
 *
 * Channels are left interleaved; the result holds one entry per stored sample.
 * 8-bit data is treated as unsigned with a bias of 128; 16-, 24- and 32-bit
 * data is read as signed little-endian.
 *
 * The bit depth is validated before anything is allocated, and the number of
 * samples is limited to what fits between `dataOffset` and the end of the
 * buffer, so an over-declared `dataSize` cannot cause out-of-range reads or
 * an oversized allocation.
 *
 * @param buffer - Complete WAV file contents.
 * @param header - Parsed header; only `bitsPerSample` and `dataSize` are used.
 * @param dataOffset - Byte offset of the first sample within `buffer`.
 * @returns Decoded samples as a Float64Array.
 * @throws Error if `bitsPerSample` is not 8, 16, 24 or 32.
 */
function readPcmSamples(buffer: Buffer, header: WavHeader, dataOffset: number): Float64Array {
  const { bitsPerSample } = header;

  if (
    bitsPerSample !== 8 &&
    bitsPerSample !== 16 &&
    bitsPerSample !== 24 &&
    bitsPerSample !== 32
  ) {
    throw new Error(`Unsupported bits per sample: ${bitsPerSample}`);
  }

  const bytesPerSample = bitsPerSample / 8;
  const availableBytes = Math.max(0, buffer.length - dataOffset);
  const payloadBytes = Math.min(header.dataSize, availableBytes);
  const sampleCount = Math.floor(payloadBytes / bytesPerSample);
  const samples = new Float64Array(sampleCount);

  switch (bitsPerSample) {
    case 8:
      for (let i = 0; i < sampleCount; i++) {
        // 8-bit PCM is unsigned, bias = 128
        samples[i] = (buffer.readUInt8(dataOffset + i) - 128) / 128;
      }
      break;
    case 16:
      for (let i = 0; i < sampleCount; i++) {
        samples[i] = buffer.readInt16LE(dataOffset + i * 2) / 32768;
      }
      break;
    case 24:
      for (let i = 0; i < sampleCount; i++) {
        // readIntLE performs the 24-bit two's-complement sign extension.
        samples[i] = buffer.readIntLE(dataOffset + i * 3, 3) / 8388608;
      }
      break;
    case 32:
      for (let i = 0; i < sampleCount; i++) {
        samples[i] = buffer.readInt32LE(dataOffset + i * 4) / 2147483648;
      }
      break;
  }

  return samples;
}

/**
 * Down-mix interleaved multi-channel audio to mono by averaging the channels
 * of each frame.
 *
 * Single-channel input is returned as-is (same array, no copy). Samples left
 * over after the last complete frame are dropped.
 *
 * @param samples - Interleaved samples, one value per channel per frame.
 * @param numChannels - Number of interleaved channels; must be an integer >= 1.
 * @returns One averaged sample per frame.
 * @throws RangeError if `numChannels` is not a positive integer.
 */
function convertToMono(samples: Float64Array, numChannels: number): Float64Array {
  if (numChannels === 1) return samples;

  // A count of 0 would otherwise divide by zero and request an infinite array.
  if (!Number.isInteger(numChannels) || numChannels < 1) {
    throw new RangeError(`Invalid channel count: ${numChannels}`);
  }

  const frameCount = Math.floor(samples.length / numChannels);
  const mono = new Float64Array(frameCount);

  for (let frame = 0; frame < frameCount; frame++) {
    const base = frame * numChannels;
    let total = 0;
    for (let ch = 0; ch < numChannels; ch++) {
      total += samples[base + ch];
    }
    mono[frame] = total / numChannels;
  }

  return mono;
}

/**
 * Change the sample rate of a mono signal using linear interpolation.
 *
 * When the two rates are equal the input array is returned unchanged. No
 * low-pass filtering is applied, so content above half the target rate is not
 * removed when downsampling.
 *
 * @param samples - Input samples at `fromRate`.
 * @param fromRate - Source sample rate in Hz; must be finite and > 0.
 * @param toRate - Target sample rate in Hz; must be finite and > 0.
 * @returns Samples at `toRate`; length is `floor(samples.length * toRate / fromRate)`
 *   up to floating-point rounding.
 * @throws RangeError if the rates differ and either is not a positive finite number.
 */
function resample(samples: Float64Array, fromRate: number, toRate: number): Float64Array {
  if (fromRate === toRate) return samples;

  // A zero or non-finite rate would produce an infinite or NaN output length.
  if (!Number.isFinite(fromRate) || !Number.isFinite(toRate) || fromRate <= 0 || toRate <= 0) {
    throw new RangeError(`Invalid sample rate conversion: ${fromRate} -> ${toRate}`);
  }

  const step = fromRate / toRate;
  const outputLength = Math.floor(samples.length / step);
  const output = new Float64Array(outputLength);
  const lastIndex = samples.length - 1;

  for (let i = 0; i < outputLength; i++) {
    const position = i * step;
    const left = Math.floor(position);

    if (left >= lastIndex) {
      // No right-hand neighbour to interpolate with: hold the final sample.
      output[i] = samples[lastIndex];
    } else {
      const weight = position - left;
      output[i] = samples[left] * (1 - weight) + samples[left + 1] * weight;
    }
  }

  return output;
}

/**
 * Peak-normalize a signal so that its largest absolute sample sits at the
 * requested level.
 *
 * The input array itself is returned when it is all zeros (no gain can be
 * derived) or when the required gain is within 0.001 of unity. Otherwise a
 * new array is returned and the input is left unmodified.
 *
 * @param samples - Input samples, full scale = 1.0.
 * @param targetDbfs - Desired peak level in dB relative to full scale.
 * @returns Samples scaled by a single gain factor.
 */
function normalize(samples: Float64Array, targetDbfs: number): Float64Array {
  let peak = 0;
  for (let i = 0; i < samples.length; i++) {
    const magnitude = Math.abs(samples[i]);
    if (magnitude > peak) peak = magnitude;
  }

  if (peak === 0) return samples;

  // dBFS -> linear amplitude: 10^(dB / 20).
  const targetAmplitude = 10 ** (targetDbfs / 20);
  const gain = targetAmplitude / peak;

  if (Math.abs(gain - 1) < 0.001) return samples;

  const normalized = new Float64Array(samples.length);
  for (let i = 0; i < samples.length; i++) {
    normalized[i] = samples[i] * gain;
  }

  return normalized;
}

/**
 * Root-mean-square amplitude of a block of samples.
 *
 * @param frame - Samples to measure.
 * @returns `sqrt(mean(x^2))`, or 0 for an empty frame (which would otherwise
 *   evaluate 0/0 and yield NaN).
 */
function rmsEnergy(frame: Float64Array): number {
  if (frame.length === 0) return 0;

  let sumSq = 0;
  for (let i = 0; i < frame.length; i++) {
    sumSq += frame[i] * frame[i];
  }
  return Math.sqrt(sumSq / frame.length);
}

/**
 * Simple energy-based Voice Activity Detection.
 * Trims leading and trailing silence, preserving internal pauses.
 *
 * The signal is cut into consecutive frames of `VAD_FRAME_MS` milliseconds.
 * A frame counts as speech when its RMS energy reaches
 * `max(mean frame energy * VAD_ENERGY_THRESHOLD, 0.001)`. The kept region runs
 * from the start of the first run of `VAD_MIN_SPEECH_FRAMES` consecutive
 * speech frames to the end of the last such run.
 *
 * The input is returned unchanged when it is too short to form two frames,
 * when the sample rate is too low to form a frame of at least one sample, or
 * when no qualifying speech region is found. Samples after the last complete
 * frame are not analysed and are dropped whenever trimming takes place.
 *
 * @param samples - Mono samples.
 * @param sampleRate - Sample rate of `samples` in Hz.
 * @returns The input array, or a `subarray` view onto it (no copy is made).
 */
function vadTrim(samples: Float64Array, sampleRate: number): Float64Array {
  const frameSize = Math.floor((sampleRate * VAD_FRAME_MS) / 1000);

  // A zero (or NaN) frame size would divide by zero below; nothing to analyse.
  if (!(frameSize >= 1)) return samples;

  const numFrames = Math.floor(samples.length / frameSize);
  if (numFrames < 2) return samples;

  // Per-frame energy, accumulating the total for the mean in the same pass.
  const frameEnergies = new Float64Array(numFrames);
  let energySum = 0;
  for (let f = 0; f < numFrames; f++) {
    const start = f * frameSize;
    const energy = rmsEnergy(samples.subarray(start, start + frameSize));
    frameEnergies[f] = energy;
    energySum += energy;
  }

  // Threshold is relative to the clip's own mean energy, with an absolute floor.
  const threshold = Math.max((energySum / numFrames) * VAD_ENERGY_THRESHOLD, 0.001);

  // Forward scan: first frame of the earliest sufficiently long speech run.
  let firstSpeech = -1;
  let run = 0;
  for (let f = 0; f < numFrames; f++) {
    run = frameEnergies[f] >= threshold ? run + 1 : 0;
    if (run >= VAD_MIN_SPEECH_FRAMES) {
      firstSpeech = f - VAD_MIN_SPEECH_FRAMES + 1;
      break;
    }
  }

  // Backward scan: last frame of the latest sufficiently long speech run.
  let lastSpeech = -1;
  run = 0;
  for (let f = numFrames - 1; f >= 0; f--) {
    run = frameEnergies[f] >= threshold ? run + 1 : 0;
    if (run >= VAD_MIN_SPEECH_FRAMES) {
      lastSpeech = f + VAD_MIN_SPEECH_FRAMES - 1;
      break;
    }
  }

  if (firstSpeech === -1 || lastSpeech === -1 || firstSpeech >= lastSpeech) {
    // No clear speech detected, return original
    return samples;
  }

  const startSample = Math.max(0, firstSpeech * frameSize);
  const endSample = Math.min(samples.length, (lastSpeech + 1) * frameSize);

  return samples.subarray(startSample, endSample);
}

/**
 * Quantize floating-point samples to 16-bit signed integer PCM.
 *
 * Each value is clamped to [-1.0, 1.0], scaled by 32767 and rounded, so the
 * output range is [-32767, 32767].
 *
 * @param samples - Samples with full scale = 1.0.
 * @returns A new Int16Array of the same length.
 */
function floatTo16BitPcm(samples: Float64Array): Int16Array {
  const pcm = new Int16Array(samples.length);
  for (let i = 0; i < samples.length; i++) {
    // Clamp first so out-of-range input saturates instead of wrapping.
    const clamped = Math.max(-1, Math.min(1, samples[i]));
    pcm[i] = Math.round(clamped * 32767);
  }
  return pcm;
}

/**
 * Audio quality metrics for confidence scoring.
 *
 * - `rmsEnergy`: RMS amplitude over all samples.
 * - `peakAmplitude`: largest absolute sample value.
 * - `snrEstimate`: `20 * log10(rms / noiseFloor)` clamped to [0, 40] dB, where
 *   the noise floor is taken as the 10th-percentile absolute sample value.
 *   When that floor is 0 the estimate is reported as 40.
 *
 * Empty input returns zeros for all three metrics; previously the RMS was NaN.
 *
 * @param samples - Mono samples, full scale = 1.0.
 * @returns The three metrics described above.
 */
function computeQualityMetrics(samples: Float64Array): {
  rmsEnergy: number;
  peakAmplitude: number;
  snrEstimate: number;
} {
  const count = samples.length;
  if (count === 0) {
    return { rmsEnergy: 0, peakAmplitude: 0, snrEstimate: 0 };
  }

  // One pass gathers magnitudes (for the percentile), the peak and the energy.
  const magnitudes = new Float64Array(count);
  let peak = 0;
  let sumSq = 0;
  for (let i = 0; i < count; i++) {
    const magnitude = Math.abs(samples[i]);
    magnitudes[i] = magnitude;
    if (magnitude > peak) peak = magnitude;
    sumSq += samples[i] * samples[i];
  }

  const rms = Math.sqrt(sumSq / count);

  // Simple SNR estimate: noise floor = 10th-percentile absolute amplitude.
  // Typed-array sort() is numeric by default.
  magnitudes.sort();
  const noiseFloor = magnitudes[Math.floor(count * 0.1)];

  const snr = noiseFloor > 0 ? 20 * Math.log10(rms / noiseFloor) : 40;

  return {
    rmsEnergy: rms,
    peakAmplitude: peak,
    snrEstimate: Math.min(40, Math.max(0, snr)),
  };
}

/**
 * Main audio preprocessing pipeline:
 * 1. Validate input size
 * 2. Parse WAV header and validate its format fields
 * 3. Validate duration from header (reject too-long audio before decoding)
 * 4. Read PCM samples
 * 5. Convert to mono
 * 6. Measure quality metrics (before resampling and normalization)
 * 7. Resample to 16kHz
 * 8. Normalize to -3 dBFS
 * 9. VAD silence trimming
 * 10. Limit to 30 seconds
 * 11. Convert to 16-bit PCM
 *
 * The `data` size declared in the header is never trusted beyond the bytes
 * actually present in `inputBuffer`, for both the duration check and decoding.
 *
 * @param inputBuffer - Complete contents of a PCM-encoded WAV file.
 * @returns The converted audio and its pre-normalization quality metrics.
 *   `pcmBuffer` shares memory with `samples`.
 * @throws Error (as a rejected promise) if the input is larger than
 *   `MAX_INPUT_BYTES`, is not a RIFF/WAVE file, declares zero channels, sample
 *   rate or bit depth, contains no samples, is more than 30 s over
 *   `MAX_DURATION_SEC`, or fails header parsing / sample decoding.
 */
export async function preprocessAudio(inputBuffer: Buffer): Promise<ProcessedAudio> {
  const TARGET_SAMPLE_RATE = 16000;
  // Input may exceed the output limit by this much; silence trimming can
  // shorten it, and whatever remains is truncated to MAX_DURATION_SEC.
  const PRE_TRIM_SLACK_SEC = 30;

  // Reject oversized input early to prevent memory exhaustion
  if (inputBuffer.length > MAX_INPUT_BYTES) {
    throw new Error(
      `Audio file too large: ${(inputBuffer.length / 1024 / 1024).toFixed(1)}MB. ` +
        `Maximum ${(MAX_INPUT_BYTES / 1024 / 1024).toFixed(0)}MB. ` +
        `Please upload a shorter audio clip (max ${MAX_DURATION_SEC} seconds).`,
    );
  }

  // Detect if it's a WAV by checking RIFF header
  const isWav =
    inputBuffer.length >= 4 &&
    inputBuffer.toString("ascii", 0, 4) === "RIFF";

  if (!isWav) {
    throw new Error(
      "Unsupported audio format. Only WAV files are supported. " +
        "Please upload a WAV file (PCM encoded).",
    );
  }

  const { header, dataOffset } = parseWavHeader(inputBuffer);

  // Zero-valued fields would turn the divisions below into Infinity/NaN and
  // surface as a misleading "Audio too long" error.
  if (header.numChannels < 1 || header.sampleRate < 1 || header.bitsPerSample < 1) {
    throw new Error(
      `Invalid WAV format header: channels=${header.numChannels}, ` +
        `sampleRate=${header.sampleRate}, bitsPerSample=${header.bitsPerSample}`,
    );
  }

  // Only count payload bytes that are really in the buffer.
  const dataBytes = Math.min(header.dataSize, Math.max(0, inputBuffer.length - dataOffset));

  // Validate duration from header BEFORE allocating sample buffers.
  const totalSamples = Math.floor(dataBytes / (header.bitsPerSample / 8) / header.numChannels);
  if (totalSamples === 0) {
    throw new Error("Audio contains no samples");
  }

  const durationSec = totalSamples / header.sampleRate;
  if (durationSec > MAX_DURATION_SEC + PRE_TRIM_SLACK_SEC) {
    throw new Error(
      `Audio too long: ${durationSec.toFixed(1)}s. Maximum ${MAX_DURATION_SEC}s for analysis. ` +
        `Please trim your audio before uploading.`,
    );
  }

  let samples = readPcmSamples(inputBuffer, { ...header, dataSize: dataBytes }, dataOffset);

  // Convert to mono
  samples = convertToMono(samples, header.numChannels);

  // Store quality metrics before normalization
  const metrics = computeQualityMetrics(samples);

  // Resample to 16kHz
  samples = resample(samples, header.sampleRate, TARGET_SAMPLE_RATE);

  // Normalize to -3 dBFS
  samples = normalize(samples, TARGET_DBFS);

  // VAD silence trimming
  samples = vadTrim(samples, TARGET_SAMPLE_RATE);

  // Limit to 30 seconds (480,000 samples at 16kHz)
  const maxSamples = TARGET_SAMPLE_RATE * MAX_DURATION_SEC;
  if (samples.length > maxSamples) {
    samples = samples.subarray(0, maxSamples);
  }

  // Convert to 16-bit PCM
  const pcmSamples = floatTo16BitPcm(samples);

  // Create raw PCM buffer (no WAV header - Azure expects raw PCM).
  // This is a view over pcmSamples' memory, restricted to its exact byte range.
  // NOTE(review): bytes are in host byte order; assumes a little-endian host.
  const pcmBuffer = Buffer.from(pcmSamples.buffer, pcmSamples.byteOffset, pcmSamples.byteLength);

  return {
    samples: pcmSamples,
    sampleRate: TARGET_SAMPLE_RATE,
    channels: 1,
    duration: samples.length / TARGET_SAMPLE_RATE,
    pcmBuffer,
    rmsEnergy: metrics.rmsEnergy,
    peakAmplitude: metrics.peakAmplitude,
    snrEstimate: metrics.snrEstimate,
  };
}