Files
Kordant/web/src/server/services/voiceprint/audio.processor.ts

452 lines
13 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Audio preprocessing pipeline for Azure Speech Services.
*
* Converts incoming audio to Azure-compatible format:
* - 16kHz sample rate
* - Mono channel
* - 16-bit signed PCM
* - Normalized to -3 dBFS
* - Silence trimmed using energy-based VAD
* - Max 30 seconds duration
*/
/**
 * Result of the preprocessing pipeline returned by `preprocessAudio`.
 *
 * The sample data reflects the fully processed signal, while the quality
 * metrics are measured earlier in the pipeline (see the individual fields).
 */
export interface ProcessedAudio {
/** 16-bit PCM samples at 16kHz mono (normalized, silence-trimmed and capped at the maximum duration) */
samples: Int16Array;
/** Sample rate (always 16000) */
sampleRate: number;
/** Number of channels (always 1) */
channels: number;
/** Duration in seconds of the processed (trimmed and capped) audio */
duration: number;
/**
 * Raw PCM Buffer ready for Azure API (WAV-headerless 16-bit mono 16kHz).
 * Created from the ArrayBuffer that backs `samples`.
 */
pcmBuffer: Buffer;
/** RMS energy (pre-normalization) for quality assessment; measured on the mono signal before resampling */
rmsEnergy: number;
/** Peak amplitude (pre-normalization); measured on the mono signal before resampling */
peakAmplitude: number;
/** Signal-to-noise ratio estimate in dB, clamped to the range [0, 40] */
snrEstimate: number;
}
/**
 * Fields extracted from a WAV file by `parseWavHeader`.
 * All values except `dataSize` are read from the "fmt " chunk.
 */
interface WavHeader {
/** Format tag from the fmt chunk; only 1 (PCM) is accepted by the parser */
audioFormat: number;
/** Number of interleaved channels */
numChannels: number;
/** Sample rate in Hz */
sampleRate: number;
/** Byte rate field from the fmt chunk (stored as read, not otherwise used in this file) */
byteRate: number;
/** Block alignment field from the fmt chunk (stored as read, not otherwise used in this file) */
blockAlign: number;
/** Bits per sample; the sample reader supports 8, 16, 24 and 32 */
bitsPerSample: number;
/** Size in bytes of the "data" chunk payload */
dataSize: number;
}
/** Maximum allowed audio duration in seconds */
const MAX_DURATION_SEC = 30;

/** Input size limit (5 MiB) used when no valid override is configured. */
const DEFAULT_MAX_INPUT_BYTES = 5 * 1024 * 1024;

/**
 * Resolve the input size limit from a raw configuration string.
 *
 * A missing, empty, non-numeric, zero or negative value falls back to
 * `DEFAULT_MAX_INPUT_BYTES`. Without this guard a malformed value parses to
 * `NaN`, and because every `length > NaN` comparison is false the size check
 * would be silently disabled.
 *
 * @param raw - Raw value of the override (typically an environment variable).
 * @returns A finite, positive byte count.
 */
function resolveMaxInputBytes(raw: string | undefined): number {
  const parsed = Number.parseInt(raw ?? "", 10);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : DEFAULT_MAX_INPUT_BYTES;
}

/** Maximum raw WAV file size before processing (default 5MB). Prevents memory exhaustion. */
const MAX_INPUT_BYTES = resolveMaxInputBytes(process.env.VOICEPRINT_MAX_INPUT_BYTES);

/** Target normalization level in dBFS */
const TARGET_DBFS = -3;

/** Frame size for VAD in milliseconds */
const VAD_FRAME_MS = 30;

/** Energy threshold multiplier (× mean energy) for VAD */
const VAD_ENERGY_THRESHOLD = 1.5;

/** Minimum consecutive speech frames to consider as voice activity */
const VAD_MIN_SPEECH_FRAMES = 3;
/**
 * Parse a WAV file header and return header info + data offset.
 *
 * Walks the RIFF chunk list starting after the 12-byte RIFF/WAVE preamble,
 * reading the "fmt " chunk and stopping at the first "data" chunk. Only
 * uncompressed PCM (format tag 1) with 8, 16, 24 or 32-bit samples is accepted.
 *
 * The returned `dataSize` is clamped to the number of bytes actually present
 * after the data chunk header, so a truncated file or a bogus declared size
 * (e.g. 0xFFFFFFFF) can never direct later stages to read or allocate beyond
 * the buffer.
 *
 * @param buffer - Complete WAV file contents.
 * @returns The parsed header fields and the byte offset of the first sample.
 * @throws Error if the buffer is not a RIFF/WAVE file, a required chunk is
 *   missing or truncated, the encoding is not PCM, or the format fields are
 *   unusable (zero channels, zero sample rate, unsupported bit depth).
 */
function parseWavHeader(buffer: Buffer): { header: WavHeader; dataOffset: number } {
  if (buffer.length < 44) {
    throw new Error(`Audio too short to be valid WAV: ${buffer.length} bytes`);
  }
  const riffId = buffer.toString("ascii", 0, 4);
  if (riffId !== "RIFF") {
    throw new Error(`Invalid RIFF header: ${riffId}`);
  }
  const waveId = buffer.toString("ascii", 8, 12);
  if (waveId !== "WAVE") {
    throw new Error(`Invalid WAVE header: ${waveId}`);
  }

  // The fields read below occupy the first 16 bytes of the fmt payload.
  const fmtMinBytes = 16;
  let fmtSeen = false;
  let audioFormat = 0;
  let numChannels = 0;
  let sampleRate = 0;
  let byteRate = 0;
  let blockAlign = 0;
  let bitsPerSample = 0;
  let dataSize = 0;
  let dataOffset = 0;

  // Scan chunks: each is a 4-byte id, a 4-byte little-endian size, then payload.
  let offset = 12;
  while (offset < buffer.length - 8) {
    const chunkId = buffer.toString("ascii", offset, offset + 4);
    const chunkSize = buffer.readUInt32LE(offset + 4);
    const payloadStart = offset + 8;

    if (chunkId === "fmt ") {
      if (chunkSize < fmtMinBytes || payloadStart + fmtMinBytes > buffer.length) {
        throw new Error("Malformed WAV file: fmt chunk is truncated");
      }
      audioFormat = buffer.readUInt16LE(payloadStart);
      numChannels = buffer.readUInt16LE(payloadStart + 2);
      sampleRate = buffer.readUInt32LE(payloadStart + 4);
      byteRate = buffer.readUInt32LE(payloadStart + 8);
      blockAlign = buffer.readUInt16LE(payloadStart + 12);
      bitsPerSample = buffer.readUInt16LE(payloadStart + 14);
      fmtSeen = true;
    } else if (chunkId === "data") {
      dataOffset = payloadStart;
      // Never trust the declared size beyond what the buffer really holds.
      dataSize = Math.min(chunkSize, buffer.length - payloadStart);
      break;
    }

    // Chunks are word-aligned: an odd-sized payload is followed by a pad byte.
    offset = payloadStart + chunkSize + (chunkSize % 2);
  }

  if (!dataOffset || !dataSize) {
    throw new Error("No data chunk found in WAV file");
  }
  if (!fmtSeen) {
    throw new Error("No fmt chunk found before data chunk in WAV file");
  }
  if (audioFormat !== 1) {
    throw new Error(`Unsupported audio format: ${audioFormat} (only PCM/1 supported)`);
  }
  // Reject values that would cause division by zero or infinite lengths later.
  if (numChannels < 1) {
    throw new Error(`Invalid WAV header: channel count must be at least 1 (got ${numChannels})`);
  }
  if (sampleRate < 1) {
    throw new Error(`Invalid WAV header: sample rate must be at least 1 Hz (got ${sampleRate})`);
  }
  if (![8, 16, 24, 32].includes(bitsPerSample)) {
    throw new Error(`Unsupported bits per sample: ${bitsPerSample}`);
  }

  return {
    header: { audioFormat, numChannels, sampleRate, byteRate, blockAlign, bitsPerSample, dataSize },
    dataOffset,
  };
}
/**
 * Read raw PCM samples from a WAV data section into Float64Array,
 * normalizing to the range [-1.0, 1.0].
 *
 * The bit depth is validated before anything is allocated, and the number of
 * samples decoded is limited to the bytes actually present in `buffer` after
 * `dataOffset`. A header whose `dataSize` overstates the payload therefore
 * yields the samples that exist instead of out-of-range reads.
 *
 * Channels are left interleaved; the result holds one entry per sample value.
 *
 * @param buffer - Complete WAV file contents.
 * @param header - Parsed header; `bitsPerSample` and `dataSize` are used.
 * @param dataOffset - Byte offset of the first sample within `buffer`.
 * @returns Interleaved samples scaled to [-1.0, 1.0).
 * @throws Error if `bitsPerSample` is not 8, 16, 24 or 32.
 */
function readPcmSamples(buffer: Buffer, header: WavHeader, dataOffset: number): Float64Array {
  const { bitsPerSample } = header;
  if (bitsPerSample !== 8 && bitsPerSample !== 16 && bitsPerSample !== 24 && bitsPerSample !== 32) {
    throw new Error(`Unsupported bits per sample: ${bitsPerSample}`);
  }

  const bytesPerSample = bitsPerSample / 8;
  // Decode only what is really there, whatever the header claims.
  const availableBytes = Math.max(0, Math.min(header.dataSize, buffer.length - dataOffset));
  const sampleCount = Math.floor(availableBytes / bytesPerSample);
  const samples = new Float64Array(sampleCount);

  switch (bitsPerSample) {
    case 8:
      for (let i = 0; i < sampleCount; i++) {
        // 8-bit PCM is unsigned, bias = 128
        samples[i] = (buffer.readUInt8(dataOffset + i) - 128) / 128;
      }
      break;
    case 16:
      for (let i = 0; i < sampleCount; i++) {
        samples[i] = buffer.readInt16LE(dataOffset + i * 2) / 32768;
      }
      break;
    case 24:
      for (let i = 0; i < sampleCount; i++) {
        // 24-bit signed little-endian; readIntLE performs the sign extension.
        samples[i] = buffer.readIntLE(dataOffset + i * 3, 3) / 8388608;
      }
      break;
    case 32:
      for (let i = 0; i < sampleCount; i++) {
        samples[i] = buffer.readInt32LE(dataOffset + i * 4) / 2147483648;
      }
      break;
  }
  return samples;
}
/**
 * Downmix interleaved multi-channel audio to a single channel.
 *
 * Each output sample is the arithmetic mean of the channel values in one
 * frame. Single-channel input is returned as-is (same array instance), and
 * any trailing samples that do not form a complete frame are dropped.
 *
 * @param samples - Interleaved samples, `numChannels` values per frame.
 * @param numChannels - Number of interleaved channels.
 * @returns One averaged sample per complete frame.
 */
function convertToMono(samples: Float64Array, numChannels: number): Float64Array {
  if (numChannels === 1) {
    return samples;
  }

  const totalFrames = Math.floor(samples.length / numChannels);
  const downmixed = new Float64Array(totalFrames);

  for (let frame = 0; frame < totalFrames; frame += 1) {
    const base = frame * numChannels;
    let channelTotal = 0;
    let channel = 0;
    while (channel < numChannels) {
      channelTotal += samples[base + channel];
      channel += 1;
    }
    downmixed[frame] = channelTotal / numChannels;
  }

  return downmixed;
}
/**
 * Change the sample rate of a signal by linear interpolation.
 *
 * Every output sample maps to a fractional position in the input and is
 * blended from the two neighbouring input samples. Positions at or beyond the
 * final input sample repeat that final sample. When both rates are equal the
 * input array itself is returned.
 *
 * @param samples - Input signal.
 * @param fromRate - Sample rate of `samples` in Hz.
 * @param toRate - Desired sample rate in Hz.
 * @returns The signal at `toRate`.
 */
function resample(samples: Float64Array, fromRate: number, toRate: number): Float64Array {
  if (fromRate === toRate) {
    return samples;
  }

  const step = fromRate / toRate;
  const resampledLength = Math.floor(samples.length / step);
  const resampled = new Float64Array(resampledLength);
  const lastIndex = samples.length - 1;

  for (let n = 0; n < resampledLength; n += 1) {
    const position = n * step;
    const left = Math.floor(position);

    if (left >= lastIndex) {
      // No right-hand neighbour to blend with: hold the final sample.
      resampled[n] = samples[lastIndex];
      continue;
    }

    const weight = position - left;
    resampled[n] = samples[left] * (1 - weight) + samples[left + 1] * weight;
  }

  return resampled;
}
/**
 * Peak-normalize a signal to the requested level.
 *
 * A single gain is applied so the largest absolute sample lands on
 * `targetDbfs` (decibels relative to full scale, where 0 dBFS is amplitude
 * 1.0). The input array is returned untouched when it is entirely zero or
 * when the required gain is within 0.001 of unity; otherwise a new array is
 * produced.
 *
 * @param samples - Input signal.
 * @param targetDbfs - Desired peak level in dBFS.
 * @returns The scaled signal, or `samples` itself if no scaling was needed.
 */
function normalize(samples: Float64Array, targetDbfs: number): Float64Array {
  const peak = samples.reduce((highest, value) => {
    const magnitude = Math.abs(value);
    return magnitude > highest ? magnitude : highest;
  }, 0);

  // Silence cannot be scaled to a target level.
  if (peak === 0) {
    return samples;
  }

  const targetAmplitude = 10 ** (targetDbfs / 20);
  const gain = targetAmplitude / peak;

  // Skip the copy when the signal is already effectively at the target.
  const gainIsNegligible = Math.abs(gain - 1) < 0.001;
  if (gainIsNegligible) {
    return samples;
  }

  return samples.map((value) => value * gain);
}
/**
 * Root-mean-square level of a block of samples.
 *
 * @param frame - Samples to measure.
 * @returns The square root of the mean squared sample value. An empty frame
 *   yields NaN because the mean is 0 / 0.
 */
function rmsEnergy(frame: Float64Array): number {
  const sumOfSquares = frame.reduce((total, value) => total + value * value, 0);
  return Math.sqrt(sumOfSquares / frame.length);
}
/**
 * Energy-based voice activity trimming.
 *
 * The signal is cut into fixed frames of `VAD_FRAME_MS` milliseconds and each
 * frame's RMS energy is compared with a threshold of
 * `VAD_ENERGY_THRESHOLD` × the mean frame energy (never below 0.001). Speech
 * is considered to start at the first run of `VAD_MIN_SPEECH_FRAMES`
 * consecutive frames over the threshold, and to end at the last such run.
 * Everything outside that span is dropped; pauses inside it are kept.
 *
 * The input is returned unchanged when it holds fewer than two frames or when
 * no qualifying speech span is found. Otherwise the result is a `subarray`
 * view that shares memory with `samples`.
 *
 * @param samples - Mono signal to trim.
 * @param sampleRate - Sample rate of `samples` in Hz.
 * @returns The speech portion of the signal, or `samples` itself.
 */
function vadTrim(samples: Float64Array, sampleRate: number): Float64Array {
  const samplesPerFrame = Math.floor(sampleRate * VAD_FRAME_MS / 1000);
  const frameCount = Math.floor(samples.length / samplesPerFrame);
  if (frameCount < 2) {
    return samples;
  }

  // Measure every frame and accumulate the total for the mean in one pass.
  const energies = new Float64Array(frameCount);
  let energyTotal = 0;
  for (let f = 0; f < frameCount; f += 1) {
    const begin = f * samplesPerFrame;
    const energy = rmsEnergy(samples.subarray(begin, begin + samplesPerFrame));
    energies[f] = energy;
    energyTotal += energy;
  }

  const cutoff = Math.max((energyTotal / frameCount) * VAD_ENERGY_THRESHOLD, 0.001);
  const isVoiced = (f: number): boolean => energies[f] >= cutoff;

  // Forward scan: first frame of the earliest sufficiently long voiced run.
  let onsetFrame = -1;
  for (let f = 0, run = 0; f < frameCount; f += 1) {
    if (!isVoiced(f)) {
      run = 0;
      continue;
    }
    run += 1;
    if (run >= VAD_MIN_SPEECH_FRAMES) {
      onsetFrame = f - VAD_MIN_SPEECH_FRAMES + 1;
      break;
    }
  }

  // Backward scan: last frame of the latest sufficiently long voiced run.
  let finalFrame = -1;
  for (let f = frameCount - 1, run = 0; f >= 0; f -= 1) {
    if (!isVoiced(f)) {
      run = 0;
      continue;
    }
    run += 1;
    if (run >= VAD_MIN_SPEECH_FRAMES) {
      finalFrame = f + VAD_MIN_SPEECH_FRAMES - 1;
      break;
    }
  }

  const speechFound = onsetFrame !== -1 && finalFrame !== -1 && onsetFrame < finalFrame;
  if (!speechFound) {
    // No clear speech detected, keep the signal as it is.
    return samples;
  }

  const from = Math.max(0, onsetFrame * samplesPerFrame);
  const to = Math.min(samples.length, (finalFrame + 1) * samplesPerFrame);
  return samples.subarray(from, to);
}
/**
 * Quantize floating-point samples to signed 16-bit PCM.
 *
 * Values are first limited to [-1.0, 1.0], then scaled by 32767 and rounded,
 * so the output range is [-32767, 32767].
 *
 * @param samples - Samples nominally in [-1.0, 1.0].
 * @returns A new Int16Array of the same length.
 */
function floatTo16BitPcm(samples: Float64Array): Int16Array {
  const fullScale = 32767;
  return Int16Array.from(samples, (value) => {
    const limited = Math.max(-1, Math.min(1, value));
    return Math.round(limited * fullScale);
  });
}
/**
 * Audio quality metrics for confidence scoring.
 *
 * Computes, over the whole signal:
 * - `rmsEnergy`: root-mean-square sample value,
 * - `peakAmplitude`: largest absolute sample value,
 * - `snrEstimate`: 20·log10(rms / noiseFloor) in dB, clamped to [0, 40],
 *   where the noise floor is taken as the 10th-percentile absolute sample
 *   value. When that noise floor is zero the estimate is reported as 40.
 *
 * Empty input is handled explicitly and reported like silence
 * (`rmsEnergy` 0, `peakAmplitude` 0, `snrEstimate` 40) rather than producing
 * a NaN energy from a 0 / 0 mean.
 *
 * @param samples - Signal to analyse; not modified.
 * @returns The three metrics described above.
 */
function computeQualityMetrics(samples: Float64Array): {
  rmsEnergy: number;
  peakAmplitude: number;
  snrEstimate: number;
} {
  const count = samples.length;
  if (count === 0) {
    // Nothing to measure: mirror the values an all-zero signal would give.
    return { rmsEnergy: 0, peakAmplitude: 0, snrEstimate: 40 };
  }

  // One pass gathers the peak, the energy sum and the magnitudes to rank.
  const magnitudes = new Float64Array(count);
  let peak = 0;
  let sumSq = 0;
  for (let i = 0; i < count; i++) {
    const value = samples[i];
    const magnitude = Math.abs(value);
    magnitudes[i] = magnitude;
    if (magnitude > peak) peak = magnitude;
    sumSq += value * value;
  }
  const rms = Math.sqrt(sumSq / count);

  // Simple SNR estimate: treat the 10th-percentile magnitude as the noise floor.
  // Typed-array sort is numeric and works on the private copy, not on `samples`.
  magnitudes.sort();
  const noiseFloor = magnitudes[Math.floor(count * 0.1)];
  const snr = noiseFloor > 0 ? 20 * Math.log10(rms / noiseFloor) : 40;

  return {
    rmsEnergy: rms,
    peakAmplitude: peak,
    snrEstimate: Math.min(40, Math.max(0, snr)),
  };
}
/**
 * Run the full preprocessing pipeline on an uploaded WAV file.
 *
 * Stages, in order:
 * 1. Reject input larger than `MAX_INPUT_BYTES`
 * 2. Reject input that does not start with a RIFF signature
 * 3. Parse the WAV header
 * 4. Reject audio whose header-declared duration exceeds
 *    `MAX_DURATION_SEC + 30` seconds, before any samples are decoded
 * 5. Decode PCM samples
 * 6. Downmix to mono
 * 7. Measure quality metrics (on the mono signal, before resampling and
 *    normalization)
 * 8. Resample to 16kHz
 * 9. Normalize to `TARGET_DBFS`
 * 10. Trim leading/trailing silence with the VAD
 * 11. Truncate to `MAX_DURATION_SEC` seconds
 * 12. Convert to 16-bit PCM and wrap it in a Buffer
 *
 * @param inputBuffer - Complete contents of the uploaded audio file.
 * @returns The processed audio together with its quality metrics.
 * @throws Error if the input is too large, is not a WAV file, is too long,
 *   or cannot be parsed/decoded by the header and sample readers.
 */
export async function preprocessAudio(inputBuffer: Buffer): Promise<ProcessedAudio> {
  const targetRate = 16000;
  const inputSize = inputBuffer.length;

  // Guard against memory exhaustion before touching the contents.
  if (inputSize > MAX_INPUT_BYTES) {
    const actualMb = (inputSize / 1024 / 1024).toFixed(1);
    const limitMb = (MAX_INPUT_BYTES / 1024 / 1024).toFixed(0);
    throw new Error(
      `Audio file too large: ${actualMb}MB. ` +
        `Maximum ${limitMb}MB. ` +
        `Please upload a shorter audio clip (max ${MAX_DURATION_SEC} seconds).`,
    );
  }

  // Cheap signature sniff so non-WAV uploads get a user-oriented message.
  const signature = inputSize >= 4 ? inputBuffer.toString("ascii", 0, 4) : "";
  if (signature !== "RIFF") {
    throw new Error(
      "Unsupported audio format. Only WAV files are supported. " +
        "Please upload a WAV file (PCM encoded).",
    );
  }

  const { header, dataOffset } = parseWavHeader(inputBuffer);
  const { dataSize, bitsPerSample, numChannels, sampleRate } = header;

  // Check the declared duration BEFORE allocating sample buffers, so that
  // multi-hour recordings are never decoded into memory.
  const declaredFrames = Math.floor(dataSize / (bitsPerSample / 8) / numChannels);
  const declaredSeconds = declaredFrames / sampleRate;
  if (declaredSeconds > MAX_DURATION_SEC + 30) {
    throw new Error(
      `Audio too long: ${declaredSeconds.toFixed(1)}s. Maximum ${MAX_DURATION_SEC}s for analysis. ` +
        `Please trim your audio before uploading.`,
    );
  }

  // Decode and downmix.
  const mono = convertToMono(readPcmSamples(inputBuffer, header, dataOffset), numChannels);

  // Quality is measured on the signal as recorded, before any level changes.
  const quality = computeQualityMetrics(mono);

  // Resample -> normalize -> trim silence.
  const resampled = resample(mono, sampleRate, targetRate);
  const leveled = normalize(resampled, TARGET_DBFS);
  const trimmed = vadTrim(leveled, targetRate);

  // Keep at most MAX_DURATION_SEC seconds (480,000 samples at 16kHz).
  const sampleCap = targetRate * MAX_DURATION_SEC;
  const finalSamples = trimmed.length > sampleCap ? trimmed.subarray(0, sampleCap) : trimmed;

  const pcmSamples = floatTo16BitPcm(finalSamples);
  // Raw PCM bytes with no WAV header, backed by the same memory as pcmSamples.
  const pcmBuffer = Buffer.from(pcmSamples.buffer);

  return {
    samples: pcmSamples,
    sampleRate: targetRate,
    channels: 1,
    duration: finalSamples.length / targetRate,
    pcmBuffer,
    rmsEnergy: quality.rmsEnergy,
    peakAmplitude: quality.peakAmplitude,
    snrEstimate: quality.snrEstimate,
  };
}