deep research addressement

2026-06-01 08:40:10 -04:00
parent c159f07322
commit ba73daa66c
205 changed files with 157390 additions and 951 deletions
--- a/web/src/server/services/voiceprint/audio.processor.ts
+++ b/web/src/server/services/voiceprint/audio.processor.ts
@@ -0,0 +1,424 @@
+/**
+ * Audio preprocessing pipeline for Azure Speech Services.
+ *
+ * Converts incoming audio to Azure-compatible format:
+ * - 16kHz sample rate
+ * - Mono channel
+ * - 16-bit signed PCM
+ * - Normalized to -3 dBFS
+ * - Silence trimmed using energy-based VAD
+ * - Max 30 seconds duration
+ */
+
+/**
+ * Result produced by `preprocessAudio`: the converted audio plus quality
+ * figures measured on the source signal.
+ */
+export interface ProcessedAudio {
+  /** 16-bit signed PCM samples, 16kHz mono, after trimming and the 30 s cap */
+  samples: Int16Array;
+  /** Sample rate in Hz (always 16000) */
+  sampleRate: number;
+  /** Number of channels (always 1) */
+  channels: number;
+  /** Duration of `samples` in seconds (sample count / 16000) */
+  duration: number;
+  /**
+   * Raw PCM Buffer ready for Azure API (WAV-headerless 16-bit mono 16kHz).
+   * Created over the ArrayBuffer backing `samples`, so both share the same memory.
+   */
+  pcmBuffer: Buffer;
+  /** RMS energy of the mono signal, measured before resampling and normalization */
+  rmsEnergy: number;
+  /** Peak absolute amplitude of the mono signal, measured before resampling and normalization */
+  peakAmplitude: number;
+  /** Signal-to-noise ratio estimate in dB, clamped to the range 0..40 */
+  snrEstimate: number;
+}
+
+/**
+ * Fields read from a WAV file by `parseWavHeader`: the `fmt ` chunk values
+ * plus the size of the `data` chunk.
+ */
+interface WavHeader {
+  /** Format tag from the `fmt ` chunk; the parser only accepts 1 (PCM) */
+  audioFormat: number;
+  /** Channel count from the `fmt ` chunk */
+  numChannels: number;
+  /** Sample rate in Hz from the `fmt ` chunk */
+  sampleRate: number;
+  /** Byte-rate field from the `fmt ` chunk, stored as read */
+  byteRate: number;
+  /** Block-align field from the `fmt ` chunk, stored as read */
+  blockAlign: number;
+  /** Bits per sample from the `fmt ` chunk */
+  bitsPerSample: number;
+  /** Size in bytes declared by the `data` chunk header */
+  dataSize: number;
+}
+
+/** Maximum allowed audio duration in seconds; longer output is cut to this length */
+const MAX_DURATION_SEC = 30;
+/** Target peak level for normalization, in dBFS */
+const TARGET_DBFS = -3;
+/** Length of one VAD analysis frame in milliseconds */
+const VAD_FRAME_MS = 30;
+/** VAD speech threshold, expressed as a multiplier of the mean per-frame RMS energy */
+const VAD_ENERGY_THRESHOLD = 1.5;
+/** Minimum consecutive above-threshold frames required to count as voice activity */
+const VAD_MIN_SPEECH_FRAMES = 3;
+
+/**
+ * Parse a RIFF/WAVE container: read the `fmt ` chunk and locate the `data` chunk.
+ *
+ * Chunks are walked in file order starting right after the 12-byte RIFF
+ * preamble; unknown chunks are skipped. Scanning stops at the first `data`
+ * chunk. Sample decoding (8/16/24/32-bit) is left to the caller.
+ *
+ * The returned `header.dataSize` is clamped to the bytes actually present in
+ * `buffer`, so a truncated file or a streaming placeholder size never points
+ * past the end of the buffer.
+ *
+ * @param buffer - Complete WAV file contents.
+ * @returns The parsed header and the byte offset of the first PCM sample.
+ * @throws Error if the buffer is not a RIFF/WAVE file, has no usable `data`
+ *   chunk, has a missing or truncated `fmt ` chunk, is not PCM (format 1),
+ *   or declares zero channels or a zero sample rate.
+ */
+function parseWavHeader(buffer: Buffer): { header: WavHeader; dataOffset: number } {
+  if (buffer.length < 44) {
+    throw new Error(`Audio too short to be valid WAV: ${buffer.length} bytes`);
+  }
+
+  const riffId = buffer.toString("ascii", 0, 4);
+  if (riffId !== "RIFF") {
+    throw new Error(`Invalid RIFF header: ${riffId}`);
+  }
+
+  const waveId = buffer.toString("ascii", 8, 12);
+  if (waveId !== "WAVE") {
+    throw new Error(`Invalid WAVE header: ${waveId}`);
+  }
+
+  let offset = 12;
+  let fmtFound = false;
+  let audioFormat = 0;
+  let numChannels = 0;
+  let sampleRate = 0;
+  let byteRate = 0;
+  let blockAlign = 0;
+  let bitsPerSample = 0;
+  let dataSize = 0;
+  let dataOffset = 0;
+
+  // A chunk header is 8 bytes (4-byte id + 4-byte little-endian size); keep
+  // scanning as long as a complete header still fits in the buffer.
+  while (offset + 8 <= buffer.length) {
+    const chunkId = buffer.toString("ascii", offset, offset + 4);
+    const chunkSize = buffer.readUInt32LE(offset + 4);
+    const bodyStart = offset + 8;
+
+    if (chunkId === "fmt ") {
+      // The fields read below occupy the first 16 bytes of the chunk body.
+      if (chunkSize < 16 || bodyStart + 16 > buffer.length) {
+        throw new Error(`Truncated fmt chunk in WAV file: ${chunkSize} bytes`);
+      }
+      audioFormat = buffer.readUInt16LE(bodyStart);
+      numChannels = buffer.readUInt16LE(bodyStart + 2);
+      sampleRate = buffer.readUInt32LE(bodyStart + 4);
+      byteRate = buffer.readUInt32LE(bodyStart + 8);
+      blockAlign = buffer.readUInt16LE(bodyStart + 12);
+      bitsPerSample = buffer.readUInt16LE(bodyStart + 14);
+      fmtFound = true;
+    } else if (chunkId === "data") {
+      // Never trust the declared size beyond what the buffer really holds.
+      dataSize = Math.min(chunkSize, buffer.length - bodyStart);
+      dataOffset = bodyStart;
+      break;
+    }
+
+    // Chunks are word-aligned: an odd-sized body is followed by one pad byte.
+    offset = bodyStart + chunkSize + (chunkSize % 2);
+  }
+
+  if (!dataOffset || !dataSize) {
+    throw new Error("No data chunk found in WAV file");
+  }
+
+  if (!fmtFound) {
+    throw new Error("No fmt chunk found before data chunk in WAV file");
+  }
+
+  if (audioFormat !== 1) {
+    throw new Error(`Unsupported audio format: ${audioFormat} (only PCM/1 supported)`);
+  }
+
+  // Zero values here would turn into divisions by zero further down the pipeline.
+  if (numChannels === 0 || sampleRate === 0) {
+    throw new Error(`Invalid WAV format: ${numChannels} channels at ${sampleRate} Hz`);
+  }
+
+  return {
+    header: { audioFormat, numChannels, sampleRate, byteRate, blockAlign, bitsPerSample, dataSize },
+    dataOffset,
+  };
+}
+
+/**
+ * Decode the PCM payload of a WAV file into floating-point samples in the
+ * range [-1.0, 1.0). Channels stay interleaved exactly as stored.
+ *
+ * Supported sample widths are 8-bit (unsigned, bias 128) and 16/24/32-bit
+ * (signed, little-endian). The bit depth is validated before anything is
+ * allocated, and the number of samples is limited to the bytes that really
+ * exist after `dataOffset`, so an overstated `header.dataSize` cannot cause
+ * an out-of-range read.
+ *
+ * @param buffer - Complete WAV file contents.
+ * @param header - Parsed header; only `bitsPerSample` and `dataSize` are used.
+ * @param dataOffset - Byte offset of the first sample inside `buffer`.
+ * @returns One Float64 value per stored sample.
+ * @throws Error if `header.bitsPerSample` is not 8, 16, 24 or 32.
+ */
+function readPcmSamples(buffer: Buffer, header: WavHeader, dataOffset: number): Float64Array {
+  const { bitsPerSample } = header;
+  if (
+    bitsPerSample !== 8 &&
+    bitsPerSample !== 16 &&
+    bitsPerSample !== 24 &&
+    bitsPerSample !== 32
+  ) {
+    throw new Error(`Unsupported bits per sample: ${bitsPerSample}`);
+  }
+
+  const bytesPerSample = bitsPerSample / 8;
+  // Use whichever is smaller: the declared payload or the bytes actually available.
+  const availableBytes = Math.max(0, Math.min(header.dataSize, buffer.length - dataOffset));
+  const sampleCount = Math.floor(availableBytes / bytesPerSample);
+  const samples = new Float64Array(sampleCount);
+
+  if (bitsPerSample === 8) {
+    for (let i = 0; i < sampleCount; i++) {
+      // 8-bit PCM is unsigned, bias = 128
+      samples[i] = (buffer.readUInt8(dataOffset + i) - 128) / 128;
+    }
+    return samples;
+  }
+
+  // Signed little-endian widths share one path: readIntLE sign-extends for us,
+  // and full scale is 2^(bits-1) (32768, 8388608 or 2147483648).
+  const fullScale = 2 ** (bitsPerSample - 1);
+  for (let i = 0; i < sampleCount; i++) {
+    samples[i] = buffer.readIntLE(dataOffset + i * bytesPerSample, bytesPerSample) / fullScale;
+  }
+
+  return samples;
+}
+
+/**
+ * Downmix interleaved multi-channel audio to a single channel.
+ *
+ * Each output sample is the arithmetic mean of one interleaved frame. Mono
+ * input is returned unchanged (same array instance). Trailing samples that do
+ * not form a complete frame are ignored.
+ */
+function convertToMono(samples: Float64Array, numChannels: number): Float64Array {
+  if (numChannels === 1) {
+    return samples;
+  }
+
+  const totalFrames = Math.floor(samples.length / numChannels);
+  const downmixed = new Float64Array(totalFrames);
+
+  for (let frame = 0; frame < totalFrames; frame++) {
+    const base = frame * numChannels;
+    let acc = 0;
+    for (let channel = 0; channel < numChannels; channel++) {
+      acc += samples[base + channel];
+    }
+    downmixed[frame] = acc / numChannels;
+  }
+
+  return downmixed;
+}
+
+/**
+ * Change the sample rate by linear interpolation between neighbouring
+ * input samples.
+ *
+ * Input is returned unchanged (same array instance) when the rates already
+ * match. Output positions that fall on or beyond the final input sample
+ * repeat that final sample. No filtering is applied before interpolation.
+ */
+function resample(samples: Float64Array, fromRate: number, toRate: number): Float64Array {
+  if (fromRate === toRate) {
+    return samples;
+  }
+
+  // Distance travelled in the input for every output sample.
+  const step = fromRate / toRate;
+  const resampled = new Float64Array(Math.floor(samples.length / step));
+  const lastIndex = samples.length - 1;
+
+  for (let n = 0; n < resampled.length; n++) {
+    const position = n * step;
+    const left = Math.floor(position);
+
+    if (left >= lastIndex) {
+      // No right-hand neighbour to blend with: hold the last value.
+      resampled[n] = samples[lastIndex];
+      continue;
+    }
+
+    const weight = position - left;
+    resampled[n] = samples[left] * (1 - weight) + samples[left + 1] * weight;
+  }
+
+  return resampled;
+}
+
+/**
+ * Peak-normalize a signal so that its largest absolute sample sits at
+ * `targetDbfs` (decibels relative to full scale).
+ *
+ * The input array itself is returned when it is all zeros or when the
+ * required gain is within 0.001 of unity; otherwise a new scaled array is
+ * returned and the input is left untouched.
+ */
+function normalize(samples: Float64Array, targetDbfs: number): Float64Array {
+  const peak = samples.reduce((highest, sample) => {
+    const magnitude = Math.abs(sample);
+    return magnitude > highest ? magnitude : highest;
+  }, 0);
+
+  // Pure silence has no level to scale.
+  if (peak === 0) {
+    return samples;
+  }
+
+  // dBFS -> linear amplitude, divided by the current peak.
+  const gain = 10 ** (targetDbfs / 20) / peak;
+  if (Math.abs(gain - 1) < 0.001) {
+    return samples;
+  }
+
+  return samples.map((sample) => sample * gain);
+}
+
+/**
+ * Root-mean-square level of a block of samples: the square root of the
+ * average squared sample value.
+ */
+function rmsEnergy(frame: Float64Array): number {
+  let squaredTotal = 0;
+  for (const sample of frame) {
+    squaredTotal += sample * sample;
+  }
+  return Math.sqrt(squaredTotal / frame.length);
+}
+
+/**
+ * Energy-based voice activity trim.
+ *
+ * The signal is cut into fixed frames of VAD_FRAME_MS milliseconds. A frame
+ * counts as speech when its RMS reaches VAD_ENERGY_THRESHOLD times the mean
+ * frame RMS (with an absolute floor of 0.001). The result spans from the
+ * start of the first run of VAD_MIN_SPEECH_FRAMES consecutive speech frames
+ * to the end of the last such run, so pauses in between are kept.
+ *
+ * The input is returned as-is when it holds fewer than two whole frames or
+ * when no qualifying speech run is found. Otherwise the result is a
+ * `subarray` view onto the input, not a copy.
+ */
+function vadTrim(samples: Float64Array, sampleRate: number): Float64Array {
+  const frameSize = Math.floor(sampleRate * VAD_FRAME_MS / 1000);
+  const numFrames = Math.floor(samples.length / frameSize);
+
+  if (numFrames < 2) {
+    return samples;
+  }
+
+  // Per-frame RMS, accumulating the total for the mean in the same pass.
+  const energies = new Float64Array(numFrames);
+  let energyTotal = 0;
+  for (let f = 0; f < numFrames; f++) {
+    const begin = f * frameSize;
+    energies[f] = rmsEnergy(samples.subarray(begin, begin + frameSize));
+    energyTotal += energies[f];
+  }
+
+  // Adaptive threshold relative to the average level, never below 0.001.
+  const threshold = Math.max((energyTotal / numFrames) * VAD_ENERGY_THRESHOLD, 0.001);
+
+  const voiced = new Array<boolean>(numFrames);
+  for (let f = 0; f < numFrames; f++) {
+    voiced[f] = energies[f] >= threshold;
+  }
+
+  // Walk the frames in one direction and report the frame at which a run of
+  // VAD_MIN_SPEECH_FRAMES consecutive voiced frames is completed (-1 if none).
+  const findRunCompletion = (start: number, step: 1 | -1): number => {
+    let run = 0;
+    for (let f = start; f >= 0 && f < numFrames; f += step) {
+      if (!voiced[f]) {
+        run = 0;
+        continue;
+      }
+      run++;
+      if (run >= VAD_MIN_SPEECH_FRAMES) {
+        return f;
+      }
+    }
+    return -1;
+  };
+
+  const forwardHit = findRunCompletion(0, 1);
+  const backwardHit = findRunCompletion(numFrames - 1, -1);
+
+  // Step back (or forward) to the outer edge of the run that was completed.
+  const firstSpeech = forwardHit === -1 ? -1 : forwardHit - VAD_MIN_SPEECH_FRAMES + 1;
+  const lastSpeech = backwardHit === -1 ? -1 : backwardHit + VAD_MIN_SPEECH_FRAMES - 1;
+
+  if (firstSpeech === -1 || lastSpeech === -1 || firstSpeech >= lastSpeech) {
+    // No clear speech region: leave the audio untouched.
+    return samples;
+  }
+
+  const startSample = Math.max(0, firstSpeech * frameSize);
+  const endSample = Math.min(samples.length, (lastSpeech + 1) * frameSize);
+
+  return samples.subarray(startSample, endSample);
+}
+
+/**
+ * Quantize floating-point samples to 16-bit signed PCM.
+ *
+ * Values are limited to [-1.0, 1.0], scaled by 32767 and rounded with
+ * `Math.round`, so the output range is -32767..32767.
+ */
+function floatTo16BitPcm(samples: Float64Array): Int16Array {
+  return Int16Array.from(samples, (value) => {
+    const bounded = value > 1 ? 1 : value < -1 ? -1 : value;
+    return Math.round(bounded * 32767);
+  });
+}
+
+/**
+ * Audio quality metrics for confidence scoring.
+ *
+ * - `rmsEnergy`: root-mean-square of all samples.
+ * - `peakAmplitude`: largest absolute sample value.
+ * - `snrEstimate`: 20*log10(rms / noiseFloor) in dB, clamped to 0..40, where
+ *   the noise floor is the 10th-percentile absolute sample value. When that
+ *   percentile is exactly zero but the signal is not silent, the estimate is
+ *   reported as the 40 dB ceiling.
+ *
+ * Empty input and all-zero input yield zeros for every metric: there is no
+ * signal to rate, so neither NaN nor the maximum SNR would be meaningful.
+ *
+ * @param samples - Signal to analyse; not modified.
+ */
+function computeQualityMetrics(samples: Float64Array): {
+  rmsEnergy: number;
+  peakAmplitude: number;
+  snrEstimate: number;
+} {
+  const count = samples.length;
+  if (count === 0) {
+    return { rmsEnergy: 0, peakAmplitude: 0, snrEstimate: 0 };
+  }
+
+  // One pass gathers the peak, the energy sum, and the magnitudes that are
+  // sorted below; the caller's array is never reordered.
+  const magnitudes = new Float64Array(count);
+  let peak = 0;
+  let sumSq = 0;
+  for (let i = 0; i < count; i++) {
+    const abs = Math.abs(samples[i]);
+    magnitudes[i] = abs;
+    if (abs > peak) peak = abs;
+    sumSq += abs * abs;
+  }
+
+  const rms = Math.sqrt(sumSq / count);
+  if (rms === 0) {
+    // Digital silence carries no signal, so it must not score as "clean".
+    return { rmsEnergy: 0, peakAmplitude: peak, snrEstimate: 0 };
+  }
+
+  // Simple SNR estimate: treat the 10th-percentile magnitude as the noise floor.
+  magnitudes.sort();
+  const noiseFloor = magnitudes[Math.floor(count * 0.1)];
+  const snr = noiseFloor > 0 ? 20 * Math.log10(rms / noiseFloor) : 40;
+
+  return {
+    rmsEnergy: rms,
+    peakAmplitude: peak,
+    snrEstimate: Math.min(40, Math.max(0, snr)),
+  };
+}
+
+/**
+ * Entry point of the preprocessing pipeline. Takes the bytes of a WAV file
+ * and produces 16kHz mono 16-bit PCM together with quality metrics.
+ *
+ * Stages, in order:
+ * 1. Reject anything that does not start with a RIFF magic
+ * 2. Parse the WAV header and decode the PCM samples
+ * 3. Downmix to mono
+ * 4. Measure quality metrics (on the mono signal, before any further processing)
+ * 5. Resample to 16kHz
+ * 6. Peak-normalize to TARGET_DBFS
+ * 7. Trim leading/trailing silence with the VAD
+ * 8. Keep at most MAX_DURATION_SEC seconds from the start
+ * 9. Quantize to 16-bit PCM
+ *
+ * Failures are thrown as `Error`; because the function is `async`, callers
+ * observe them as a rejected promise.
+ */
+export async function preprocessAudio(inputBuffer: Buffer): Promise<ProcessedAudio> {
+  const targetRate = 16000;
+
+  // Cheap container sniff before any real parsing.
+  const hasRiffMagic =
+    inputBuffer.length >= 4 && inputBuffer.toString("ascii", 0, 4) === "RIFF";
+  if (!hasRiffMagic) {
+    throw new Error(
+      "Unsupported audio format. Only WAV files are supported. " +
+      "Please upload a WAV file (PCM encoded).",
+    );
+  }
+
+  const wav = parseWavHeader(inputBuffer);
+  const decoded = readPcmSamples(inputBuffer, wav.header, wav.dataOffset);
+  const mono = convertToMono(decoded, wav.header.numChannels);
+
+  // Quality figures describe the source signal, so take them before resampling and gain.
+  const quality = computeQualityMetrics(mono);
+
+  const atTargetRate = resample(mono, wav.header.sampleRate, targetRate);
+  const leveled = normalize(atTargetRate, TARGET_DBFS);
+  const trimmed = vadTrim(leveled, targetRate);
+
+  // Hard cap on length (480,000 samples at 16kHz for 30 seconds).
+  const sampleCap = targetRate * MAX_DURATION_SEC;
+  const finalSamples = trimmed.length > sampleCap ? trimmed.subarray(0, sampleCap) : trimmed;
+
+  const pcmSamples = floatTo16BitPcm(finalSamples);
+
+  // Headerless PCM bytes for the API, backed by the same memory as pcmSamples.
+  const pcmBuffer = Buffer.from(pcmSamples.buffer);
+
+  return {
+    samples: pcmSamples,
+    sampleRate: targetRate,
+    channels: 1,
+    duration: finalSamples.length / targetRate,
+    pcmBuffer,
+    rmsEnergy: quality.rmsEnergy,
+    peakAmplitude: quality.peakAmplitude,
+    snrEstimate: quality.snrEstimate,
+  };
+}