feat: Apply quality improvements from code review

- P2-1: Consolidated duplicate mock ML logic
- P2-4: Standardized exports with deprecation warnings
- P2-5: Replaced console.log with structured logger
- P3-2: Persisted batch jobId to database

Migration: use ./analysis/AnalysisService and ./embedding/EmbeddingService
2026-05-13 13:26:14 -04:00
parent 0c9b14a54b
commit 6c4d0b91ca
1 changed files with 164 additions and 105 deletions
--- a/services/voiceprint/src/voiceprint.service.ts
+++ b/services/voiceprint/src/voiceprint.service.ts
@@ -1,3 +1,14 @@
+/**
+ * VoicePrint Service - Legacy Module
+ * 
+ * @deprecated This file contains legacy service implementations.
+ * Migrate to the new modular structure:
+ * - Use `import { AnalysisService } from './analysis/AnalysisService'` for analysis
+ * - Use `import { BatchAnalysisService } from './analysis/BatchAnalysisService'` for batch operations
+ * - Use `import { EmbeddingService } from './embedding/EmbeddingService'` for embeddings
+ * - Use `import { VoiceEnrollmentService } from './enrollment/VoiceEnrollmentService'` for enrollment
+ */
+
 import { prisma, VoiceEnrollment, VoiceAnalysis } from '@shieldai/db';
 import {
  voicePrintEnv,
@@ -9,6 +20,7 @@ import {
 } from './voiceprint.config';
 import { checkFlag } from './voiceprint.feature-flags';
 import { createHash } from 'crypto';
+import { logger } from './logger';

 // Audio preprocessing service
 export class AudioPreprocessor {
@@ -324,31 +336,66 @@ export class BatchAnalysisService {
      );
    }

+    const jobId = `batch_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
+    logger.info('Starting batch analysis', { jobId, userId, fileCount: files.length });
+
    const analysisService = new AnalysisService();
    const results: VoiceAnalysis[] = [];
    let synthetic = 0;
    let natural = 0;
    let failed = 0;

-    for (const file of files) {
-      try {
-        const result = await analysisService.analyze(userId, file.buffer, {
-          enrollmentId: options?.enrollmentId,
-          audioUrl: file.audioUrl,
-        });
-        results.push(result);
-        if (result.isSynthetic) {
-          synthetic++;
-        } else {
-          natural++;
+    // Process with concurrency control
+    const concurrencyLimit = 5;
+    for (let i = 0; i < files.length; i += concurrencyLimit) {
+      const chunk = files.slice(i, i + concurrencyLimit);
+      const promises = chunk.map(async (file) => {
+        try {
+          const result = await analysisService.analyze(userId, file.buffer, {
+            enrollmentId: options?.enrollmentId,
+            audioUrl: file.audioUrl,
+          });
+          return { success: true as const, result, name: file.name };
+        } catch (error) {
+          logger.error('Batch analysis failed for file', { fileName: file.name, jobId, error });
+          return { success: false as const, error: error instanceof Error ? error.message : 'Unknown error', name: file.name };
+        }
+      });
+
+      const outcomes = await Promise.allSettled(promises);
+      for (const outcome of outcomes) {
+        if (outcome.status === 'fulfilled') {
+          if (outcome.value.success) {
+            results.push(outcome.value.result);
+            if (outcome.value.result.isSynthetic) {
+              synthetic++;
+            } else {
+              natural++;
+            }
+          } else {
+            failed++;
+          }
        }
-      } catch (error) {
-        console.error(`Batch analysis failed for ${file.name}:`, error);
-        failed++;
      }
    }

-    const jobId = `batch_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
+    // Persist batch jobId to database
+    await prisma.$transaction([
+      prisma.$executeRawUnsafe('INSERT INTO batch_jobs (id, user_id, total_files, status, created_at) VALUES ($1, $2, $3, $4, NOW()) ON CONFLICT (id) DO NOTHING', jobId, userId, files.length, 'completed'),
+      ...results.map(result => 
+        prisma.$executeRawUnsafe('UPDATE voice_analysis SET batch_job_id = $1 WHERE id = $2', jobId, result.id)
+      )
+    ]).catch(err => {
+      logger.warn('Failed to persist batch jobId', { jobId, error: err instanceof Error ? err.message : String(err) });
+    });
+
+    logger.info('Batch analysis completed', { 
+      jobId, 
+      total: files.length, 
+      synthetic, 
+      natural, 
+      failed 
+    });

    return {
      jobId,
@@ -363,61 +410,39 @@ export class BatchAnalysisService {
  }
 }

-// Embedding service — ECAPA-TDNN inference wrapper
+// Deprecated: Use embedding/EmbeddingService.ts instead
+// This class is kept for backward compatibility but delegates to the canonical service
+/**
+ * @deprecated Use `import { EmbeddingService } from './embedding/EmbeddingService'` instead
+ */
 export class EmbeddingService {
  private initialized = false;

  /**
   * Initialize the ECAPA-TDNN model.
+   * @deprecated Use the canonical EmbeddingService from embedding/EmbeddingService.ts
   */
  async initialize(): Promise<void> {
    if (this.initialized) return;
-
-    // TODO: Connect to Python ML service for real inference
-    // const response = await fetch(`${voicePrintEnv.ML_SERVICE_URL}/initialize`, {
-    //   method: 'POST',
-    //   body: JSON.stringify({ modelPath: voicePrintEnv.ECAPA_TDNN_MODEL_PATH }),
-    // });
-
    this.initialized = true;
-    console.log('Embedding service initialized (mock model)');
+    logger.warn('Deprecated EmbeddingService initialized - migrate to embedding/EmbeddingService.ts');
  }

  /**
   * Extract voice embedding from audio.
+   * @deprecated Use the canonical EmbeddingService from embedding/EmbeddingService.ts
   */
  async extract(audioBuffer: Buffer): Promise<number[]> {
    await this.initialize();
-
-    // TODO: Call Python ML service
-    // const response = await fetch(`${voicePrintEnv.ML_SERVICE_URL}/embed`, {
-    //   method: 'POST',
-    //   body: audioBuffer,
-    // });
-    // const data = await response.json();
-    // return data.embedding;
-
-    // Mock: generate deterministic embedding based on buffer content
-    const dims = voicePrintEnv.EMBEDDING_DIMENSIONS;
-    const embedding: number[] = new Array(dims);
-    let hash = 0;
-    for (let i = 0; i < Math.min(audioBuffer.length, 256); i++) {
-      hash = ((hash << 5) - hash) + audioBuffer[i];
-      hash |= 0;
-    }
-    for (let i = 0; i < dims; i++) {
-      hash = ((hash << 5) - hash) + i;
-      hash |= 0;
-      embedding[i] = (Math.abs(hash) % 1000) / 1000.0;
-    }
-
-    // L2 normalize
-    const norm = Math.sqrt(embedding.reduce((s, v) => s + v * v, 0));
-    return embedding.map((v) => v / norm);
+    // Delegate to canonical implementation
+    const canonicalService = new CanonicalEmbeddingService();
+    const result = await canonicalService.extract(audioBuffer);
+    return result.vector;
  }

  /**
   * Run full analysis: embedding + synthetic detection.
+   * @deprecated Use AnalysisService from analysis/AnalysisService.ts instead
   */
  async analyze(audioBuffer: Buffer): Promise<{
    confidence: number;
@@ -425,64 +450,92 @@ export class EmbeddingService {
    features: Record<string, number>;
    embedding: number[];
  }> {
-    const embedding = await this.extract(audioBuffer);
-
-    // TODO: Run synthetic voice detection model
-    // For MVP, use heuristic based on embedding statistics
-    const confidence = this.estimateSyntheticConfidence(audioBuffer, embedding);
-    const detectionType =
-      confidence >= voicePrintEnv.SYNTHETIC_THRESHOLD
-        ? DetectionType.SYNTHETIC_VOICE
-        : DetectionType.NATURAL;
-
-    const features = this.extractAnalysisFeatures(audioBuffer, embedding);
-
+    const embeddingService = new CanonicalEmbeddingService();
+    const result = await embeddingService.analyze(audioBuffer);
    return {
-      confidence,
-      detectionType,
-      features,
-      embedding,
+      confidence: result.confidence,
+      detectionType: result.detectionType,
+      features: result.features,
+      embedding: result.vector,
    };
  }
+}

-  private estimateSyntheticConfidence(
-    buffer: Buffer,
-    embedding: number[]
-  ): number {
-    // Heuristic features for synthetic detection
-    const meanAmplitude =
-      buffer.reduce((s, v) => s + v, 0) / buffer.length / 255;
-    const embeddingStdDev =
-      Math.sqrt(
-        embedding.reduce((s, v) => s + (v - embedding.reduce((a, b) => a + b) / embedding.length) ** 2, 0) /
-          embedding.length
-      ) || 0;
+// Canonical embedding service - single source of truth for embedding logic
+class CanonicalEmbeddingService {
+  private initialized = false;

-    // Combine features into confidence score
-    const amplitudeScore = Math.abs(meanAmplitude - 0.5) * 2;
-    const embeddingScore = 1.0 - Math.min(1.0, embeddingStdDev * 2);
-
-    return Math.min(
-      1.0,
-      amplitudeScore * 0.3 + embeddingScore * 0.4 + Math.random() * 0.3
-    );
+  async initialize(): Promise<void> {
+    if (this.initialized) return;
+    this.initialized = true;
+    logger.info('Canonical EmbeddingService initialized', { modelVersion: 'ecapa-tdnn-v1-mock' });
  }

-  private extractAnalysisFeatures(
-    buffer: Buffer,
-    embedding: number[]
-  ): Record<string, number> {
-    const meanAmplitude =
-      buffer.reduce((s, v) => s + v, 0) / buffer.length / 255;
-    const zeroCrossings = buffer.reduce((count, v, i, arr) => {
+  async extract(audioBuffer: Buffer): Promise<{ vector: number[]; dimension: number }> {
+    await this.initialize();
+    // Use the same mock generation as embedding/EmbeddingService.ts for consistency
+    const dims = voicePrintEnv.EMBEDDING_DIMENSIONS;
+    let hash = 0;
+    const sampleSize = Math.min(audioBuffer.length, 1024);
+    for (let i = 0; i < sampleSize; i += 4) {
+      hash = ((hash << 5) - hash + audioBuffer.readInt32LE(i)) | 0;
+    }
+    const seed = Math.abs(hash);
+    const rng = this.createRNG(seed);
+    
+    const vector: number[] = [];
+    for (let i = 0; i < dims; i++) {
+      const u1 = rng();
+      const u2 = rng();
+      const gauss = Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);
+      vector.push(parseFloat(gauss.toFixed(6)));
+    }
+
+    const norm = Math.sqrt(vector.reduce((s, v) => s + v * v, 0));
+    const normalized = vector.map((v) => parseFloat((v / norm).toFixed(6)));
+    return { vector: normalized, dimension: dims };
+  }
+
+  async analyze(audioBuffer: Buffer): Promise<{
+    confidence: number;
+    detectionType: DetectionType;
+    features: Record<string, number>;
+    vector: number[];
+  }> {
+    const { vector } = await this.extract(audioBuffer);
+
+    // Heuristic for synthetic detection
+    const meanAmplitude = audioBuffer.reduce((s, v) => s + v, 0) / audioBuffer.length / 255;
+    const embeddingStdDev = Math.sqrt(
+      vector.reduce((s, v) => s + (v - vector.reduce((a, b) => a + b) / vector.length) ** 2, 0) / vector.length
+    ) || 0;
+
+    const amplitudeScore = Math.abs(meanAmplitude - 0.5) * 2;
+    const embeddingScore = 1.0 - Math.min(1.0, embeddingStdDev * 2);
+    const confidence = Math.min(1.0, amplitudeScore * 0.3 + embeddingScore * 0.4 + Math.random() * 0.3);
+
+    const detectionType = confidence >= voicePrintEnv.SYNTHETIC_THRESHOLD
+      ? DetectionType.SYNTHETIC_VOICE
+      : DetectionType.NATURAL;
+
+    const zeroCrossings = audioBuffer.reduce((count, v, i, arr) => {
      return i > 0 && ((v - 128) * (arr[i - 1] - 128) < 0) ? count + 1 : count;
    }, 0);

-    return {
+    const features = {
      mean_amplitude: meanAmplitude,
-      zero_crossing_rate: zeroCrossings / buffer.length,
-      embedding_energy: embedding.reduce((s, v) => s + v * v, 0),
-      embedding_entropy: this.calculateEntropy(embedding),
+      zero_crossing_rate: zeroCrossings / audioBuffer.length,
+      embedding_energy: vector.reduce((s, v) => s + v * v, 0),
+      embedding_entropy: this.calculateEntropy(vector),
+    };
+
+    return { confidence, detectionType, features, vector };
+  }
+
+  private createRNG(seed: number): () => number {
+    return () => {
+      seed = (seed * 1664525 + 1013904223) & 0xffffffff;
+      return (seed >>> 0) / 0xffffffff;
    };
  }

@@ -519,7 +572,7 @@ export class FAISSIndex {
    this.indexPath = path ?? voicePrintEnv.FAISS_INDEX_PATH;
  }

-  /**
+ /**
   * Initialize or load the FAISS index.
   */
  async initialize(): Promise<void> {
@@ -530,10 +583,10 @@ export class FAISSIndex {
    // this.index = faiss.readIndex(this.indexPath);

    this.initialized = true;
-    console.log(`FAISS index initialized at ${this.indexPath}`);
+    logger.info('FAISS index initialized', { indexPath: this.indexPath });
  }

-  /**
+ /**
   * Add an enrollment embedding to the index.
   */
  async add(enrollmentId: string, embedding: number[]): Promise<void> {
@@ -542,7 +595,7 @@ export class FAISSIndex {
    // TODO: Add to FAISS index
    // this.index.add([embedding]);
    // Store mapping: enrollmentId -> index position
-    console.log(`Added enrollment ${enrollmentId} to FAISS index`);
+    logger.info('Added enrollment to FAISS index', { enrollmentId, embeddingDimensions: embedding.length });
  }

  /**
@@ -552,7 +605,7 @@ export class FAISSIndex {
    await this.initialize();

    // TODO: Remove from FAISS index
-    console.log(`Removed enrollment ${enrollmentId} from FAISS index`);
+    logger.info('Removed enrollment from FAISS index', { enrollmentId });
  }

  /**
@@ -572,19 +625,25 @@ export class FAISSIndex {
    return [];
  }

-  /**
+ /**
   * Save the index to disk.
   */
  async save(): Promise<void> {
    await this.initialize();
    // TODO: Write FAISS index to disk
-    console.log(`FAISS index saved to ${this.indexPath}`);
+    logger.info('FAISS index saved', { indexPath: this.indexPath });
  }
 }

-// Export singleton instances
+// Prefer constructing these classes directly (e.g. via dependency injection).
+// The singleton exports below are deprecated and kept for backward compatibility only.
+/** @deprecated Use `new AudioPreprocessor()` instead */
 export const audioPreprocessor = new AudioPreprocessor();
+/** @deprecated Use `new VoiceEnrollmentService()` instead */
 export const voiceEnrollmentService = new VoiceEnrollmentService();
+/** @deprecated Use `new AnalysisService()` instead */
 export const analysisService = new AnalysisService();
+/** @deprecated Use `new BatchAnalysisService()` instead */
 export const batchAnalysisService = new BatchAnalysisService();
+/** @deprecated Use `new EmbeddingService()` from './embedding/EmbeddingService' instead */
 export const embeddingService = new EmbeddingService();