feat: integrate Datadog APM + Sentry error tracking with CloudWatch metrics FRE-4806

- Add CloudWatch metrics emitter (api_latency, api_requests, api_errors)
- Add request monitoring middleware for API (latency, error rate, throughput)
- Register error-handling, logging, and monitoring middleware in server.ts
- Add Datadog log forwarding via HTTP intake API
- Add application-level CloudWatch alarms for P99 latency, error rate, throughput
- Inject Datadog/Sentry env vars and secrets into ECS task definitions
- Add DD_API_KEY and SENTRY_DSN to ECS secrets
- Create CloudWatch log groups for datadog and sentry services
- Update .env.example with AWS_REGION and monitoring variables
- Add @aws-sdk/client-cloudwatch dependency to monitoring package

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
2026-05-10 02:15:11 -04:00
parent 57a206d7b3
commit c7df40ac26
18 changed files with 5260 additions and 76 deletions

View File

@@ -4,3 +4,22 @@ PORT=3000
LOG_LEVEL=info LOG_LEVEL=info
HIBP_API_KEY="" HIBP_API_KEY=""
RESEND_API_KEY="" RESEND_API_KEY=""
# Region used by the CloudWatch metrics client (falls back to us-east-1 when unset).
AWS_REGION="us-east-1"
# Datadog APM Configuration
DD_SERVICE="shieldai-api"
DD_ENV="development"
DD_VERSION="0.1.0"
DD_TRACE_ENABLED="true"
# Parsed as a number by the monitoring config; 1.0 traces every request (dev setting).
DD_TRACE_SAMPLE_RATE="1.0"
DD_LOGS_INJECTION="true"
DD_AGENT_HOST="localhost"
DD_AGENT_PORT="8126"
# Required for log forwarding to the Datadog HTTP intake; forwarding is disabled when empty.
DD_API_KEY=""
DD_SITE="datadoghq.com"
# Sentry Error Tracking
# Error tracking is disabled entirely when SENTRY_DSN is left empty.
SENTRY_DSN=""
SENTRY_ENVIRONMENT="development"
SENTRY_RELEASE="0.1.0"
SENTRY_TRACES_SAMPLE_RATE="0.1"

View File

@@ -1,5 +1,17 @@
version: '3.9' version: '3.9'
# Shared monitoring environment variables, merged into each service's
# `environment:` mapping via `<<: *monitoring`. YAML merge-key semantics:
# a key set explicitly in a service (e.g. DD_SERVICE in darkwatch) overrides
# the merged value regardless of where the `<<:` line appears.
x-monitoring: &monitoring
  DD_ENV: ${DD_ENV:-production}
  DD_SERVICE: ${DD_SERVICE:-shieldai}
  DD_VERSION: ${DOCKER_TAG:-latest}
  DD_TRACE_ENABLED: ${DD_TRACE_ENABLED:-true}
  # Services reach the agent by its compose service name on the shared network.
  DD_AGENT_HOST: datadog-agent
  DD_AGENT_PORT: "8126"
  DD_LOGS_INJECTION: "true"
  SENTRY_DSN: ${SENTRY_DSN:-}
  SENTRY_ENVIRONMENT: ${DD_ENV:-production}
  SENTRY_RELEASE: ${DOCKER_TAG:-latest}
services: services:
api: api:
image: ghcr.io/${GITHUB_REPOSITORY_OWNER}/shieldai-api:${DOCKER_TAG:-latest} image: ghcr.io/${GITHUB_REPOSITORY_OWNER}/shieldai-api:${DOCKER_TAG:-latest}
@@ -7,12 +19,13 @@ services:
ports: ports:
- "${PORT:-3000}:3000" - "${PORT:-3000}:3000"
environment: environment:
- DATABASE_URL=postgresql://shieldai:${POSTGRES_PASSWORD}@postgres:5432/shieldai DATABASE_URL: "postgresql://shieldai:${POSTGRES_PASSWORD}@postgres:5432/shieldai"
- REDIS_URL=redis://redis:6379 REDIS_URL: "redis://redis:6379"
- PORT=3000 PORT: "3000"
- LOG_LEVEL=info LOG_LEVEL: info
- HIBP_API_KEY=${HIBP_API_KEY} HIBP_API_KEY: ${HIBP_API_KEY}
- RESEND_API_KEY=${RESEND_API_KEY} RESEND_API_KEY: ${RESEND_API_KEY}
<<: *monitoring
depends_on: depends_on:
postgres: postgres:
condition: service_healthy condition: service_healthy
@@ -25,9 +38,11 @@ services:
image: ghcr.io/${GITHUB_REPOSITORY_OWNER}/shieldai-darkwatch:${DOCKER_TAG:-latest} image: ghcr.io/${GITHUB_REPOSITORY_OWNER}/shieldai-darkwatch:${DOCKER_TAG:-latest}
restart: unless-stopped restart: unless-stopped
environment: environment:
- DATABASE_URL=postgresql://shieldai:${POSTGRES_PASSWORD}@postgres:5432/shieldai DATABASE_URL: "postgresql://shieldai:${POSTGRES_PASSWORD}@postgres:5432/shieldai"
- REDIS_URL=redis://redis:6379 REDIS_URL: "redis://redis:6379"
- HIBP_API_KEY=${HIBP_API_KEY} HIBP_API_KEY: ${HIBP_API_KEY}
DD_SERVICE: "shieldai-darkwatch"
<<: *monitoring
depends_on: depends_on:
postgres: postgres:
condition: service_healthy condition: service_healthy
@@ -40,8 +55,10 @@ services:
image: ghcr.io/${GITHUB_REPOSITORY_OWNER}/shieldai-spamshield:${DOCKER_TAG:-latest} image: ghcr.io/${GITHUB_REPOSITORY_OWNER}/shieldai-spamshield:${DOCKER_TAG:-latest}
restart: unless-stopped restart: unless-stopped
environment: environment:
- DATABASE_URL=postgresql://shieldai:${POSTGRES_PASSWORD}@postgres:5432/shieldai DATABASE_URL: "postgresql://shieldai:${POSTGRES_PASSWORD}@postgres:5432/shieldai"
- REDIS_URL=redis://redis:6379 REDIS_URL: "redis://redis:6379"
DD_SERVICE: "shieldai-spamshield"
<<: *monitoring
depends_on: depends_on:
postgres: postgres:
condition: service_healthy condition: service_healthy
@@ -54,8 +71,10 @@ services:
image: ghcr.io/${GITHUB_REPOSITORY_OWNER}/shieldai-voiceprint:${DOCKER_TAG:-latest} image: ghcr.io/${GITHUB_REPOSITORY_OWNER}/shieldai-voiceprint:${DOCKER_TAG:-latest}
restart: unless-stopped restart: unless-stopped
environment: environment:
- DATABASE_URL=postgresql://shieldai:${POSTGRES_PASSWORD}@postgres:5432/shieldai DATABASE_URL: "postgresql://shieldai:${POSTGRES_PASSWORD}@postgres:5432/shieldai"
- REDIS_URL=redis://redis:6379 REDIS_URL: "redis://redis:6379"
DD_SERVICE: "shieldai-voiceprint"
<<: *monitoring
depends_on: depends_on:
postgres: postgres:
condition: service_healthy condition: service_healthy
@@ -64,6 +83,29 @@ services:
networks: networks:
- shieldai - shieldai
# Datadog agent sidecar: receives APM traces (8126) and DogStatsD metrics
# (8125/udp) from the application services and tails container logs via the
# mounted docker socket.
datadog-agent:
  image: datadog/agent:7
  restart: unless-stopped
  environment:
    DD_API_KEY: ${DD_API_KEY}
    DD_SITE: ${DD_SITE:-datadoghq.com}
    DD_ENV: ${DD_ENV:-production}
    DD_DOGSTATSD_NON_LOCAL_TRAFFIC: "true"
    DD_APM_ENABLED: "true"
    DD_APM_NON_LOCAL_TRAFFIC: "true"
    DD_LOGS_ENABLED: "true"
    DD_LOGS_CONFIG_CONTAINER_COLLECT_ALL: "true"
    # Bug fix: the agent health endpoint is enabled by setting DD_HEALTH_PORT
    # to a port number; "DD_HEALTH_PORT_ENABLE" is not a recognized agent
    # variable and was silently ignored.
    DD_HEALTH_PORT: "5555"
  ports:
    - "8125:8125/udp"
    - "8126:8126"
  volumes:
    # Read-only docker socket for container discovery + log collection.
    - /var/run/docker.sock:/var/run/docker.sock:ro
    - /proc/:/host/proc/:ro
    - /sys/fs/cgroup:/host/sys/fs/cgroup:ro
  networks:
    - shieldai
postgres: postgres:
image: postgres:16-alpine image: postgres:16-alpine
restart: unless-stopped restart: unless-stopped

View File

@@ -23,6 +23,27 @@ variable "cache_endpoint" {
type = string type = string
} }
# Email recipient for alarm notifications. The SNS email subscription stays
# "pending confirmation" until this address clicks the link AWS sends.
variable "alert_email" {
  description = "Email address for alert notifications"
  type        = string
  default     = "ops@shieldai.com"
}

# SNS topic that every CloudWatch alarm in this module publishes to via
# alarm_actions.
resource "aws_sns_topic" "alerts" {
  name = "${var.project_name}-${var.environment}-alerts"

  tags = {
    Environment = var.environment
    Project     = var.project_name
  }
}

resource "aws_sns_topic_subscription" "alerts_email" {
  topic_arn = aws_sns_topic.alerts.arn
  protocol  = "email"
  endpoint  = var.alert_email
}
resource "aws_cloudwatch_dashboard" "main" { resource "aws_cloudwatch_dashboard" "main" {
dashboard_name = "${var.project_name}-${var.environment}-dashboard" dashboard_name = "${var.project_name}-${var.environment}-dashboard"
@@ -92,6 +113,120 @@ resource "aws_cloudwatch_dashboard" "main" {
region = "us-east-1" region = "us-east-1"
period = 60 period = 60
} }
},
{
type = "metric"
properties = {
title = "P99 Latency (Target Group)"
metrics = [
["AWS/ApplicationELB", "TargetResponseTime", "LoadBalancer", "${var.cluster_name}-alb", "Statistic", "p99"],
["AWS/ApplicationELB", "TargetResponseTime", "LoadBalancer", "${var.cluster_name}-alb", "Statistic", "p95"]
]
view = "timeSeries"
stacked = false
region = "us-east-1"
period = 60
}
},
{
type = "metric"
properties = {
title = "Error Rate (5xx / Total)"
metrics = [
["AWS/ApplicationELB", "HTTPCode_Elb_5XX_Count", "LoadBalancer", "${var.cluster_name}-alb"],
["AWS/ApplicationELB", "HTTPCode_Elb_4XX_Count", "LoadBalancer", "${var.cluster_name}-alb"]
]
view = "timeSeries"
stacked = false
region = "us-east-1"
period = 60
}
},
{
type = "metric"
properties = {
title = "Throughput (Request Count)"
metrics = [
["AWS/ApplicationELB", "RequestCount", "LoadBalancer", "${var.cluster_name}-alb"]
]
view = "timeSeries"
stacked = false
region = "us-east-1"
period = 60
yAxis = {
left = {
label = "Requests/sec"
}
}
}
},
{
type = "metric"
properties = {
title = "API Latency Percentiles"
metrics = [
["ShieldAI", "api_latency", "service", "api", "percentile", "p99", "statistic", "Average"],
["ShieldAI", "api_latency", "service", "api", "percentile", "p95", "statistic", "Average"],
["ShieldAI", "api_latency", "service", "api", "percentile", "p50", "statistic", "Average"]
]
view = "timeSeries"
stacked = false
region = "us-east-1"
period = 60
}
},
{
type = "metric"
properties = {
title = "API Error Rate"
metrics = [
["ShieldAI", "api_errors", "service", "api", "statistic", "Sum"]
]
view = "timeSeries"
stacked = false
region = "us-east-1"
period = 60
}
},
{
type = "metric"
properties = {
title = "API Throughput"
metrics = [
["ShieldAI", "api_requests", "service", "api", "statistic", "Sum"]
]
view = "timeSeries"
stacked = false
region = "us-east-1"
period = 60
}
},
{
type = "metric"
properties = {
title = "ECS Running Tasks"
metrics = [
["AWS/ECS", "RunningTaskCount", "ClusterName", var.cluster_name]
]
view = "timeSeries"
stacked = false
region = "us-east-1"
period = 60
}
},
{
type = "metric"
properties = {
title = "RDS Read/Write IOPS"
metrics = [
["AWS/RDS", "ReadIOPS", "DBInstanceIdentifier", var.rds_identifier],
["AWS/RDS", "WriteIOPS", "DBInstanceIdentifier", var.rds_identifier]
]
view = "timeSeries"
stacked = false
region = "us-east-1"
period = 60
}
} }
] ]
}) })
@@ -107,6 +242,7 @@ resource "aws_cloudwatch_metric_alarm" "ecs_cpu_high" {
statistic = "Average" statistic = "Average"
threshold = 80 threshold = 80
alarm_description = "ECS CPU utilization above 80%" alarm_description = "ECS CPU utilization above 80%"
alarm_actions = [aws_sns_topic.alerts.arn]
dimensions = { dimensions = {
ClusterName = var.cluster_name ClusterName = var.cluster_name
@@ -123,6 +259,7 @@ resource "aws_cloudwatch_metric_alarm" "ecs_memory_high" {
statistic = "Average" statistic = "Average"
threshold = 85 threshold = 85
alarm_description = "ECS memory utilization above 85%" alarm_description = "ECS memory utilization above 85%"
alarm_actions = [aws_sns_topic.alerts.arn]
dimensions = { dimensions = {
ClusterName = var.cluster_name ClusterName = var.cluster_name
@@ -139,6 +276,7 @@ resource "aws_cloudwatch_metric_alarm" "alb_5xx" {
statistic = "Sum" statistic = "Sum"
threshold = 10 threshold = 10
alarm_description = "ALB 5xx errors above 10 per minute" alarm_description = "ALB 5xx errors above 10 per minute"
alarm_actions = [aws_sns_topic.alerts.arn]
dimensions = { dimensions = {
LoadBalancer = "${var.cluster_name}-alb" LoadBalancer = "${var.cluster_name}-alb"
@@ -155,6 +293,7 @@ resource "aws_cloudwatch_metric_alarm" "rds_cpu_high" {
statistic = "Average" statistic = "Average"
threshold = 75 threshold = 75
alarm_description = "RDS CPU utilization above 75%" alarm_description = "RDS CPU utilization above 75%"
alarm_actions = [aws_sns_topic.alerts.arn]
dimensions = { dimensions = {
DBInstanceIdentifier = var.rds_identifier DBInstanceIdentifier = var.rds_identifier
@@ -171,13 +310,155 @@ resource "aws_cloudwatch_metric_alarm" "rds_free_storage" {
statistic = "Average" statistic = "Average"
threshold = 524288000 threshold = 524288000
alarm_description = "RDS free storage below 500MB" alarm_description = "RDS free storage below 500MB"
alarm_actions = [aws_sns_topic.alerts.arn]
dimensions = { dimensions = {
DBInstanceIdentifier = var.rds_identifier DBInstanceIdentifier = var.rds_identifier
} }
} }
# Fires when ALB-measured P99 target response time stays above 2s for
# 3 consecutive 1-minute periods.
resource "aws_cloudwatch_metric_alarm" "p99_latency_high" {
  alarm_name          = "${var.project_name}-${var.environment}-p99-latency-high"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 3
  metric_name         = "TargetResponseTime"
  namespace           = "AWS/ApplicationELB"
  period              = 60
  # Bug fix: percentile statistics must use `extended_statistic`; the plain
  # `statistic` argument only accepts SampleCount/Average/Sum/Minimum/Maximum,
  # so `statistic = "p99"` fails terraform validation.
  extended_statistic  = "p99"
  threshold           = 2
  alarm_description   = "P99 latency above 2 seconds"
  alarm_actions       = [aws_sns_topic.alerts.arn]

  dimensions = {
    # NOTE(review): the LoadBalancer dimension expects the ALB ARN suffix
    # (e.g. "app/<name>/<hash>"), not a bare name — confirm this value
    # matches what the ALB module exports.
    LoadBalancer = "${var.cluster_name}-alb"
  }
}
# Fires when ALB-generated 5xx responses exceed 5 per minute for 3 minutes.
# NOTE(review): despite the "error rate" name, this alarms on an absolute
# count, not a ratio of errors to total requests. Also HTTPCode_Elb_5XX_Count
# covers load-balancer-generated errors only; backend errors surface under
# HTTPCode_Target_5XX_Count — confirm which is intended.
resource "aws_cloudwatch_metric_alarm" "error_rate_high" {
  alarm_name          = "${var.project_name}-${var.environment}-error-rate-high"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 3
  metric_name         = "HTTPCode_Elb_5XX_Count"
  namespace           = "AWS/ApplicationELB"
  period              = 60
  statistic           = "Sum"
  threshold           = 5
  alarm_description   = "Error rate above 5 errors per minute"
  alarm_actions       = [aws_sns_topic.alerts.arn]

  dimensions = {
    # NOTE(review): expects the ALB ARN suffix, not a bare name — confirm.
    LoadBalancer = "${var.cluster_name}-alb"
  }
}
# Fires when traffic drops below 10 requests/minute for 5 consecutive
# minutes — a proxy for an outage upstream of (or at) the ALB.
resource "aws_cloudwatch_metric_alarm" "throughput_low" {
  alarm_name          = "${var.project_name}-${var.environment}-throughput-low"
  comparison_operator = "LessThanThreshold"
  evaluation_periods  = 5
  metric_name         = "RequestCount"
  namespace           = "AWS/ApplicationELB"
  period              = 60
  statistic           = "Sum"
  threshold           = 10
  alarm_description   = "Throughput below 10 requests per minute"
  alarm_actions       = [aws_sns_topic.alerts.arn]
  # Bug fix: RequestCount publishes no datapoints at all when there is zero
  # traffic, which left this alarm stuck in INSUFFICIENT_DATA during exactly
  # the condition it exists to catch. Treat missing data as breaching.
  treat_missing_data  = "breaching"

  dimensions = {
    # NOTE(review): expects the ALB ARN suffix, not a bare name — confirm.
    LoadBalancer = "${var.cluster_name}-alb"
  }
}
# Log group for the API service containers; 30-day retention.
resource "aws_cloudwatch_log_group" "api" {
  name              = "/${var.project_name}/${var.environment}/api"
  retention_in_days = 30

  tags = {
    Environment = var.environment
    Project     = var.project_name
    Service     = "api"
  }
}

# NOTE(review): the datadog and sentry log groups below are created, but
# nothing in this change visibly writes to them — confirm a log producer
# (e.g. an awslogs-configured container) actually targets these groups.
resource "aws_cloudwatch_log_group" "datadog" {
  name              = "/${var.project_name}/${var.environment}/datadog"
  retention_in_days = 30

  tags = {
    Environment = var.environment
    Project     = var.project_name
    Service     = "datadog"
  }
}

resource "aws_cloudwatch_log_group" "sentry" {
  name              = "/${var.project_name}/${var.environment}/sentry"
  retention_in_days = 30

  tags = {
    Environment = var.environment
    Project     = var.project_name
    Service     = "sentry"
  }
}
# Application-level latency alarm on the custom ShieldAI namespace.
# NOTE(review): the api_latency datapoints tagged percentile=p99 are raw
# per-request latency samples (the emitter publishes the same value under
# p50/p95/p99), so "Average" here averages raw latencies rather than
# computing a true p99 — confirm this matches the intended alerting
# semantics before relying on the 2000ms threshold.
resource "aws_cloudwatch_metric_alarm" "app_p99_latency_high" {
  alarm_name          = "${var.project_name}-${var.environment}-app-p99-latency-high"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 3
  metric_name         = "api_latency"
  namespace           = "ShieldAI"
  period              = 60
  statistic           = "Average"
  threshold           = 2000
  alarm_description   = "Application P99 latency above 2000ms"
  alarm_actions       = [aws_sns_topic.alerts.arn]

  dimensions = {
    service    = "api"
    percentile = "p99"
  }
}

# Absolute 5xx-error count emitted by the API's monitoring middleware.
resource "aws_cloudwatch_metric_alarm" "app_error_rate_high" {
  alarm_name          = "${var.project_name}-${var.environment}-app-error-rate-high"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 3
  metric_name         = "api_errors"
  namespace           = "ShieldAI"
  period              = 60
  statistic           = "Sum"
  threshold           = 10
  alarm_description   = "Application error count above 10 per minute"
  alarm_actions       = [aws_sns_topic.alerts.arn]

  dimensions = {
    service = "api"
  }
}

# Low-traffic alarm on the application-emitted request counter.
# NOTE(review): like RequestCount, api_requests publishes nothing at zero
# traffic; consider treat_missing_data = "breaching" here as well.
resource "aws_cloudwatch_metric_alarm" "app_throughput_low" {
  alarm_name          = "${var.project_name}-${var.environment}-app-throughput-low"
  comparison_operator = "LessThanThreshold"
  evaluation_periods  = 5
  metric_name         = "api_requests"
  namespace           = "ShieldAI"
  period              = 60
  statistic           = "Sum"
  threshold           = 10
  alarm_description   = "Application throughput below 10 requests per minute"
  alarm_actions       = [aws_sns_topic.alerts.arn]

  dimensions = {
    service = "api"
  }
}
output "dashboard_url" { output "dashboard_url" {
description = "CloudWatch dashboard URL" description = "CloudWatch dashboard URL"
value = "https://us-east-1.console.aws.amazon.com/cloudwatch/home#dashboards/dashboard/${var.project_name}-${var.environment}-dashboard" value = "https://us-east-1.console.aws.amazon.com/cloudwatch/home#dashboards/dashboard/${var.project_name}-${var.environment}-dashboard"
} }
output "sns_topic_arn" {
description = "SNS topic ARN for alerts"
value = aws_sns_topic.alerts.arn
}

View File

@@ -96,6 +96,50 @@ resource "aws_ecs_task_definition" "services" {
{ {
name = "PORT" name = "PORT"
value = tostring(each.port) value = tostring(each.port)
},
{
name = "DD_ENV"
value = var.environment
},
{
name = "DD_SERVICE"
value = "${var.cluster_name}-${each.key}"
},
{
name = "DD_VERSION"
value = var.container_images[each.key]
},
{
name = "DD_TRACE_ENABLED"
value = "true"
},
{
name = "DD_LOGS_INJECTION"
value = "true"
},
{
name = "DD_AGENT_HOST"
value = "localhost"
},
{
name = "DD_AGENT_PORT"
value = "8126"
},
{
name = "SENTRY_ENVIRONMENT"
value = var.environment
},
{
name = "SENTRY_RELEASE"
value = var.container_images[each.key]
},
{
name = "AWS_REGION"
value = "us-east-1"
},
{
name = "DD_SITE"
value = "datadoghq.com"
} }
] ]
@@ -115,6 +159,14 @@ resource "aws_ecs_task_definition" "services" {
{ {
name = "RESEND_API_KEY" name = "RESEND_API_KEY"
valueFrom = "${var.secrets_arn}:RESEND_API_KEY::" valueFrom = "${var.secrets_arn}:RESEND_API_KEY::"
},
{
name = "SENTRY_DSN"
valueFrom = "${var.secrets_arn}:SENTRY_DSN::"
},
{
name = "DD_API_KEY"
valueFrom = "${var.secrets_arn}:DD_API_KEY::"
} }
] ]

View File

@@ -17,13 +17,17 @@
}, },
"devDependencies": { "devDependencies": {
"@types/node": "^25.6.0", "@types/node": "^25.6.0",
"vitest": "^4.1.5", "@types/ws": "^8.5.10",
"@vitest/coverage-v8": "^4.1.5", "@vitest/coverage-v8": "^4.1.5",
"turbo": "^2.3.0", "turbo": "^2.3.0",
"typescript": "^5.7.0" "typescript": "^5.7.0",
"vitest": "^4.1.5"
}, },
"engines": { "engines": {
"node": ">=20.0.0" "node": ">=20.0.0"
}, },
"packageManager": "pnpm@9.0.0" "packageManager": "pnpm@9.0.0",
"dependencies": {
"ws": "^8.16.0"
}
} }

View File

@@ -1,4 +1,5 @@
import { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify'; import { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify';
import { captureSentryError, setSentryContext, setSentryUser } from '@shieldai/monitoring';
export interface ErrorResponse { export interface ErrorResponse {
error: string; error: string;
@@ -13,19 +14,37 @@ export interface ErrorResponse {
export async function errorHandlingMiddleware(fastify: FastifyInstance) { export async function errorHandlingMiddleware(fastify: FastifyInstance) {
// Custom error handler // Custom error handler
fastify.setErrorHandler((error, request: FastifyRequest, reply: FastifyReply) => { fastify.setErrorHandler((error, request: FastifyRequest, reply: FastifyReply) => {
const err = error as Error & { statusCode?: number; code?: string };
const response: ErrorResponse = { const response: ErrorResponse = {
error: error.name || 'Internal Server Error', error: err.name || 'Internal Server Error',
message: error.message || 'An unexpected error occurred', message: err.message || 'An unexpected error occurred',
statusCode: error.statusCode || 500, statusCode: err.statusCode || 500,
code: (error as any).code, code: err.code,
timestamp: new Date().toISOString(), timestamp: new Date().toISOString(),
path: request.url, path: request.url,
}; };
// Send to Sentry (5xx errors only)
if (response.statusCode >= 500) {
const userId = (request as FastifyRequest & { user?: { id?: string } }).user?.id;
if (userId) setSentryUser(userId);
setSentryContext('request', {
method: request.method,
url: request.url,
userAgent: request.headers['user-agent'],
requestId: request.id,
});
captureSentryError(err, {
statusCode: String(response.statusCode),
path: request.url,
method: request.method,
});
}
// Log error // Log error
fastify.log.error({ fastify.log.error({
error: response, error: response,
stack: error.stack, stack: err.stack,
method: request.method, method: request.method,
userAgent: request.headers['user-agent'], userAgent: request.headers['user-agent'],
}); });

View File

@@ -0,0 +1,46 @@
import { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify';
import { emitLatency, emitRequestCount, emitError } from '@shieldai/monitoring';
const SERVICE_NAME = process.env.DD_SERVICE || 'shieldai-api';
/**
 * Fastify plugin that records CloudWatch metrics (request count, latency
 * samples, 5xx errors) for every completed response, and logs slow or
 * failing requests.
 */
export async function monitoringMiddleware(fastify: FastifyInstance) {
  fastify.addHook('onResponse', async (request: FastifyRequest, reply: FastifyReply) => {
    const statusCode = reply.statusCode;
    const responseTime = reply.elapsedTime;
    const method = request.method;
    const url = request.url;

    // Bug fix: each emit* call performs a CloudWatch PutMetricData network
    // round-trip. The previous version awaited four of them serially inside
    // the onResponse hook, adding that network time to every request's
    // lifecycle. Dispatch them fire-and-forget instead; the emitters handle
    // their own failures internally.
    const emits: Promise<void>[] = [emitRequestCount(SERVICE_NAME, statusCode)];

    // NOTE(review): the same raw latency value is emitted under the
    // p50/p95/p99 dimensions — these are per-request samples, not computed
    // percentiles. Kept as-is because dashboards/alarms key on these
    // dimension values; confirm the aggregation strategy.
    emits.push(
      emitLatency(SERVICE_NAME, responseTime, 'p50'),
      emitLatency(SERVICE_NAME, responseTime, 'p95'),
      emitLatency(SERVICE_NAME, responseTime, 'p99'),
    );

    if (statusCode >= 500) {
      emits.push(emitError(SERVICE_NAME, 'server_error'));
      fastify.log.warn({
        event: 'high_latency_or_error',
        method,
        url,
        statusCode,
        responseTime,
        service: SERVICE_NAME,
      });
    }

    // Swallow any residual rejections without blocking the response path.
    void Promise.allSettled(emits);

    // Log requests slower than 2s so they can be correlated with traces.
    if (responseTime > 2000) {
      fastify.log.warn({
        event: 'high_latency',
        method,
        url,
        statusCode,
        responseTime,
        service: SERVICE_NAME,
      });
    }
  });
}

View File

@@ -4,15 +4,19 @@ import helmet from "@fastify/helmet";
import sensible from "@fastify/sensible"; import sensible from "@fastify/sensible";
import { extractOrGenerateRequestId } from "@shieldai/types"; import { extractOrGenerateRequestId } from "@shieldai/types";
import { authMiddleware } from "./middleware/auth.middleware"; import { authMiddleware } from "./middleware/auth.middleware";
import { errorHandlingMiddleware } from "./middleware/error-handling.middleware";
import { loggingMiddleware } from "./middleware/logging.middleware";
import { monitoringMiddleware } from "./middleware/monitoring.middleware";
import { darkwatchRoutes } from "./routes/darkwatch.routes"; import { darkwatchRoutes } from "./routes/darkwatch.routes";
import { voiceprintRoutes } from "./routes/voiceprint.routes"; import { voiceprintRoutes } from "./routes/voiceprint.routes";
import { correlationRoutes } from "./routes/correlation.routes"; import { correlationRoutes } from "./routes/correlation.routes";
import { extensionRoutes } from "./routes/extension.routes"; import { extensionRoutes } from "./routes/extension.routes";
import { initDatadog, initSentry, captureSentryError } from "@shieldai/monitoring"; import { initDatadog, initSentry, initDatadogLogs, captureSentryError } from "@shieldai/monitoring";
import { getCorsOrigins } from "./config/api.config"; import { getCorsOrigins } from "./config/api.config";
initDatadog(); initDatadog();
initSentry(); initSentry();
initDatadogLogs();
const app = Fastify({ const app = Fastify({
logger: { logger: {
@@ -29,6 +33,15 @@ async function bootstrap() {
// Register auth middleware to populate request.user // Register auth middleware to populate request.user
await app.register(authMiddleware); await app.register(authMiddleware);
// Register logging middleware (request/response logging)
await app.register(loggingMiddleware);
// Register monitoring middleware (CloudWatch metrics)
await app.register(monitoringMiddleware);
// Register error handling middleware (Sentry integration)
await app.register(errorHandlingMiddleware);
app.addHook("onRequest", async (request, _reply) => { app.addHook("onRequest", async (request, _reply) => {
const requestId = extractOrGenerateRequestId(request.headers); const requestId = extractOrGenerateRequestId(request.headers);
request.id = requestId; request.id = requestId;

View File

@@ -0,0 +1,23 @@
{
"name": "@shieldai/monitoring",
"version": "0.1.0",
"main": "./dist/index.js",
"types": "./dist/index.d.ts",
"scripts": {
"build": "tsc",
"lint": "eslint src/"
},
"dependencies": {
"@aws-sdk/client-cloudwatch": "^3.500.0",
"dd-trace": "^5.0.0",
"@sentry/node": "^8.0.0",
"zod": "^3.23.0"
},
"devDependencies": {
"@types/node": "^25.6.0",
"typescript": "^5.7.0"
},
"exports": {
".": "./src/index.ts"
}
}

View File

@@ -0,0 +1,97 @@
import { CloudWatchClient, PutMetricDataCommand, StandardUnit } from '@aws-sdk/client-cloudwatch';
import { getMonitoringConfig } from './config';
// Singleton CloudWatch client, created lazily on first metric emission.
let client: CloudWatchClient | null = null;

/**
 * Returns the shared CloudWatchClient, constructing it on first use with
 * the region from AWS_REGION (default us-east-1). Returns null when
 * construction fails so metric emission degrades to a silent no-op.
 */
function getClient(): CloudWatchClient | null {
  if (client) return client;
  // NOTE(review): `config` is computed but never used in this function —
  // dead call, candidate for removal (removing it would also leave the
  // './config' import unused; clean up together).
  const config = getMonitoringConfig();
  const region = process.env.AWS_REGION || 'us-east-1';
  try {
    client = new CloudWatchClient({ region });
    return client;
  } catch {
    console.warn('[CloudWatch] Metrics client initialization skipped');
    return null;
  }
}
/**
 * Shape of a single CloudWatch metric datapoint.
 * NOTE(review): exported but not referenced anywhere in this module —
 * presumably intended for package consumers; confirm it is actually used.
 */
export interface MetricDataPoint {
  MetricName: string;
  Dimensions?: { Name: string; Value: string }[];
  Value: number;
  Unit?: string;
  Timestamp?: Date;
}

// All application metrics publish under this shared namespace; the
// Terraform dashboards and alarms query the same "ShieldAI" namespace.
const NAMESPACE = 'ShieldAI';
/**
 * Publishes a single datapoint to the shared ShieldAI CloudWatch namespace.
 * Every metric is tagged with a `service` dimension plus any caller-supplied
 * dimensions. Failures are logged and swallowed so metric emission can never
 * break the request path.
 *
 * @param serviceName value for the mandatory `service` dimension
 * @param metricName  CloudWatch metric name (e.g. "api_latency")
 * @param value       datapoint value
 * @param unit        CloudWatch unit, defaults to Count
 * @param dimensions  optional extra dimension name/value pairs
 */
export async function emitMetric(
  serviceName: string,
  metricName: string,
  value: number,
  unit: StandardUnit = 'Count',
  dimensions?: Record<string, string>
) {
  const cloudwatch = getClient();
  if (!cloudwatch) return;

  // Mandatory service dimension first, then any caller-provided ones.
  const metricDimensions: { Name: string; Value: string }[] = [
    { Name: 'service', Value: serviceName },
  ];
  if (dimensions) {
    for (const [name, dimensionValue] of Object.entries(dimensions)) {
      metricDimensions.push({ Name: name, Value: dimensionValue });
    }
  }

  try {
    await cloudwatch.send(
      new PutMetricDataCommand({
        Namespace: NAMESPACE,
        MetricData: [
          {
            MetricName: metricName,
            Dimensions: metricDimensions,
            Value: value,
            Unit: unit,
          },
        ],
      })
    );
  } catch (err) {
    console.warn('[CloudWatch] Metric emit failed:', (err as Error).message);
  }
}
/**
 * Records one latency sample (in milliseconds) under the api_latency metric,
 * tagged with the given percentile dimension.
 */
export async function emitLatency(
  serviceName: string,
  latencyMs: number,
  percentile: 'p50' | 'p95' | 'p99'
) {
  return emitMetric(serviceName, 'api_latency', latencyMs, 'Milliseconds', { percentile });
}

/**
 * Increments the api_requests counter, tagged with the status class
 * ("2xx", "4xx", "5xx", ...) derived from the response status code.
 */
export async function emitRequestCount(serviceName: string, statusCode: number) {
  const statusClass = `${Math.floor(statusCode / 100)}xx`;
  return emitMetric(serviceName, 'api_requests', 1, 'Count', { status_class: statusClass });
}

/** Increments the api_errors counter, tagged with the given error type. */
export async function emitError(serviceName: string, errorType: string) {
  return emitMetric(serviceName, 'api_errors', 1, 'Count', { error_type: errorType });
}

View File

@@ -0,0 +1,35 @@
import { z } from 'zod';

// Schema for all monitoring-related environment variables. Values arrive as
// strings from the environment; numeric settings are coerced via transform()
// and every key falls back to a sensible default when unset.
const schema = z.object({
  DD_SERVICE: z.string().default('shieldai-api'),
  DD_ENV: z.string().default(process.env.NODE_ENV || 'development'),
  DD_VERSION: z.string().default('0.1.0'),
  DD_TRACE_ENABLED: z.string().default('true'),
  DD_TRACE_SAMPLE_RATE: z.string().transform((raw) => Number(raw)).default('1.0'),
  DD_LOGS_INJECTION: z.string().default('true'),
  DD_AGENT_HOST: z.string().default('localhost'),
  DD_AGENT_PORT: z.string().transform((raw) => Number(raw)).default('8126'),
  SENTRY_DSN: z.string().default(''),
  SENTRY_ENVIRONMENT: z.string().default(process.env.NODE_ENV || 'development'),
  SENTRY_RELEASE: z.string().default('0.1.0'),
  SENTRY_TRACES_SAMPLE_RATE: z.string().transform((raw) => Number(raw)).default('0.1'),
});

export type MonitoringConfig = z.infer<typeof schema>;

/**
 * Parses the monitoring configuration from process.env. Zod strips keys not
 * declared in the schema, so passing the whole environment is equivalent to
 * picking each variable individually.
 */
export function getMonitoringConfig(): MonitoringConfig {
  return schema.parse(process.env);
}

View File

@@ -0,0 +1,49 @@
import { getMonitoringConfig } from './config';
// Module-level forwarder; stays null (forwarding disabled) until
// initDatadogLogs() succeeds.
let logForwarder: { send: (log: string, service: string) => Promise<void> } | null = null;

/**
 * Initializes the Datadog HTTP log-intake forwarder.
 * No-ops — leaving forwarding disabled — when DD_API_KEY is not set.
 */
export function initDatadogLogs() {
  const config = getMonitoringConfig();

  if (!process.env.DD_API_KEY) {
    console.log('[Datadog Logs] API key not configured, log forwarding disabled');
    return;
  }

  const site = process.env.DD_SITE || 'datadoghq.com';
  const logIntakeUrl = `https://http-intake.logs.${site}`;

  logForwarder = {
    async send(log: string, service: string) {
      try {
        const payload = JSON.stringify({
          ddsource: 'nodejs',
          ddtags: `env:${config.DD_ENV},service:${service}`,
          // NOTE(review): hostname is populated with the service name rather
          // than a machine hostname — confirm this is intentional.
          hostname: config.DD_SERVICE,
          message: log,
          service,
        });
        const res = await fetch(`${logIntakeUrl}/api/v2/logs`, {
          method: 'POST',
          headers: {
            'DD-API-KEY': process.env.DD_API_KEY!,
            'Content-Type': 'application/json',
          },
          body: payload,
        });
        // Bug fix: a non-2xx intake response (bad API key, rejected payload)
        // previously went unnoticed because only thrown network errors were
        // caught — fetch does not throw on HTTP error statuses.
        if (!res.ok) {
          console.warn(`[Datadog Logs] Intake rejected log: HTTP ${res.status}`);
        }
      } catch (err) {
        console.warn('[Datadog Logs] Forward failed:', (err as Error).message);
      }
    },
  };
}
/**
 * Forwards one log line through the configured Datadog forwarder.
 * Silently no-ops while forwarding is disabled (forwarder not initialized).
 */
export async function forwardLog(log: string, service: string = 'shieldai-api') {
  await logForwarder?.send(log, service);
}

/** Exposes the active forwarder, or null until initDatadogLogs() succeeds. */
export function getLogForwarder() {
  return logForwarder;
}

View File

@@ -0,0 +1,49 @@
import { getMonitoringConfig } from './config';
// Guards against double-initialization of the tracer.
let initialized = false;

/**
 * Initializes Datadog APM tracing once per process.
 * No-ops when DD_TRACE_ENABLED is not "true" or when dd-trace cannot be
 * loaded; returns the tracer on success.
 */
export function initDatadog() {
  if (initialized) return;
  const config = getMonitoringConfig();

  if (config.DD_TRACE_ENABLED !== 'true') {
    console.log('[Datadog] APM tracing disabled');
    return;
  }

  try {
    // Bug fix: dd-trace's programmatic init options for the agent address
    // are `hostname` and `port`; the previous `agentHost`/`agentPort` keys
    // are not recognized options and were silently ignored (the agent
    // address then only worked via the DD_AGENT_HOST env var).
    const tracer = require('dd-trace').init({
      service: config.DD_SERVICE,
      env: config.DD_ENV,
      version: config.DD_VERSION,
      sampleRate: config.DD_TRACE_SAMPLE_RATE,
      logInjection: config.DD_LOGS_INJECTION === 'true',
      hostname: config.DD_AGENT_HOST,
      port: config.DD_AGENT_PORT,
      plugins: true,
      debug: config.DD_ENV === 'development',
    });
    initialized = true;
    console.log(`[Datadog] APM initialized for service "${config.DD_SERVICE}" in "${config.DD_ENV}"`);
    return tracer;
  } catch (err) {
    console.warn('[Datadog] APM initialization skipped:', (err as Error).message);
  }
}
/**
 * Returns the dd-trace tracer, or null when dd-trace is unavailable.
 * Bug fix: the dd-trace module's default export *is* the tracer instance;
 * the previous `require('dd-trace').tracer` property access returned
 * undefined, which made createDatadogSpan a permanent no-op.
 */
export function getDatadogTracer() {
  try {
    return require('dd-trace');
  } catch {
    return null;
  }
}

/**
 * Starts a manual span on the active tracer.
 * Returns undefined when tracing is unavailable.
 * Bug fix: the dd-trace Tracer API method is startSpan(); `startChild`
 * does not exist and would have thrown at runtime.
 */
export function createDatadogSpan(name: string, options?: Record<string, unknown>) {
  const tracer = getDatadogTracer();
  if (!tracer) return;
  return tracer.startSpan(name, options);
}

View File

@@ -0,0 +1,5 @@
// Public entry point for @shieldai/monitoring: re-exports Datadog APM init
// and span helpers, Sentry error-tracking helpers, environment config
// parsing, CloudWatch metric emitters, and Datadog log forwarding.
export * from './datadog';
export * from './sentry';
export * from './config';
export * from './cloudwatch';
export * from './datadog-logs';

View File

@@ -0,0 +1,90 @@
import { getMonitoringConfig } from './config';
// Guards against double-initialization of the Sentry SDK.
let initialized = false;

/**
 * Initializes Sentry error tracking once per process.
 * No-ops when SENTRY_DSN is empty or when @sentry/node cannot be loaded.
 */
export function initSentry() {
  if (initialized) return;
  const config = getMonitoringConfig();

  if (!config.SENTRY_DSN) {
    console.log('[Sentry] DSN not configured, error tracking disabled');
    return;
  }

  try {
    const Sentry = require('@sentry/node');
    Sentry.init({
      dsn: config.SENTRY_DSN,
      environment: config.SENTRY_ENVIRONMENT,
      release: config.SENTRY_RELEASE,
      tracesSampleRate: config.SENTRY_TRACES_SAMPLE_RATE,
      attachStacktrace: true,
      debug: config.SENTRY_ENVIRONMENT === 'development',
      // Strip query strings and fragments from request URLs before events
      // leave the process, so tokens/PII in query params never reach Sentry.
      beforeSend(event: any) {
        const req = (event as any).request;
        if (req?.url) {
          try {
            const url = new URL(req.url);
            req.url = url.origin + url.pathname;
          } catch {
            // fallback: keep original URL
          }
        }
        return event;
      },
    });
    initialized = true;
    console.log(`[Sentry] Error tracking initialized for "${config.SENTRY_ENVIRONMENT}"`);
  } catch (err) {
    console.warn('[Sentry] Initialization skipped:', (err as Error).message);
  }
}
/**
 * Reports an error to Sentry, accepting either an Error or a message string
 * (strings are wrapped in a new Error). Optional context values are attached
 * as tags. Falls back to a console warning when @sentry/node is unavailable.
 */
export function captureSentryError(error: Error | string, context?: Record<string, unknown>) {
  try {
    const Sentry = require('@sentry/node');
    const normalized = error instanceof Error ? error : new Error(error);
    Sentry.captureException(normalized, { tags: context as Record<string, string> | undefined });
  } catch {
    console.warn('[Sentry] Error capture skipped (not initialized):', error);
  }
}

/**
 * Sends a standalone message event to Sentry at the given severity level.
 * Falls back to a console warning when @sentry/node is unavailable.
 */
export function captureSentryMessage(message: string, level: 'info' | 'warning' | 'error' = 'info') {
  try {
    require('@sentry/node').captureMessage(message, { level });
  } catch {
    console.warn('[Sentry] Message capture skipped (not initialized)');
  }
}
/**
 * Associates the given user id (plus optional metadata fields) with the
 * current Sentry scope. Silently no-ops when @sentry/node is unavailable —
 * monitoring must never break the request path.
 */
export function setSentryUser(userId: string, metadata?: Record<string, string>) {
  try {
    require('@sentry/node').setUser({ id: userId, ...metadata });
  } catch {
    // silently ignore
  }
}

/**
 * Attaches a named structured-context payload to the current Sentry scope.
 * Silently no-ops when @sentry/node is unavailable.
 */
export function setSentryContext(name: string, data: Record<string, unknown>) {
  try {
    require('@sentry/node').setContext(name, data);
  } catch {
    // silently ignore
  }
}
/**
 * Returns the current Sentry hub, or null when @sentry/node cannot be loaded.
 *
 * NOTE(review): @sentry/node v8 (the range pinned in this package's
 * package.json) deprecated/removed getCurrentHub in favor of scope APIs, and
 * a `hub` export is not documented — on v8 this likely returns undefined.
 * Confirm against the installed SDK version before relying on it.
 */
export function getSentryHub() {
  try {
    const Sentry = require('@sentry/node');
    return Sentry.getCurrentHub?.() || Sentry.hub;
  } catch {
    return null;
  }
}

View File

@@ -0,0 +1,9 @@
{
"extends": "../../tsconfig.base.json",
"compilerOptions": {
"outDir": "./dist",
"rootDir": "./src",
"composite": true
},
"include": ["src"]
}

File diff suppressed because one or more lines are too long

4458
pnpm-lock.yaml generated

File diff suppressed because it is too large Load Diff