feat: integrate Datadog APM + Sentry error tracking with CloudWatch metrics FRE-4806
- Add CloudWatch metrics emitter (api_latency, api_requests, api_errors) - Add request monitoring middleware for API (latency, error rate, throughput) - Register error-handling, logging, and monitoring middleware in server.ts - Add Datadog log forwarding via HTTP intake API - Add application-level CloudWatch alarms for P99 latency, error rate, throughput - Inject Datadog/Sentry env vars and secrets into ECS task definitions - Add DD_API_KEY and SENTRY_DSN to ECS secrets - Create CloudWatch log groups for datadog and sentry services - Update .env.example with AWS_REGION and monitoring variables - Add @aws-sdk/client-cloudwatch dependency to monitoring package Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
19
.env.example
19
.env.example
@@ -4,3 +4,22 @@ PORT=3000
|
|||||||
LOG_LEVEL=info
|
LOG_LEVEL=info
|
||||||
HIBP_API_KEY=""
|
HIBP_API_KEY=""
|
||||||
RESEND_API_KEY=""
|
RESEND_API_KEY=""
|
||||||
|
AWS_REGION="us-east-1"
|
||||||
|
|
||||||
|
# Datadog APM Configuration
|
||||||
|
DD_SERVICE="shieldai-api"
|
||||||
|
DD_ENV="development"
|
||||||
|
DD_VERSION="0.1.0"
|
||||||
|
DD_TRACE_ENABLED="true"
|
||||||
|
DD_TRACE_SAMPLE_RATE="1.0"
|
||||||
|
DD_LOGS_INJECTION="true"
|
||||||
|
DD_AGENT_HOST="localhost"
|
||||||
|
DD_AGENT_PORT="8126"
|
||||||
|
DD_API_KEY=""
|
||||||
|
DD_SITE="datadoghq.com"
|
||||||
|
|
||||||
|
# Sentry Error Tracking
|
||||||
|
SENTRY_DSN=""
|
||||||
|
SENTRY_ENVIRONMENT="development"
|
||||||
|
SENTRY_RELEASE="0.1.0"
|
||||||
|
SENTRY_TRACES_SAMPLE_RATE="0.1"
|
||||||
|
|||||||
@@ -1,5 +1,17 @@
|
|||||||
version: '3.9'
|
version: '3.9'
|
||||||
|
|
||||||
|
x-monitoring: &monitoring
|
||||||
|
DD_ENV: ${DD_ENV:-production}
|
||||||
|
DD_SERVICE: ${DD_SERVICE:-shieldai}
|
||||||
|
DD_VERSION: ${DOCKER_TAG:-latest}
|
||||||
|
DD_TRACE_ENABLED: ${DD_TRACE_ENABLED:-true}
|
||||||
|
DD_AGENT_HOST: datadog-agent
|
||||||
|
DD_AGENT_PORT: "8126"
|
||||||
|
DD_LOGS_INJECTION: "true"
|
||||||
|
SENTRY_DSN: ${SENTRY_DSN:-}
|
||||||
|
SENTRY_ENVIRONMENT: ${DD_ENV:-production}
|
||||||
|
SENTRY_RELEASE: ${DOCKER_TAG:-latest}
|
||||||
|
|
||||||
services:
|
services:
|
||||||
api:
|
api:
|
||||||
image: ghcr.io/${GITHUB_REPOSITORY_OWNER}/shieldai-api:${DOCKER_TAG:-latest}
|
image: ghcr.io/${GITHUB_REPOSITORY_OWNER}/shieldai-api:${DOCKER_TAG:-latest}
|
||||||
@@ -7,12 +19,13 @@ services:
|
|||||||
ports:
|
ports:
|
||||||
- "${PORT:-3000}:3000"
|
- "${PORT:-3000}:3000"
|
||||||
environment:
|
environment:
|
||||||
- DATABASE_URL=postgresql://shieldai:${POSTGRES_PASSWORD}@postgres:5432/shieldai
|
DATABASE_URL: "postgresql://shieldai:${POSTGRES_PASSWORD}@postgres:5432/shieldai"
|
||||||
- REDIS_URL=redis://redis:6379
|
REDIS_URL: "redis://redis:6379"
|
||||||
- PORT=3000
|
PORT: "3000"
|
||||||
- LOG_LEVEL=info
|
LOG_LEVEL: info
|
||||||
- HIBP_API_KEY=${HIBP_API_KEY}
|
HIBP_API_KEY: ${HIBP_API_KEY}
|
||||||
- RESEND_API_KEY=${RESEND_API_KEY}
|
RESEND_API_KEY: ${RESEND_API_KEY}
|
||||||
|
<<: *monitoring
|
||||||
depends_on:
|
depends_on:
|
||||||
postgres:
|
postgres:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
@@ -25,9 +38,11 @@ services:
|
|||||||
image: ghcr.io/${GITHUB_REPOSITORY_OWNER}/shieldai-darkwatch:${DOCKER_TAG:-latest}
|
image: ghcr.io/${GITHUB_REPOSITORY_OWNER}/shieldai-darkwatch:${DOCKER_TAG:-latest}
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
environment:
|
environment:
|
||||||
- DATABASE_URL=postgresql://shieldai:${POSTGRES_PASSWORD}@postgres:5432/shieldai
|
DATABASE_URL: "postgresql://shieldai:${POSTGRES_PASSWORD}@postgres:5432/shieldai"
|
||||||
- REDIS_URL=redis://redis:6379
|
REDIS_URL: "redis://redis:6379"
|
||||||
- HIBP_API_KEY=${HIBP_API_KEY}
|
HIBP_API_KEY: ${HIBP_API_KEY}
|
||||||
|
DD_SERVICE: "shieldai-darkwatch"
|
||||||
|
<<: *monitoring
|
||||||
depends_on:
|
depends_on:
|
||||||
postgres:
|
postgres:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
@@ -40,8 +55,10 @@ services:
|
|||||||
image: ghcr.io/${GITHUB_REPOSITORY_OWNER}/shieldai-spamshield:${DOCKER_TAG:-latest}
|
image: ghcr.io/${GITHUB_REPOSITORY_OWNER}/shieldai-spamshield:${DOCKER_TAG:-latest}
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
environment:
|
environment:
|
||||||
- DATABASE_URL=postgresql://shieldai:${POSTGRES_PASSWORD}@postgres:5432/shieldai
|
DATABASE_URL: "postgresql://shieldai:${POSTGRES_PASSWORD}@postgres:5432/shieldai"
|
||||||
- REDIS_URL=redis://redis:6379
|
REDIS_URL: "redis://redis:6379"
|
||||||
|
DD_SERVICE: "shieldai-spamshield"
|
||||||
|
<<: *monitoring
|
||||||
depends_on:
|
depends_on:
|
||||||
postgres:
|
postgres:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
@@ -54,8 +71,10 @@ services:
|
|||||||
image: ghcr.io/${GITHUB_REPOSITORY_OWNER}/shieldai-voiceprint:${DOCKER_TAG:-latest}
|
image: ghcr.io/${GITHUB_REPOSITORY_OWNER}/shieldai-voiceprint:${DOCKER_TAG:-latest}
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
environment:
|
environment:
|
||||||
- DATABASE_URL=postgresql://shieldai:${POSTGRES_PASSWORD}@postgres:5432/shieldai
|
DATABASE_URL: "postgresql://shieldai:${POSTGRES_PASSWORD}@postgres:5432/shieldai"
|
||||||
- REDIS_URL=redis://redis:6379
|
REDIS_URL: "redis://redis:6379"
|
||||||
|
DD_SERVICE: "shieldai-voiceprint"
|
||||||
|
<<: *monitoring
|
||||||
depends_on:
|
depends_on:
|
||||||
postgres:
|
postgres:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
@@ -64,6 +83,29 @@ services:
|
|||||||
networks:
|
networks:
|
||||||
- shieldai
|
- shieldai
|
||||||
|
|
||||||
|
datadog-agent:
|
||||||
|
image: datadog/agent:7
|
||||||
|
restart: unless-stopped
|
||||||
|
environment:
|
||||||
|
DD_API_KEY: ${DD_API_KEY}
|
||||||
|
DD_SITE: ${DD_SITE:-datadoghq.com}
|
||||||
|
DD_ENV: ${DD_ENV:-production}
|
||||||
|
DD_DOGSTATSD_NON_LOCAL_TRAFFIC: "true"
|
||||||
|
DD_APM_ENABLED: "true"
|
||||||
|
DD_APM_NON_LOCAL_TRAFFIC: "true"
|
||||||
|
DD_LOGS_ENABLED: "true"
|
||||||
|
DD_LOGS_CONFIG_CONTAINER_COLLECT_ALL: "true"
|
||||||
|
DD_HEALTH_PORT_ENABLE: "true"
|
||||||
|
ports:
|
||||||
|
- "8125:8125/udp"
|
||||||
|
- "8126:8126"
|
||||||
|
volumes:
|
||||||
|
- /var/run/docker.sock:/var/run/docker.sock:ro
|
||||||
|
- /proc/:/host/proc/:ro
|
||||||
|
- /sys/fs/cgroup:/host/sys/fs/cgroup:ro
|
||||||
|
networks:
|
||||||
|
- shieldai
|
||||||
|
|
||||||
postgres:
|
postgres:
|
||||||
image: postgres:16-alpine
|
image: postgres:16-alpine
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|||||||
@@ -23,6 +23,27 @@ variable "cache_endpoint" {
|
|||||||
type = string
|
type = string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "alert_email" {
|
||||||
|
description = "Email address for alert notifications"
|
||||||
|
type = string
|
||||||
|
default = "ops@shieldai.com"
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "aws_sns_topic" "alerts" {
|
||||||
|
name = "${var.project_name}-${var.environment}-alerts"
|
||||||
|
|
||||||
|
tags = {
|
||||||
|
Environment = var.environment
|
||||||
|
Project = var.project_name
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "aws_sns_topic_subscription" "alerts_email" {
|
||||||
|
topic_arn = aws_sns_topic.alerts.arn
|
||||||
|
protocol = "email"
|
||||||
|
endpoint = var.alert_email
|
||||||
|
}
|
||||||
|
|
||||||
resource "aws_cloudwatch_dashboard" "main" {
|
resource "aws_cloudwatch_dashboard" "main" {
|
||||||
dashboard_name = "${var.project_name}-${var.environment}-dashboard"
|
dashboard_name = "${var.project_name}-${var.environment}-dashboard"
|
||||||
|
|
||||||
@@ -92,6 +113,120 @@ resource "aws_cloudwatch_dashboard" "main" {
|
|||||||
region = "us-east-1"
|
region = "us-east-1"
|
||||||
period = 60
|
period = 60
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
type = "metric"
|
||||||
|
properties = {
|
||||||
|
title = "P99 Latency (Target Group)"
|
||||||
|
metrics = [
|
||||||
|
["AWS/ApplicationELB", "TargetResponseTime", "LoadBalancer", "${var.cluster_name}-alb", "Statistic", "p99"],
|
||||||
|
["AWS/ApplicationELB", "TargetResponseTime", "LoadBalancer", "${var.cluster_name}-alb", "Statistic", "p95"]
|
||||||
|
]
|
||||||
|
view = "timeSeries"
|
||||||
|
stacked = false
|
||||||
|
region = "us-east-1"
|
||||||
|
period = 60
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
type = "metric"
|
||||||
|
properties = {
|
||||||
|
title = "Error Rate (5xx / Total)"
|
||||||
|
metrics = [
|
||||||
|
["AWS/ApplicationELB", "HTTPCode_Elb_5XX_Count", "LoadBalancer", "${var.cluster_name}-alb"],
|
||||||
|
["AWS/ApplicationELB", "HTTPCode_Elb_4XX_Count", "LoadBalancer", "${var.cluster_name}-alb"]
|
||||||
|
]
|
||||||
|
view = "timeSeries"
|
||||||
|
stacked = false
|
||||||
|
region = "us-east-1"
|
||||||
|
period = 60
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
type = "metric"
|
||||||
|
properties = {
|
||||||
|
title = "Throughput (Request Count)"
|
||||||
|
metrics = [
|
||||||
|
["AWS/ApplicationELB", "RequestCount", "LoadBalancer", "${var.cluster_name}-alb"]
|
||||||
|
]
|
||||||
|
view = "timeSeries"
|
||||||
|
stacked = false
|
||||||
|
region = "us-east-1"
|
||||||
|
period = 60
|
||||||
|
yAxis = {
|
||||||
|
left = {
|
||||||
|
label = "Requests/sec"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
type = "metric"
|
||||||
|
properties = {
|
||||||
|
title = "API Latency Percentiles"
|
||||||
|
metrics = [
|
||||||
|
["ShieldAI", "api_latency", "service", "api", "percentile", "p99", "statistic", "Average"],
|
||||||
|
["ShieldAI", "api_latency", "service", "api", "percentile", "p95", "statistic", "Average"],
|
||||||
|
["ShieldAI", "api_latency", "service", "api", "percentile", "p50", "statistic", "Average"]
|
||||||
|
]
|
||||||
|
view = "timeSeries"
|
||||||
|
stacked = false
|
||||||
|
region = "us-east-1"
|
||||||
|
period = 60
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
type = "metric"
|
||||||
|
properties = {
|
||||||
|
title = "API Error Rate"
|
||||||
|
metrics = [
|
||||||
|
["ShieldAI", "api_errors", "service", "api", "statistic", "Sum"]
|
||||||
|
]
|
||||||
|
view = "timeSeries"
|
||||||
|
stacked = false
|
||||||
|
region = "us-east-1"
|
||||||
|
period = 60
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
type = "metric"
|
||||||
|
properties = {
|
||||||
|
title = "API Throughput"
|
||||||
|
metrics = [
|
||||||
|
["ShieldAI", "api_requests", "service", "api", "statistic", "Sum"]
|
||||||
|
]
|
||||||
|
view = "timeSeries"
|
||||||
|
stacked = false
|
||||||
|
region = "us-east-1"
|
||||||
|
period = 60
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
type = "metric"
|
||||||
|
properties = {
|
||||||
|
title = "ECS Running Tasks"
|
||||||
|
metrics = [
|
||||||
|
["AWS/ECS", "RunningTaskCount", "ClusterName", var.cluster_name]
|
||||||
|
]
|
||||||
|
view = "timeSeries"
|
||||||
|
stacked = false
|
||||||
|
region = "us-east-1"
|
||||||
|
period = 60
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
type = "metric"
|
||||||
|
properties = {
|
||||||
|
title = "RDS Read/Write IOPS"
|
||||||
|
metrics = [
|
||||||
|
["AWS/RDS", "ReadIOPS", "DBInstanceIdentifier", var.rds_identifier],
|
||||||
|
["AWS/RDS", "WriteIOPS", "DBInstanceIdentifier", var.rds_identifier]
|
||||||
|
]
|
||||||
|
view = "timeSeries"
|
||||||
|
stacked = false
|
||||||
|
region = "us-east-1"
|
||||||
|
period = 60
|
||||||
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
})
|
})
|
||||||
@@ -107,6 +242,7 @@ resource "aws_cloudwatch_metric_alarm" "ecs_cpu_high" {
|
|||||||
statistic = "Average"
|
statistic = "Average"
|
||||||
threshold = 80
|
threshold = 80
|
||||||
alarm_description = "ECS CPU utilization above 80%"
|
alarm_description = "ECS CPU utilization above 80%"
|
||||||
|
alarm_actions = [aws_sns_topic.alerts.arn]
|
||||||
|
|
||||||
dimensions = {
|
dimensions = {
|
||||||
ClusterName = var.cluster_name
|
ClusterName = var.cluster_name
|
||||||
@@ -123,6 +259,7 @@ resource "aws_cloudwatch_metric_alarm" "ecs_memory_high" {
|
|||||||
statistic = "Average"
|
statistic = "Average"
|
||||||
threshold = 85
|
threshold = 85
|
||||||
alarm_description = "ECS memory utilization above 85%"
|
alarm_description = "ECS memory utilization above 85%"
|
||||||
|
alarm_actions = [aws_sns_topic.alerts.arn]
|
||||||
|
|
||||||
dimensions = {
|
dimensions = {
|
||||||
ClusterName = var.cluster_name
|
ClusterName = var.cluster_name
|
||||||
@@ -139,6 +276,7 @@ resource "aws_cloudwatch_metric_alarm" "alb_5xx" {
|
|||||||
statistic = "Sum"
|
statistic = "Sum"
|
||||||
threshold = 10
|
threshold = 10
|
||||||
alarm_description = "ALB 5xx errors above 10 per minute"
|
alarm_description = "ALB 5xx errors above 10 per minute"
|
||||||
|
alarm_actions = [aws_sns_topic.alerts.arn]
|
||||||
|
|
||||||
dimensions = {
|
dimensions = {
|
||||||
LoadBalancer = "${var.cluster_name}-alb"
|
LoadBalancer = "${var.cluster_name}-alb"
|
||||||
@@ -155,6 +293,7 @@ resource "aws_cloudwatch_metric_alarm" "rds_cpu_high" {
|
|||||||
statistic = "Average"
|
statistic = "Average"
|
||||||
threshold = 75
|
threshold = 75
|
||||||
alarm_description = "RDS CPU utilization above 75%"
|
alarm_description = "RDS CPU utilization above 75%"
|
||||||
|
alarm_actions = [aws_sns_topic.alerts.arn]
|
||||||
|
|
||||||
dimensions = {
|
dimensions = {
|
||||||
DBInstanceIdentifier = var.rds_identifier
|
DBInstanceIdentifier = var.rds_identifier
|
||||||
@@ -171,13 +310,155 @@ resource "aws_cloudwatch_metric_alarm" "rds_free_storage" {
|
|||||||
statistic = "Average"
|
statistic = "Average"
|
||||||
threshold = 524288000
|
threshold = 524288000
|
||||||
alarm_description = "RDS free storage below 500MB"
|
alarm_description = "RDS free storage below 500MB"
|
||||||
|
alarm_actions = [aws_sns_topic.alerts.arn]
|
||||||
|
|
||||||
dimensions = {
|
dimensions = {
|
||||||
DBInstanceIdentifier = var.rds_identifier
|
DBInstanceIdentifier = var.rds_identifier
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Alerts when the load balancer's p99 target response time stays above
# 2 seconds for three consecutive one-minute periods.
resource "aws_cloudwatch_metric_alarm" "p99_latency_high" {
  alarm_name          = "${var.project_name}-${var.environment}-p99-latency-high"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 3
  metric_name         = "TargetResponseTime"
  namespace           = "AWS/ApplicationELB"
  period              = 60

  # Percentile statistics must be requested via `extended_statistic`;
  # `statistic` only accepts SampleCount, Average, Sum, Minimum and Maximum.
  extended_statistic = "p99"

  threshold         = 2
  alarm_description = "P99 latency above 2 seconds"
  alarm_actions     = [aws_sns_topic.alerts.arn]

  dimensions = {
    # NOTE(review): CloudWatch usually identifies an ALB by its ARN suffix
    # (app/<name>/<id>); confirm "<cluster_name>-alb" matches the published
    # LoadBalancer dimension value.
    LoadBalancer = "${var.cluster_name}-alb"
  }
}
|
||||||
|
|
||||||
|
# Alerts when the load balancer itself returns more than 5 HTTP 5xx responses
# per minute for three consecutive one-minute periods.
resource "aws_cloudwatch_metric_alarm" "error_rate_high" {
  alarm_name          = "${var.project_name}-${var.environment}-error-rate-high"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 3

  # CloudWatch metric names are case-sensitive; the ALB publishes this metric
  # as "HTTPCode_ELB_5XX_Count".
  metric_name = "HTTPCode_ELB_5XX_Count"
  namespace   = "AWS/ApplicationELB"
  period      = 60
  statistic   = "Sum"
  threshold   = 5

  # Periods without any datapoint mean "no errors", so keep the alarm in OK
  # instead of letting it drift into INSUFFICIENT_DATA.
  treat_missing_data = "notBreaching"

  alarm_description = "Error rate above 5 errors per minute"
  alarm_actions     = [aws_sns_topic.alerts.arn]

  dimensions = {
    # NOTE(review): CloudWatch usually identifies an ALB by its ARN suffix
    # (app/<name>/<id>); confirm "<cluster_name>-alb" matches the published
    # LoadBalancer dimension value.
    LoadBalancer = "${var.cluster_name}-alb"
  }
}
|
||||||
|
|
||||||
|
# Alerts when the load balancer handles fewer than 10 requests per minute for
# five consecutive one-minute periods.
resource "aws_cloudwatch_metric_alarm" "throughput_low" {
  alarm_name          = "${var.project_name}-${var.environment}-throughput-low"
  comparison_operator = "LessThanThreshold"
  evaluation_periods  = 5
  metric_name         = "RequestCount"
  namespace           = "AWS/ApplicationELB"
  period              = 60
  statistic           = "Sum"
  threshold           = 10

  # If datapoints stop arriving altogether (for example traffic drops to
  # zero), treat the gap as breaching so the alarm still fires rather than
  # sitting in INSUFFICIENT_DATA.
  treat_missing_data = "breaching"

  alarm_description = "Throughput below 10 requests per minute"
  alarm_actions     = [aws_sns_topic.alerts.arn]

  dimensions = {
    # NOTE(review): CloudWatch usually identifies an ALB by its ARN suffix
    # (app/<name>/<id>); confirm "<cluster_name>-alb" matches the published
    # LoadBalancer dimension value.
    LoadBalancer = "${var.cluster_name}-alb"
  }
}
|
||||||
|
|
||||||
|
resource "aws_cloudwatch_log_group" "api" {
|
||||||
|
name = "/${var.project_name}/${var.environment}/api"
|
||||||
|
retention_in_days = 30
|
||||||
|
|
||||||
|
tags = {
|
||||||
|
Environment = var.environment
|
||||||
|
Project = var.project_name
|
||||||
|
Service = "api"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "aws_cloudwatch_log_group" "datadog" {
|
||||||
|
name = "/${var.project_name}/${var.environment}/datadog"
|
||||||
|
retention_in_days = 30
|
||||||
|
|
||||||
|
tags = {
|
||||||
|
Environment = var.environment
|
||||||
|
Project = var.project_name
|
||||||
|
Service = "datadog"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "aws_cloudwatch_log_group" "sentry" {
|
||||||
|
name = "/${var.project_name}/${var.environment}/sentry"
|
||||||
|
retention_in_days = 30
|
||||||
|
|
||||||
|
tags = {
|
||||||
|
Environment = var.environment
|
||||||
|
Project = var.project_name
|
||||||
|
Service = "sentry"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "aws_cloudwatch_metric_alarm" "app_p99_latency_high" {
|
||||||
|
alarm_name = "${var.project_name}-${var.environment}-app-p99-latency-high"
|
||||||
|
comparison_operator = "GreaterThanThreshold"
|
||||||
|
evaluation_periods = 3
|
||||||
|
metric_name = "api_latency"
|
||||||
|
namespace = "ShieldAI"
|
||||||
|
period = 60
|
||||||
|
statistic = "Average"
|
||||||
|
threshold = 2000
|
||||||
|
alarm_description = "Application P99 latency above 2000ms"
|
||||||
|
alarm_actions = [aws_sns_topic.alerts.arn]
|
||||||
|
|
||||||
|
dimensions = {
|
||||||
|
service = "api"
|
||||||
|
percentile = "p99"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "aws_cloudwatch_metric_alarm" "app_error_rate_high" {
|
||||||
|
alarm_name = "${var.project_name}-${var.environment}-app-error-rate-high"
|
||||||
|
comparison_operator = "GreaterThanThreshold"
|
||||||
|
evaluation_periods = 3
|
||||||
|
metric_name = "api_errors"
|
||||||
|
namespace = "ShieldAI"
|
||||||
|
period = 60
|
||||||
|
statistic = "Sum"
|
||||||
|
threshold = 10
|
||||||
|
alarm_description = "Application error count above 10 per minute"
|
||||||
|
alarm_actions = [aws_sns_topic.alerts.arn]
|
||||||
|
|
||||||
|
dimensions = {
|
||||||
|
service = "api"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Alerts when the application-emitted request counter ("api_requests" in the
# "ShieldAI" namespace) sums to fewer than 10 per minute for five consecutive
# one-minute periods.
resource "aws_cloudwatch_metric_alarm" "app_throughput_low" {
  alarm_name          = "${var.project_name}-${var.environment}-app-throughput-low"
  comparison_operator = "LessThanThreshold"
  evaluation_periods  = 5
  metric_name         = "api_requests"
  namespace           = "ShieldAI"
  period              = 60
  statistic           = "Sum"
  threshold           = 10

  # The application only publishes a datapoint when it serves a request, so a
  # stalled or idle service produces no data at all. Treat that gap as
  # breaching; otherwise the alarm can never fire for a total outage.
  treat_missing_data = "breaching"

  alarm_description = "Application throughput below 10 requests per minute"
  alarm_actions     = [aws_sns_topic.alerts.arn]

  dimensions = {
    # NOTE(review): the emitter publishes api_requests with a "service" value
    # taken from DD_SERVICE plus a "status_class" dimension. CloudWatch matches
    # alarms on the exact dimension set, so confirm a metric with only
    # service = "api" is actually published.
    service = "api"
  }
}
|
||||||
|
|
||||||
output "dashboard_url" {
|
output "dashboard_url" {
|
||||||
description = "CloudWatch dashboard URL"
|
description = "CloudWatch dashboard URL"
|
||||||
value = "https://us-east-1.console.aws.amazon.com/cloudwatch/home#dashboards/dashboard/${var.project_name}-${var.environment}-dashboard"
|
value = "https://us-east-1.console.aws.amazon.com/cloudwatch/home#dashboards/dashboard/${var.project_name}-${var.environment}-dashboard"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
output "sns_topic_arn" {
|
||||||
|
description = "SNS topic ARN for alerts"
|
||||||
|
value = aws_sns_topic.alerts.arn
|
||||||
|
}
|
||||||
|
|||||||
@@ -96,6 +96,50 @@ resource "aws_ecs_task_definition" "services" {
|
|||||||
{
|
{
|
||||||
name = "PORT"
|
name = "PORT"
|
||||||
value = tostring(each.port)
|
value = tostring(each.port)
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name = "DD_ENV"
|
||||||
|
value = var.environment
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name = "DD_SERVICE"
|
||||||
|
value = "${var.cluster_name}-${each.key}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name = "DD_VERSION"
|
||||||
|
value = var.container_images[each.key]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name = "DD_TRACE_ENABLED"
|
||||||
|
value = "true"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name = "DD_LOGS_INJECTION"
|
||||||
|
value = "true"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name = "DD_AGENT_HOST"
|
||||||
|
value = "localhost"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name = "DD_AGENT_PORT"
|
||||||
|
value = "8126"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name = "SENTRY_ENVIRONMENT"
|
||||||
|
value = var.environment
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name = "SENTRY_RELEASE"
|
||||||
|
value = var.container_images[each.key]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name = "AWS_REGION"
|
||||||
|
value = "us-east-1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name = "DD_SITE"
|
||||||
|
value = "datadoghq.com"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -115,6 +159,14 @@ resource "aws_ecs_task_definition" "services" {
|
|||||||
{
|
{
|
||||||
name = "RESEND_API_KEY"
|
name = "RESEND_API_KEY"
|
||||||
valueFrom = "${var.secrets_arn}:RESEND_API_KEY::"
|
valueFrom = "${var.secrets_arn}:RESEND_API_KEY::"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name = "SENTRY_DSN"
|
||||||
|
valueFrom = "${var.secrets_arn}:SENTRY_DSN::"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name = "DD_API_KEY"
|
||||||
|
valueFrom = "${var.secrets_arn}:DD_API_KEY::"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
10
package.json
10
package.json
@@ -17,13 +17,17 @@
|
|||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@types/node": "^25.6.0",
|
"@types/node": "^25.6.0",
|
||||||
"vitest": "^4.1.5",
|
"@types/ws": "^8.5.10",
|
||||||
"@vitest/coverage-v8": "^4.1.5",
|
"@vitest/coverage-v8": "^4.1.5",
|
||||||
"turbo": "^2.3.0",
|
"turbo": "^2.3.0",
|
||||||
"typescript": "^5.7.0"
|
"typescript": "^5.7.0",
|
||||||
|
"vitest": "^4.1.5"
|
||||||
},
|
},
|
||||||
"engines": {
|
"engines": {
|
||||||
"node": ">=20.0.0"
|
"node": ">=20.0.0"
|
||||||
},
|
},
|
||||||
"packageManager": "pnpm@9.0.0"
|
"packageManager": "pnpm@9.0.0",
|
||||||
|
"dependencies": {
|
||||||
|
"ws": "^8.16.0"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify';
|
import { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify';
|
||||||
|
import { captureSentryError, setSentryContext, setSentryUser } from '@shieldai/monitoring';
|
||||||
|
|
||||||
export interface ErrorResponse {
|
export interface ErrorResponse {
|
||||||
error: string;
|
error: string;
|
||||||
@@ -13,19 +14,37 @@ export interface ErrorResponse {
|
|||||||
export async function errorHandlingMiddleware(fastify: FastifyInstance) {
|
export async function errorHandlingMiddleware(fastify: FastifyInstance) {
|
||||||
// Custom error handler
|
// Custom error handler
|
||||||
fastify.setErrorHandler((error, request: FastifyRequest, reply: FastifyReply) => {
|
fastify.setErrorHandler((error, request: FastifyRequest, reply: FastifyReply) => {
|
||||||
|
const err = error as Error & { statusCode?: number; code?: string };
|
||||||
const response: ErrorResponse = {
|
const response: ErrorResponse = {
|
||||||
error: error.name || 'Internal Server Error',
|
error: err.name || 'Internal Server Error',
|
||||||
message: error.message || 'An unexpected error occurred',
|
message: err.message || 'An unexpected error occurred',
|
||||||
statusCode: error.statusCode || 500,
|
statusCode: err.statusCode || 500,
|
||||||
code: (error as any).code,
|
code: err.code,
|
||||||
timestamp: new Date().toISOString(),
|
timestamp: new Date().toISOString(),
|
||||||
path: request.url,
|
path: request.url,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Send to Sentry (5xx errors only)
|
||||||
|
if (response.statusCode >= 500) {
|
||||||
|
const userId = (request as FastifyRequest & { user?: { id?: string } }).user?.id;
|
||||||
|
if (userId) setSentryUser(userId);
|
||||||
|
setSentryContext('request', {
|
||||||
|
method: request.method,
|
||||||
|
url: request.url,
|
||||||
|
userAgent: request.headers['user-agent'],
|
||||||
|
requestId: request.id,
|
||||||
|
});
|
||||||
|
captureSentryError(err, {
|
||||||
|
statusCode: String(response.statusCode),
|
||||||
|
path: request.url,
|
||||||
|
method: request.method,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
// Log error
|
// Log error
|
||||||
fastify.log.error({
|
fastify.log.error({
|
||||||
error: response,
|
error: response,
|
||||||
stack: error.stack,
|
stack: err.stack,
|
||||||
method: request.method,
|
method: request.method,
|
||||||
userAgent: request.headers['user-agent'],
|
userAgent: request.headers['user-agent'],
|
||||||
});
|
});
|
||||||
|
|||||||
46
packages/api/src/middleware/monitoring.middleware.ts
Normal file
46
packages/api/src/middleware/monitoring.middleware.ts
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
import { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify';
|
||||||
|
import { emitLatency, emitRequestCount, emitError } from '@shieldai/monitoring';
|
||||||
|
|
||||||
|
/**
 * Service label attached to every emitted metric and warning log.
 * Falls back to 'shieldai-api' when DD_SERVICE is unset or empty.
 */
const SERVICE_NAME = process.env.DD_SERVICE || 'shieldai-api';

/** Responses slower than this many milliseconds are logged as high latency. */
const HIGH_LATENCY_THRESHOLD_MS = 2000;

/**
 * Fastify plugin that publishes request metrics after each response.
 *
 * For every response it emits a request count and the elapsed time, plus an
 * error metric for 5xx responses. It also logs a warning for 5xx responses
 * and for responses slower than {@link HIGH_LATENCY_THRESHOLD_MS}.
 *
 * Metric emission is best-effort: all emits run concurrently and a failing
 * emit is logged instead of aborting the hook, so the warning logs below are
 * always written.
 *
 * NOTE(review): the same per-request elapsed time is published under the
 * 'p50', 'p95' and 'p99' dimension values; no percentile is computed here.
 *
 * NOTE(review): Fastify scopes hooks added inside a registered plugin to that
 * plugin's context unless the plugin opts out of encapsulation — confirm this
 * hook actually runs for routes registered on the root instance.
 *
 * @param fastify - Instance the `onResponse` hook is attached to.
 */
export async function monitoringMiddleware(fastify: FastifyInstance) {
  fastify.addHook('onResponse', async (request: FastifyRequest, reply: FastifyReply) => {
    const statusCode = reply.statusCode;
    const responseTime = reply.elapsedTime;
    const method = request.method;
    const url = request.url;
    const isServerError = statusCode >= 500;

    // The emits are independent of each other, so issue them together rather
    // than paying one network round trip after another.
    const emissions: Promise<unknown>[] = [
      emitRequestCount(SERVICE_NAME, statusCode),
      emitLatency(SERVICE_NAME, responseTime, 'p50'),
      emitLatency(SERVICE_NAME, responseTime, 'p95'),
      emitLatency(SERVICE_NAME, responseTime, 'p99'),
    ];
    if (isServerError) {
      emissions.push(emitError(SERVICE_NAME, 'server_error'));
    }

    // allSettled never rejects: one failed emit must not hide the others or
    // skip the logging below.
    const outcomes = await Promise.allSettled(emissions);
    const failedCount = outcomes.filter((outcome) => outcome.status === 'rejected').length;
    if (failedCount > 0) {
      fastify.log.warn({
        event: 'metric_emit_failed',
        failedCount,
        service: SERVICE_NAME,
      });
    }

    // Log 5xx responses
    if (isServerError) {
      fastify.log.warn({
        event: 'high_latency_or_error',
        method,
        url,
        statusCode,
        responseTime,
        service: SERVICE_NAME,
      });
    }

    // Log high latency requests (>2s)
    if (responseTime > HIGH_LATENCY_THRESHOLD_MS) {
      fastify.log.warn({
        event: 'high_latency',
        method,
        url,
        statusCode,
        responseTime,
        service: SERVICE_NAME,
      });
    }
  });
}
|
||||||
@@ -4,15 +4,19 @@ import helmet from "@fastify/helmet";
|
|||||||
import sensible from "@fastify/sensible";
|
import sensible from "@fastify/sensible";
|
||||||
import { extractOrGenerateRequestId } from "@shieldai/types";
|
import { extractOrGenerateRequestId } from "@shieldai/types";
|
||||||
import { authMiddleware } from "./middleware/auth.middleware";
|
import { authMiddleware } from "./middleware/auth.middleware";
|
||||||
|
import { errorHandlingMiddleware } from "./middleware/error-handling.middleware";
|
||||||
|
import { loggingMiddleware } from "./middleware/logging.middleware";
|
||||||
|
import { monitoringMiddleware } from "./middleware/monitoring.middleware";
|
||||||
import { darkwatchRoutes } from "./routes/darkwatch.routes";
|
import { darkwatchRoutes } from "./routes/darkwatch.routes";
|
||||||
import { voiceprintRoutes } from "./routes/voiceprint.routes";
|
import { voiceprintRoutes } from "./routes/voiceprint.routes";
|
||||||
import { correlationRoutes } from "./routes/correlation.routes";
|
import { correlationRoutes } from "./routes/correlation.routes";
|
||||||
import { extensionRoutes } from "./routes/extension.routes";
|
import { extensionRoutes } from "./routes/extension.routes";
|
||||||
import { initDatadog, initSentry, captureSentryError } from "@shieldai/monitoring";
|
import { initDatadog, initSentry, initDatadogLogs, captureSentryError } from "@shieldai/monitoring";
|
||||||
import { getCorsOrigins } from "./config/api.config";
|
import { getCorsOrigins } from "./config/api.config";
|
||||||
|
|
||||||
initDatadog();
|
initDatadog();
|
||||||
initSentry();
|
initSentry();
|
||||||
|
initDatadogLogs();
|
||||||
|
|
||||||
const app = Fastify({
|
const app = Fastify({
|
||||||
logger: {
|
logger: {
|
||||||
@@ -29,6 +33,15 @@ async function bootstrap() {
|
|||||||
// Register auth middleware to populate request.user
|
// Register auth middleware to populate request.user
|
||||||
await app.register(authMiddleware);
|
await app.register(authMiddleware);
|
||||||
|
|
||||||
|
// Register logging middleware (request/response logging)
|
||||||
|
await app.register(loggingMiddleware);
|
||||||
|
|
||||||
|
// Register monitoring middleware (CloudWatch metrics)
|
||||||
|
await app.register(monitoringMiddleware);
|
||||||
|
|
||||||
|
// Register error handling middleware (Sentry integration)
|
||||||
|
await app.register(errorHandlingMiddleware);
|
||||||
|
|
||||||
app.addHook("onRequest", async (request, _reply) => {
|
app.addHook("onRequest", async (request, _reply) => {
|
||||||
const requestId = extractOrGenerateRequestId(request.headers);
|
const requestId = extractOrGenerateRequestId(request.headers);
|
||||||
request.id = requestId;
|
request.id = requestId;
|
||||||
|
|||||||
23
packages/monitoring/package.json
Normal file
23
packages/monitoring/package.json
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
{
|
||||||
|
"name": "@shieldai/monitoring",
|
||||||
|
"version": "0.1.0",
|
||||||
|
"main": "./dist/index.js",
|
||||||
|
"types": "./dist/index.d.ts",
|
||||||
|
"scripts": {
|
||||||
|
"build": "tsc",
|
||||||
|
"lint": "eslint src/"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"@aws-sdk/client-cloudwatch": "^3.500.0",
|
||||||
|
"dd-trace": "^5.0.0",
|
||||||
|
"@sentry/node": "^8.0.0",
|
||||||
|
"zod": "^3.23.0"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"@types/node": "^25.6.0",
|
||||||
|
"typescript": "^5.7.0"
|
||||||
|
},
|
||||||
|
"exports": {
|
||||||
|
".": "./src/index.ts"
|
||||||
|
}
|
||||||
|
}
|
||||||
97
packages/monitoring/src/cloudwatch.ts
Normal file
97
packages/monitoring/src/cloudwatch.ts
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
import { CloudWatchClient, PutMetricDataCommand, StandardUnit } from '@aws-sdk/client-cloudwatch';
|
||||||
|
import { getMonitoringConfig } from './config';
|
||||||
|
|
||||||
|
/** Lazily created CloudWatch client shared by every metric emit in this module. */
let client: CloudWatchClient | null = null;

/**
 * Returns the shared CloudWatch client, creating it on first use.
 *
 * The region comes from AWS_REGION and falls back to 'us-east-1' when that
 * variable is unset or empty. If the client cannot be constructed, a warning
 * is printed and `null` is returned so callers can skip metric emission.
 *
 * @returns The cached client, or `null` when construction failed.
 */
function getClient(): CloudWatchClient | null {
  if (client) return client;

  const region = process.env.AWS_REGION || 'us-east-1';

  try {
    client = new CloudWatchClient({ region });
    return client;
  } catch {
    console.warn('[CloudWatch] Metrics client initialization skipped');
    return null;
  }
}
|
||||||
|
|
||||||
|
/**
 * A single metric datum: a named value with optional dimensions, unit and
 * timestamp.
 */
export interface MetricDataPoint {
  /** Name of the metric. */
  MetricName: string;
  /** Optional name/value pairs that qualify the metric. */
  Dimensions?: Array<{ Name: string; Value: string }>;
  /** Numeric value of the datum. */
  Value: number;
  /** Optional unit label for the value. */
  Unit?: string;
  /** Optional time the value was observed. */
  Timestamp?: Date;
}
|
||||||
|
|
||||||
|
/** CloudWatch namespace under which every metric from this module is published. */
const NAMESPACE = 'ShieldAI';

/**
 * Publishes one datapoint to CloudWatch in the {@link NAMESPACE} namespace.
 *
 * Emission is best-effort: when no client is available, the value is not a
 * finite number, or the request fails, the function returns without throwing
 * (a warning is printed in the latter two cases).
 *
 * @param serviceName - Value of the `service` dimension, always sent first.
 * @param metricName - Name of the metric to publish.
 * @param value - Datapoint value; NaN and ±Infinity are skipped.
 * @param unit - CloudWatch unit for the value; defaults to 'Count'.
 * @param dimensions - Extra dimensions appended after `service`.
 */
export async function emitMetric(
  serviceName: string,
  metricName: string,
  value: number,
  unit: StandardUnit = 'Count',
  dimensions?: Record<string, string>
): Promise<void> {
  // CloudWatch does not accept NaN or infinite values, so skip the request
  // instead of sending one that is bound to be rejected.
  if (!Number.isFinite(value)) {
    console.warn('[CloudWatch] Metric emit skipped: non-finite value for', metricName);
    return;
  }

  const cw = getClient();
  if (!cw) return;

  const dims: { Name: string; Value: string }[] = [
    { Name: 'service', Value: serviceName },
    ...Object.entries(dimensions ?? {}).map(([n, v]) => ({ Name: n, Value: v })),
  ];

  const command = new PutMetricDataCommand({
    Namespace: NAMESPACE,
    MetricData: [
      {
        MetricName: metricName,
        Dimensions: dims,
        Value: value,
        Unit: unit,
      },
    ],
  });

  try {
    await cw.send(command);
  } catch (err: unknown) {
    // Anything can be thrown, so narrow before reading `.message`.
    const reason = err instanceof Error ? err.message : String(err);
    console.warn('[CloudWatch] Metric emit failed:', reason);
  }
}
|
||||||
|
|
||||||
|
/**
 * Emits an `api_latency` data point in milliseconds.
 *
 * @param serviceName Service the measurement belongs to.
 * @param latencyMs   Latency value in milliseconds.
 * @param percentile  Label sent as the `percentile` dimension.
 */
export async function emitLatency(
  serviceName: string,
  latencyMs: number,
  percentile: 'p50' | 'p95' | 'p99'
) {
  const unit = 'Milliseconds' as StandardUnit;
  const extraDimensions = { percentile };

  await emitMetric(serviceName, 'api_latency', latencyMs, unit, extraDimensions);
}
|
||||||
|
|
||||||
|
/**
 * Emits a single `api_requests` count tagged with the response status class.
 *
 * @param serviceName Service that handled the request.
 * @param statusCode  HTTP status code; reduced to its class, e.g. 404 -> `4xx`.
 */
export async function emitRequestCount(serviceName: string, statusCode: number) {
  const leadingDigit = Math.floor(statusCode / 100);
  const statusClass = `${leadingDigit}xx`;

  await emitMetric(serviceName, 'api_requests', 1, 'Count' as StandardUnit, {
    status_class: statusClass,
  });
}
|
||||||
|
|
||||||
|
/**
 * Emits a single `api_errors` count tagged with the error type.
 *
 * @param serviceName Service that raised the error.
 * @param errorType   Label sent as the `error_type` dimension.
 */
export async function emitError(serviceName: string, errorType: string) {
  const unit = 'Count' as StandardUnit;
  const extraDimensions = { error_type: errorType };

  await emitMetric(serviceName, 'api_errors', 1, unit, extraDimensions);
}
|
||||||
35
packages/monitoring/src/config.ts
Normal file
35
packages/monitoring/src/config.ts
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
import { z } from 'zod';
|
||||||
|
|
||||||
|
// Fallback for the environment-name settings, resolved once when the module loads.
const defaultEnvironmentName = process.env.NODE_ENV || 'development';

// String setting converted with Number(); the fallback is given as a string
// and goes through the same conversion.
const numericSetting = (fallback: string) =>
  z
    .string()
    .transform((raw) => Number(raw))
    .default(fallback);

/**
 * Schema for the monitoring environment variables. Every key has a default,
 * so parsing an object whose values are all undefined still succeeds.
 */
const monitoringEnvSchema = z.object({
  // Datadog APM
  DD_SERVICE: z.string().default('shieldai-api'),
  DD_ENV: z.string().default(defaultEnvironmentName),
  DD_VERSION: z.string().default('0.1.0'),
  DD_TRACE_ENABLED: z.string().default('true'),
  DD_TRACE_SAMPLE_RATE: numericSetting('1.0'),
  DD_LOGS_INJECTION: z.string().default('true'),
  DD_AGENT_HOST: z.string().default('localhost'),
  DD_AGENT_PORT: numericSetting('8126'),

  // Sentry
  SENTRY_DSN: z.string().default(''),
  SENTRY_ENVIRONMENT: z.string().default(defaultEnvironmentName),
  SENTRY_RELEASE: z.string().default('0.1.0'),
  SENTRY_TRACES_SAMPLE_RATE: numericSetting('0.1'),
});
|
||||||
|
|
||||||
|
/** Parsed monitoring settings: the output type of `monitoringEnvSchema`, i.e. after its transforms and defaults are applied. */
export type MonitoringConfig = z.infer<typeof monitoringEnvSchema>;
|
||||||
|
|
||||||
|
/**
 * Reads the monitoring-related environment variables and parses them through
 * `monitoringEnvSchema`, which fills in defaults for unset values.
 *
 * The environment is read on every call; nothing is cached.
 *
 * @returns The parsed monitoring configuration.
 */
export function getMonitoringConfig(): MonitoringConfig {
  const env = process.env;

  const rawSettings = {
    DD_SERVICE: env.DD_SERVICE,
    DD_ENV: env.DD_ENV,
    DD_VERSION: env.DD_VERSION,
    DD_TRACE_ENABLED: env.DD_TRACE_ENABLED,
    DD_TRACE_SAMPLE_RATE: env.DD_TRACE_SAMPLE_RATE,
    DD_LOGS_INJECTION: env.DD_LOGS_INJECTION,
    DD_AGENT_HOST: env.DD_AGENT_HOST,
    DD_AGENT_PORT: env.DD_AGENT_PORT,
    SENTRY_DSN: env.SENTRY_DSN,
    SENTRY_ENVIRONMENT: env.SENTRY_ENVIRONMENT,
    SENTRY_RELEASE: env.SENTRY_RELEASE,
    SENTRY_TRACES_SAMPLE_RATE: env.SENTRY_TRACES_SAMPLE_RATE,
  };

  return monitoringEnvSchema.parse(rawSettings);
}
|
||||||
49
packages/monitoring/src/datadog-logs.ts
Normal file
49
packages/monitoring/src/datadog-logs.ts
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
import { getMonitoringConfig } from './config';
|
||||||
|
|
||||||
|
// Active Datadog log forwarder; null until initDatadogLogs() installs one (it does not when DD_API_KEY is unset).
let logForwarder: { send: (log: string, service: string) => Promise<void> } | null = null;
|
||||||
|
|
||||||
|
/**
 * Installs the module-level Datadog log forwarder, which POSTs log lines to the
 * Datadog HTTP logs intake.
 *
 * Does nothing except log a notice when `DD_API_KEY` is unset. The intake host
 * is built from `DD_SITE` (default `datadoghq.com`).
 *
 * Forwarding is best-effort: network errors and non-2xx responses are logged as
 * warnings and never thrown to the caller of `send`.
 */
export function initDatadogLogs() {
  const config = getMonitoringConfig();

  // Capture the key once so the forwarder never sends an undefined header.
  const apiKey = process.env.DD_API_KEY;
  if (!apiKey) {
    console.log('[Datadog Logs] API key not configured, log forwarding disabled');
    return;
  }

  const site = process.env.DD_SITE || 'datadoghq.com';
  const logIntakeUrl = `https://http-intake.logs.${site}`;

  logForwarder = {
    async send(log: string, service: string) {
      try {
        const payload = JSON.stringify({
          ddsource: 'nodejs',
          ddtags: `env:${config.DD_ENV},service:${service}`,
          hostname: config.DD_SERVICE,
          message: log,
          service,
        });

        const response = await fetch(`${logIntakeUrl}/api/v2/logs`, {
          method: 'POST',
          headers: {
            'DD-API-KEY': apiKey,
            'Content-Type': 'application/json',
          },
          body: payload,
        });

        // fetch() resolves on HTTP error statuses, so a rejected delivery
        // (invalid key, oversized payload, ...) has to be detected explicitly.
        if (!response.ok) {
          console.warn(`[Datadog Logs] Forward failed: HTTP ${response.status} ${response.statusText}`);
        }
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        console.warn('[Datadog Logs] Forward failed:', reason);
      }
    },
  };
}
|
||||||
|
|
||||||
|
/**
 * Sends one log line through the installed forwarder.
 *
 * Resolves without doing anything when no forwarder has been installed.
 *
 * @param log     Log message to forward.
 * @param service Service name attached to the entry; defaults to `shieldai-api`.
 */
export async function forwardLog(log: string, service: string = 'shieldai-api') {
  const forwarder = logForwarder;
  if (forwarder === null) {
    return;
  }

  await forwarder.send(log, service);
}
|
||||||
|
|
||||||
|
/**
 * Exposes the current log forwarder.
 *
 * @returns The installed forwarder, or `null` when none has been installed.
 */
export function getLogForwarder() {
  const current = logForwarder;
  return current;
}
|
||||||
49
packages/monitoring/src/datadog.ts
Normal file
49
packages/monitoring/src/datadog.ts
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
import { getMonitoringConfig } from './config';
|
||||||
|
|
||||||
|
// Set to true after initDatadog() initialises the tracer; makes later calls return early.
let initialized = false;
|
||||||
|
|
||||||
|
/**
 * Initialises Datadog APM tracing once per process.
 *
 * Skips initialisation when it has already succeeded or when
 * `DD_TRACE_ENABLED` is not the string `'true'`. `dd-trace` is loaded lazily,
 * so a missing package or a failing `init` is logged as a warning and leaves
 * tracing off instead of crashing the caller.
 *
 * @returns The tracer returned by `dd-trace`'s `init` on the first successful
 *   call; `undefined` in every other case (already initialised, disabled, failed).
 */
export function initDatadog() {
  if (initialized) return;

  const config = getMonitoringConfig();

  if (config.DD_TRACE_ENABLED !== 'true') {
    console.log('[Datadog] APM tracing disabled');
    return;
  }

  try {
    const tracer = require('dd-trace').init({
      service: config.DD_SERVICE,
      env: config.DD_ENV,
      version: config.DD_VERSION,
      sampleRate: config.DD_TRACE_SAMPLE_RATE,
      logInjection: config.DD_LOGS_INJECTION === 'true',
      // dd-trace names the agent address options `hostname` and `port`;
      // unknown keys such as `agentHost`/`agentPort` are ignored by the tracer.
      hostname: config.DD_AGENT_HOST,
      port: config.DD_AGENT_PORT,
      plugins: true,
      debug: config.DD_ENV === 'development',
    });

    initialized = true;
    console.log(`[Datadog] APM initialized for service "${config.DD_SERVICE}" in "${config.DD_ENV}"`);
    return tracer;
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    console.warn('[Datadog] APM initialization skipped:', reason);
  }
}
|
||||||
|
|
||||||
|
/**
 * Looks up the tracer exposed by the `dd-trace` package.
 *
 * @returns The package's `tracer` export, or `null` when loading `dd-trace` throws.
 */
export function getDatadogTracer() {
  try {
    const { tracer } = require('dd-trace');
    return tracer;
  } catch {
    return null;
  }
}
|
||||||
|
|
||||||
|
/**
 * Starts a Datadog span with the given operation name.
 *
 * The tracer exposes `startSpan`, not `startChild`, so that is what is called.
 * When a span is active in the current scope the new span is created as its
 * child; an explicit `childOf` in `options` takes precedence.
 *
 * @param name    Operation name for the span.
 * @param options Span options forwarded to `startSpan`.
 * @returns The started span, or `undefined` when no tracer is available. The
 *   caller is responsible for finishing the span.
 */
export function createDatadogSpan(name: string, options?: Record<string, unknown>) {
  const tracer = getDatadogTracer();
  if (!tracer) return;

  const activeSpan = tracer.scope?.().active?.() ?? null;
  const parentOption = activeSpan ? { childOf: activeSpan } : {};

  // Spread order lets a caller-supplied `childOf` override the active span.
  return tracer.startSpan(name, { ...parentOption, ...options });
}
|
||||||
5
packages/monitoring/src/index.ts
Normal file
5
packages/monitoring/src/index.ts
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
export * from './datadog';
|
||||||
|
export * from './sentry';
|
||||||
|
export * from './config';
|
||||||
|
export * from './cloudwatch';
|
||||||
|
export * from './datadog-logs';
|
||||||
90
packages/monitoring/src/sentry.ts
Normal file
90
packages/monitoring/src/sentry.ts
Normal file
@@ -0,0 +1,90 @@
|
|||||||
|
import { getMonitoringConfig } from './config';
|
||||||
|
|
||||||
|
// Set to true after initSentry() completes Sentry.init(); makes later calls return early.
let initialized = false;
|
||||||
|
|
||||||
|
// Removes the query string and fragment from a URL, for absolute and relative URLs alike.
function scrubRequestUrl(rawUrl: string): string {
  try {
    const parsed = new URL(rawUrl);
    return parsed.origin + parsed.pathname;
  } catch {
    // Not an absolute URL (e.g. "/path?token=..."): cut at the first "?" or "#".
    return rawUrl.split(/[?#]/, 1)[0];
  }
}

/**
 * Initialises Sentry error tracking once per process.
 *
 * Skips initialisation when it has already succeeded or when `SENTRY_DSN` is
 * empty. `@sentry/node` is loaded lazily, so a missing package or a failing
 * `init` is logged as a warning instead of crashing the caller.
 *
 * Outgoing events have query parameters removed from the request: the URL is
 * reduced to origin + path (relative URLs are cut at the first `?`/`#`) and
 * the separate `query_string` field is dropped, so values carried in the
 * query string are not sent to Sentry.
 */
export function initSentry() {
  if (initialized) return;

  const config = getMonitoringConfig();

  if (!config.SENTRY_DSN) {
    console.log('[Sentry] DSN not configured, error tracking disabled');
    return;
  }

  try {
    const Sentry = require('@sentry/node');

    Sentry.init({
      dsn: config.SENTRY_DSN,
      environment: config.SENTRY_ENVIRONMENT,
      release: config.SENTRY_RELEASE,
      tracesSampleRate: config.SENTRY_TRACES_SAMPLE_RATE,
      attachStacktrace: true,
      debug: config.SENTRY_ENVIRONMENT === 'development',
      // The SDK is loaded through require(), so no event type is available here.
      beforeSend(event: any) {
        const req = event?.request;
        if (req) {
          if (typeof req.url === 'string' && req.url) {
            req.url = scrubRequestUrl(req.url);
          }
          // The query string is also reported separately from the URL.
          delete req.query_string;
        }
        return event;
      },
    });

    initialized = true;
    console.log(`[Sentry] Error tracking initialized for "${config.SENTRY_ENVIRONMENT}"`);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    console.warn('[Sentry] Initialization skipped:', reason);
  }
}
|
||||||
|
|
||||||
|
/**
 * Reports an error to Sentry.
 *
 * A string is wrapped in a new `Error` before being reported. `context`, when
 * given, is passed as the event's tags. If loading `@sentry/node` or the
 * capture call throws, a warning is logged and nothing is rethrown.
 *
 * @param error   Error object or message to report.
 * @param context Optional key/value pairs sent as tags.
 */
export function captureSentryError(error: Error | string, context?: Record<string, unknown>) {
  try {
    const sdk = require('@sentry/node');

    let exception: Error;
    if (typeof error === 'string') {
      exception = new Error(error);
    } else {
      exception = error;
    }

    const tags = context as Record<string, string> | undefined;
    sdk.captureException(exception, { tags });
  } catch {
    console.warn('[Sentry] Error capture skipped (not initialized):', error);
  }
}
|
||||||
|
|
||||||
|
/**
 * Reports a plain message to Sentry at the given severity.
 *
 * If loading `@sentry/node` or the capture call throws, a warning is logged
 * and nothing is rethrown.
 *
 * @param message Text to report.
 * @param level   Severity of the message; defaults to `info`.
 */
export function captureSentryMessage(message: string, level: 'info' | 'warning' | 'error' = 'info') {
  try {
    const sdk = require('@sentry/node');
    const captureContext = { level };
    sdk.captureMessage(message, captureContext);
  } catch {
    console.warn('[Sentry] Message capture skipped (not initialized)');
  }
}
|
||||||
|
|
||||||
|
/**
 * Sets the Sentry user to `userId` plus any extra fields.
 *
 * Keys in `metadata` are applied after `id`, so a `metadata.id` replaces
 * `userId`. Any failure is swallowed.
 *
 * @param userId   Identifier stored as the user's `id`.
 * @param metadata Optional additional user fields.
 */
export function setSentryUser(userId: string, metadata?: Record<string, string>) {
  try {
    const sdk = require('@sentry/node');
    const user = Object.assign({ id: userId }, metadata);
    sdk.setUser(user);
  } catch {
    // Deliberately ignored: failing to set user details must not affect the caller.
  }
}
|
||||||
|
|
||||||
|
/**
 * Sets a named context on Sentry.
 *
 * Any failure is swallowed.
 *
 * @param name Name of the context.
 * @param data Key/value payload stored under that name.
 */
export function setSentryContext(name: string, data: Record<string, unknown>) {
  try {
    const { setContext } = require('@sentry/node');
    setContext(name, data);
  } catch {
    // Deliberately ignored: failing to set context must not affect the caller.
  }
}
|
||||||
|
|
||||||
|
/**
 * Returns the current Sentry hub when the loaded SDK exposes one.
 *
 * Uses `getCurrentHub()` if the SDK provides it and it returns a truthy value,
 * otherwise falls back to the SDK's `hub` property.
 *
 * @returns The hub (possibly `undefined` if the SDK exposes neither), or
 *   `null` when loading `@sentry/node` or the lookup throws.
 */
export function getSentryHub() {
  try {
    const sdk = require('@sentry/node');
    const currentHub = sdk.getCurrentHub?.();
    if (currentHub) {
      return currentHub;
    }
    return sdk.hub;
  } catch {
    return null;
  }
}
|
||||||
9
packages/monitoring/tsconfig.json
Normal file
9
packages/monitoring/tsconfig.json
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
{
|
||||||
|
"extends": "../../tsconfig.base.json",
|
||||||
|
"compilerOptions": {
|
||||||
|
"outDir": "./dist",
|
||||||
|
"rootDir": "./src",
|
||||||
|
"composite": true
|
||||||
|
},
|
||||||
|
"include": ["src"]
|
||||||
|
}
|
||||||
1
packages/monitoring/tsconfig.tsbuildinfo
Normal file
1
packages/monitoring/tsconfig.tsbuildinfo
Normal file
File diff suppressed because one or more lines are too long
4458
pnpm-lock.yaml
generated
4458
pnpm-lock.yaml
generated
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user