feat: integrate Datadog APM + Sentry error tracking with CloudWatch metrics FRE-4806

- Add CloudWatch metrics emitter (api_latency, api_requests, api_errors)
- Add request monitoring middleware for API (latency, error rate, throughput)
- Register error-handling, logging, and monitoring middleware in server.ts
- Add Datadog log forwarding via HTTP intake API
- Add application-level CloudWatch alarms for P99 latency, error rate, throughput
- Inject Datadog/Sentry env vars and secrets into ECS task definitions
- Add DD_API_KEY and SENTRY_DSN to ECS secrets
- Create CloudWatch log groups for datadog and sentry services
- Update .env.example with AWS_REGION and monitoring variables
- Add @aws-sdk/client-cloudwatch dependency to monitoring package

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
2026-05-10 02:15:11 -04:00
parent 57a206d7b3
commit c7df40ac26
18 changed files with 5260 additions and 76 deletions

View File

@@ -23,6 +23,27 @@ variable "cache_endpoint" {
type = string
}
# Mailbox that receives alarm notifications; used as the endpoint of the
# e-mail subscription on the alerts SNS topic.
variable "alert_email" {
  type        = string
  default     = "ops@shieldai.com"
  description = "Email address for alert notifications"
}
# SNS topic that the CloudWatch alarms in this file list in alarm_actions.
resource "aws_sns_topic" "alerts" {
  # Resolves to "<project>-<environment>-alerts".
  name = join("-", [var.project_name, var.environment, "alerts"])

  tags = {
    Project     = var.project_name
    Environment = var.environment
  }
}
# Subscribes var.alert_email to the alerts topic using the e-mail protocol.
resource "aws_sns_topic_subscription" "alerts_email" {
  protocol  = "email"
  endpoint  = var.alert_email
  topic_arn = aws_sns_topic.alerts.arn
}
resource "aws_cloudwatch_dashboard" "main" {
dashboard_name = "${var.project_name}-${var.environment}-dashboard"
@@ -92,6 +113,120 @@ resource "aws_cloudwatch_dashboard" "main" {
region = "us-east-1"
period = 60
}
},
{
  # ALB target response time at the 99th and 95th percentile.
  #
  # The statistic is selected through the trailing per-metric options object
  # (`stat`). Positional strings after the metric name are read as dimension
  # name/value pairs, so a trailing "Statistic", "p99" pair would be treated
  # as a dimension and the widget would match no metric.
  #
  # NOTE(review): the LoadBalancer dimension normally carries the ALB ARN
  # suffix (app/<name>/<id>) — confirm "<cluster_name>-alb" matches the value
  # the load balancer actually publishes.
  type = "metric"
  properties = {
    title = "P99 Latency (Target Group)"
    metrics = [
      ["AWS/ApplicationELB", "TargetResponseTime", "LoadBalancer", "${var.cluster_name}-alb", { stat = "p99", label = "p99" }],
      ["AWS/ApplicationELB", "TargetResponseTime", "LoadBalancer", "${var.cluster_name}-alb", { stat = "p95", label = "p95" }]
    ]
    view    = "timeSeries"
    stacked = false
    region  = "us-east-1"
    period  = 60
  }
},
{
  # Load-balancer-generated 5xx and 4xx responses, summed per 60-second period.
  #
  # CloudWatch metric names are case-sensitive: the ALB publishes
  # HTTPCode_ELB_5XX_Count / HTTPCode_ELB_4XX_Count (upper-case "ELB"), so the
  # "Elb" spelling matches nothing. These are counts, so the statistic is
  # pinned to Sum instead of the dashboard default.
  # The title describes what is plotted (two counts, not a 5xx/total ratio).
  type = "metric"
  properties = {
    title = "ELB Errors (5xx and 4xx count)"
    metrics = [
      ["AWS/ApplicationELB", "HTTPCode_ELB_5XX_Count", "LoadBalancer", "${var.cluster_name}-alb", { stat = "Sum" }],
      ["AWS/ApplicationELB", "HTTPCode_ELB_4XX_Count", "LoadBalancer", "${var.cluster_name}-alb", { stat = "Sum" }]
    ]
    view    = "timeSeries"
    stacked = false
    region  = "us-east-1"
    period  = 60
  }
},
{
  # Requests handled by the load balancer, summed per 60-second period.
  #
  # RequestCount is only meaningful as a Sum, so the statistic is set
  # explicitly. With period = 60 each point is a per-minute total, so the
  # axis is labelled per minute rather than per second.
  type = "metric"
  properties = {
    title = "Throughput (Request Count)"
    metrics = [
      ["AWS/ApplicationELB", "RequestCount", "LoadBalancer", "${var.cluster_name}-alb", { stat = "Sum" }]
    ]
    view    = "timeSeries"
    stacked = false
    region  = "us-east-1"
    period  = 60
    yAxis = {
      left = {
        label = "Requests/min"
      }
    }
  }
},
{
  # Application-emitted latency series from the ShieldAI namespace, one line
  # per percentile dimension value (p99 / p95 / p50).
  #
  # The statistic is passed in the options object. Listing
  # "statistic", "Average" positionally would add a third dimension that the
  # alarm on this metric does not use (it filters on service + percentile
  # only), so the widget and the alarm would query different series.
  type = "metric"
  properties = {
    title = "API Latency Percentiles"
    metrics = [
      ["ShieldAI", "api_latency", "service", "api", "percentile", "p99", { stat = "Average" }],
      ["ShieldAI", "api_latency", "service", "api", "percentile", "p95", { stat = "Average" }],
      ["ShieldAI", "api_latency", "service", "api", "percentile", "p50", { stat = "Average" }]
    ]
    view    = "timeSeries"
    stacked = false
    region  = "us-east-1"
    period  = 60
  }
},
{
  # Application-emitted error count (ShieldAI/api_errors, service=api),
  # summed per 60-second period.
  #
  # The Sum statistic goes in the options object; a positional
  # "statistic", "Sum" pair would be read as an extra dimension that the
  # corresponding alarm (service dimension only) does not use.
  type = "metric"
  properties = {
    title = "API Error Rate"
    metrics = [
      ["ShieldAI", "api_errors", "service", "api", { stat = "Sum" }]
    ]
    view    = "timeSeries"
    stacked = false
    region  = "us-east-1"
    period  = 60
  }
},
{
  # Application-emitted request count (ShieldAI/api_requests, service=api),
  # summed per 60-second period.
  #
  # The Sum statistic goes in the options object; a positional
  # "statistic", "Sum" pair would be read as an extra dimension that the
  # corresponding alarm (service dimension only) does not use.
  type = "metric"
  properties = {
    title = "API Throughput"
    metrics = [
      ["ShieldAI", "api_requests", "service", "api", { stat = "Sum" }]
    ]
    view    = "timeSeries"
    stacked = false
    region  = "us-east-1"
    period  = 60
  }
},
{
  # Number of running tasks in the cluster.
  #
  # RunningTaskCount is a Container Insights metric published under the
  # "ECS/ContainerInsights" namespace; the "AWS/ECS" namespace does not carry
  # it, so the widget would stay empty.
  # NOTE(review): requires Container Insights to be enabled on the cluster —
  # confirm the cluster setting.
  type = "metric"
  properties = {
    title = "ECS Running Tasks"
    metrics = [
      ["ECS/ContainerInsights", "RunningTaskCount", "ClusterName", var.cluster_name]
    ]
    view    = "timeSeries"
    stacked = false
    region  = "us-east-1"
    period  = 60
  }
},
{
  # Read and write IOPS for the database instance named by var.rds_identifier.
  type = "metric"
  properties = {
    title   = "RDS Read/Write IOPS"
    region  = "us-east-1"
    view    = "timeSeries"
    stacked = false
    period  = 60
    metrics = [
      ["AWS/RDS", "ReadIOPS", "DBInstanceIdentifier", var.rds_identifier],
      ["AWS/RDS", "WriteIOPS", "DBInstanceIdentifier", var.rds_identifier],
    ]
  }
}
]
})
@@ -107,6 +242,7 @@ resource "aws_cloudwatch_metric_alarm" "ecs_cpu_high" {
statistic = "Average"
threshold = 80
alarm_description = "ECS CPU utilization above 80%"
alarm_actions = [aws_sns_topic.alerts.arn]
dimensions = {
ClusterName = var.cluster_name
@@ -123,6 +259,7 @@ resource "aws_cloudwatch_metric_alarm" "ecs_memory_high" {
statistic = "Average"
threshold = 85
alarm_description = "ECS memory utilization above 85%"
alarm_actions = [aws_sns_topic.alerts.arn]
dimensions = {
ClusterName = var.cluster_name
@@ -139,6 +276,7 @@ resource "aws_cloudwatch_metric_alarm" "alb_5xx" {
statistic = "Sum"
threshold = 10
alarm_description = "ALB 5xx errors above 10 per minute"
alarm_actions = [aws_sns_topic.alerts.arn]
dimensions = {
LoadBalancer = "${var.cluster_name}-alb"
@@ -155,6 +293,7 @@ resource "aws_cloudwatch_metric_alarm" "rds_cpu_high" {
statistic = "Average"
threshold = 75
alarm_description = "RDS CPU utilization above 75%"
alarm_actions = [aws_sns_topic.alerts.arn]
dimensions = {
DBInstanceIdentifier = var.rds_identifier
@@ -171,13 +310,155 @@ resource "aws_cloudwatch_metric_alarm" "rds_free_storage" {
statistic = "Average"
threshold = 524288000
alarm_description = "RDS free storage below 500MB"
alarm_actions = [aws_sns_topic.alerts.arn]
dimensions = {
DBInstanceIdentifier = var.rds_identifier
}
}
# Notifies the alerts topic when the ALB's p99 TargetResponseTime stays above
# 2 seconds for three consecutive 60-second periods.
resource "aws_cloudwatch_metric_alarm" "p99_latency_high" {
  alarm_name          = "${var.project_name}-${var.environment}-p99-latency-high"
  alarm_description   = "P99 latency above 2 seconds"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 3
  metric_name         = "TargetResponseTime"
  namespace           = "AWS/ApplicationELB"
  period              = 60

  # Percentile statistics must be given via extended_statistic; `statistic`
  # only accepts SampleCount, Average, Sum, Minimum and Maximum, so
  # statistic = "p99" is rejected.
  extended_statistic = "p99"

  # TargetResponseTime is reported in seconds.
  threshold     = 2
  alarm_actions = [aws_sns_topic.alerts.arn]

  dimensions = {
    LoadBalancer = "${var.cluster_name}-alb"
  }
}
# Notifies the alerts topic when the load balancer itself returns more than
# five 5xx responses per minute for three consecutive 60-second periods.
resource "aws_cloudwatch_metric_alarm" "error_rate_high" {
  alarm_name          = "${var.project_name}-${var.environment}-error-rate-high"
  alarm_description   = "Error rate above 5 errors per minute"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 3

  # Metric names are case-sensitive; the ALB publishes HTTPCode_ELB_5XX_Count
  # (upper-case "ELB"). The "Elb" spelling matches no metric, leaving the
  # alarm permanently without data.
  metric_name = "HTTPCode_ELB_5XX_Count"
  namespace   = "AWS/ApplicationELB"

  period        = 60
  statistic     = "Sum"
  threshold     = 5
  alarm_actions = [aws_sns_topic.alerts.arn]

  dimensions = {
    LoadBalancer = "${var.cluster_name}-alb"
  }
}
# Notifies the alerts topic when the load balancer handles fewer than 10
# requests per minute for five consecutive 60-second periods.
resource "aws_cloudwatch_metric_alarm" "throughput_low" {
  alarm_name          = "${var.project_name}-${var.environment}-throughput-low"
  alarm_description   = "Throughput below 10 requests per minute"
  comparison_operator = "LessThanThreshold"
  evaluation_periods  = 5
  metric_name         = "RequestCount"
  namespace           = "AWS/ApplicationELB"
  period              = 60
  statistic           = "Sum"
  threshold           = 10
  alarm_actions       = [aws_sns_topic.alerts.arn]

  # A complete traffic outage can produce no datapoints at all. With the
  # default missing-data handling the alarm would sit in INSUFFICIENT_DATA
  # and never notify, so missing periods are counted as below the threshold.
  treat_missing_data = "breaching"

  dimensions = {
    LoadBalancer = "${var.cluster_name}-alb"
  }
}
# Log group for the api service; entries are retained for 30 days.
resource "aws_cloudwatch_log_group" "api" {
  # Resolves to "/<project>/<environment>/api".
  name              = format("/%s/%s/api", var.project_name, var.environment)
  retention_in_days = 30

  tags = {
    Service     = "api"
    Project     = var.project_name
    Environment = var.environment
  }
}
# Log group for the datadog service; entries are retained for 30 days.
resource "aws_cloudwatch_log_group" "datadog" {
  # Resolves to "/<project>/<environment>/datadog".
  name              = format("/%s/%s/datadog", var.project_name, var.environment)
  retention_in_days = 30

  tags = {
    Service     = "datadog"
    Project     = var.project_name
    Environment = var.environment
  }
}
# Log group for the sentry service; entries are retained for 30 days.
resource "aws_cloudwatch_log_group" "sentry" {
  # Resolves to "/<project>/<environment>/sentry".
  name              = format("/%s/%s/sentry", var.project_name, var.environment)
  retention_in_days = 30

  tags = {
    Service     = "sentry"
    Project     = var.project_name
    Environment = var.environment
  }
}
# Notifies the alerts topic when the application-emitted latency series
# (ShieldAI/api_latency, service=api, percentile=p99) averages above 2000 for
# three consecutive 60-second periods.
resource "aws_cloudwatch_metric_alarm" "app_p99_latency_high" {
  alarm_name        = join("-", [var.project_name, var.environment, "app-p99-latency-high"])
  alarm_description = "Application P99 latency above 2000ms"

  # Which series to watch.
  namespace   = "ShieldAI"
  metric_name = "api_latency"
  dimensions = {
    percentile = "p99"
    service    = "api"
  }

  # How it is evaluated.
  statistic           = "Average"
  period              = 60
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = 2000

  alarm_actions = [aws_sns_topic.alerts.arn]
}
# Notifies the alerts topic when the application-emitted error count
# (ShieldAI/api_errors, service=api) sums to more than 10 per 60-second period
# for three consecutive periods.
resource "aws_cloudwatch_metric_alarm" "app_error_rate_high" {
  alarm_name        = join("-", [var.project_name, var.environment, "app-error-rate-high"])
  alarm_description = "Application error count above 10 per minute"

  # Which series to watch.
  namespace   = "ShieldAI"
  metric_name = "api_errors"
  dimensions = {
    service = "api"
  }

  # How it is evaluated.
  statistic           = "Sum"
  period              = 60
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = 10

  alarm_actions = [aws_sns_topic.alerts.arn]
}
# Notifies the alerts topic when the application-emitted request count
# (ShieldAI/api_requests, service=api) sums to fewer than 10 per 60-second
# period for five consecutive periods.
resource "aws_cloudwatch_metric_alarm" "app_throughput_low" {
  alarm_name          = "${var.project_name}-${var.environment}-app-throughput-low"
  alarm_description   = "Application throughput below 10 requests per minute"
  comparison_operator = "LessThanThreshold"
  evaluation_periods  = 5
  metric_name         = "api_requests"
  namespace           = "ShieldAI"
  period              = 60
  statistic           = "Sum"
  threshold           = 10
  alarm_actions       = [aws_sns_topic.alerts.arn]

  # If the service is down or idle and no datapoints are published, the
  # default missing-data handling leaves the alarm in INSUFFICIENT_DATA and
  # nobody is notified. Count missing periods as below the threshold so a
  # silent service also raises the alarm.
  treat_missing_data = "breaching"

  dimensions = {
    service = "api"
  }
}
# Console link to the "<project>-<environment>-dashboard" CloudWatch dashboard
# (us-east-1 console host).
output "dashboard_url" {
  description = "CloudWatch dashboard URL"
  value = format(
    "https://us-east-1.console.aws.amazon.com/cloudwatch/home#dashboards/dashboard/%s-%s-dashboard",
    var.project_name,
    var.environment,
  )
}
# Exposes the alerts topic ARN to callers of this module.
output "sns_topic_arn" {
  value       = aws_sns_topic.alerts.arn
  description = "SNS topic ARN for alerts"
}

View File

@@ -96,6 +96,50 @@ resource "aws_ecs_task_definition" "services" {
{
name = "PORT"
value = tostring(each.port)
},
# Monitoring-related environment variables added to every service container:
# Datadog (DD_*), Sentry (SENTRY_*) and the AWS region.
{
name = "DD_ENV"
value = var.environment
},
{
name = "DD_SERVICE"
value = "${var.cluster_name}-${each.key}"
},
# NOTE(review): DD_VERSION (and SENTRY_RELEASE below) receive whatever string
# var.container_images holds for this service — presumably a full image
# reference rather than a bare version tag; confirm that is the intended value.
{
name = "DD_VERSION"
value = var.container_images[each.key]
},
{
name = "DD_TRACE_ENABLED"
value = "true"
},
{
name = "DD_LOGS_INJECTION"
value = "true"
},
# NOTE(review): "localhost" presumably assumes a Datadog agent running in the
# same task — confirm an agent container is defined.
{
name = "DD_AGENT_HOST"
value = "localhost"
},
# NOTE(review): Datadog tracers conventionally read DD_TRACE_AGENT_PORT —
# confirm something actually consumes DD_AGENT_PORT.
{
name = "DD_AGENT_PORT"
value = "8126"
},
{
name = "SENTRY_ENVIRONMENT"
value = var.environment
},
{
name = "SENTRY_RELEASE"
value = var.container_images[each.key]
},
# NOTE(review): region is hard-coded to us-east-1 here.
{
name = "AWS_REGION"
value = "us-east-1"
},
{
name = "DD_SITE"
value = "datadoghq.com"
}
]
@@ -115,6 +159,14 @@ resource "aws_ecs_task_definition" "services" {
{
name = "RESEND_API_KEY"
valueFrom = "${var.secrets_arn}:RESEND_API_KEY::"
},
# Sentry DSN and Datadog API key. Both reference keys of the secret identified
# by var.secrets_arn, using the same valueFrom pattern as RESEND_API_KEY.
# NOTE(review): the SENTRY_DSN and DD_API_KEY keys presumably must already
# exist in that secret — confirm before deploying.
{
name = "SENTRY_DSN"
valueFrom = "${var.secrets_arn}:SENTRY_DSN::"
},
{
name = "DD_API_KEY"
valueFrom = "${var.secrets_arn}:DD_API_KEY::"
}
]