- Add CloudWatch metrics emitter (api_latency, api_requests, api_errors)
- Add request monitoring middleware for API (latency, error rate, throughput)
- Register error-handling, logging, and monitoring middleware in server.ts
- Add Datadog log forwarding via HTTP intake API
- Add application-level CloudWatch alarms for P99 latency, error rate, throughput
- Inject Datadog/Sentry env vars and secrets into ECS task definitions
- Add DD_API_KEY and SENTRY_DSN to ECS secrets
- Create CloudWatch log groups for datadog and sentry services
- Update .env.example with AWS_REGION and monitoring variables
- Add @aws-sdk/client-cloudwatch dependency to monitoring package

Co-Authored-By: Paperclip <noreply@paperclip.ing>
465 lines
13 KiB
HCL
465 lines
13 KiB
HCL
# Deployment environment name; interpolated into resource names, log group
# paths and tags throughout this file.
variable "environment" {
  type        = string
  description = "Deployment environment"
}
|
|
|
|
# ECS cluster name; used as the ClusterName dimension for ECS metrics and as
# the prefix of the "<cluster_name>-alb" LoadBalancer dimension value.
variable "cluster_name" {
  type        = string
  description = "ECS cluster name"
}
|
|
|
|
# Project name; combined with the environment to build resource names,
# log group paths and tags.
variable "project_name" {
  type        = string
  description = "Project name"
}
|
|
|
|
# RDS instance identifier; used as the DBInstanceIdentifier dimension for the
# RDS dashboard widgets and alarms.
variable "rds_identifier" {
  type        = string
  description = "RDS instance identifier"
}
|
|
|
|
# ElastiCache endpoint.
# NOTE(review): not referenced by any resource in the visible part of this
# file; kept because it is part of the module's input interface.
variable "cache_endpoint" {
  type        = string
  description = "ElastiCache endpoint"
}
|
|
|
|
# Recipient of alarm notifications; subscribed to the alerts SNS topic via the
# "email" protocol. Callers may override the default address.
variable "alert_email" {
  type        = string
  default     = "ops@shieldai.com"
  description = "Email address for alert notifications"
}
|
|
|
|
# SNS topic that the CloudWatch alarms in this file publish to.
resource "aws_sns_topic" "alerts" {
  # Resolves to "<project>-<environment>-alerts".
  name = format("%s-%s-alerts", var.project_name, var.environment)

  tags = {
    Project     = var.project_name
    Environment = var.environment
  }
}
|
|
|
|
# Email subscription that delivers messages from the alerts topic to
# var.alert_email.
resource "aws_sns_topic_subscription" "alerts_email" {
  protocol  = "email"
  endpoint  = var.alert_email
  topic_arn = aws_sns_topic.alerts.arn
}
|
|
|
|
# CloudWatch dashboard combining infrastructure metrics (ECS, RDS, ALB) with
# the application-level metrics published under the "ShieldAI" namespace.
#
# Metric array syntax: [Namespace, MetricName, DimName, DimValue, ..., {options}].
# Every string after the metric name is read as a dimension name/value pair, so
# a statistic must be supplied through a trailing options object
# ({ stat = "p99" }) or the widget-level `stat` property, never as extra
# strings in the array.
#
# NOTE(review): the LoadBalancer dimension CloudWatch publishes is the ALB ARN
# suffix ("app/<name>/<id>"), not the load balancer name. "${var.cluster_name}-alb"
# is kept as in the original; confirm it matches what the ALB actually reports.
# NOTE(review): the region is hard-coded to us-east-1 in every widget.
resource "aws_cloudwatch_dashboard" "main" {
  dashboard_name = "${var.project_name}-${var.environment}-dashboard"

  dashboard_body = jsonencode({
    widgets = [
      {
        type = "metric"
        properties = {
          title = "ECS CPU Utilization"
          metrics = [
            ["AWS/ECS", "CPUUtilization", "ClusterName", var.cluster_name]
          ]
          view    = "timeSeries"
          stacked = false
          region  = "us-east-1"
          period  = 300
        }
      },
      {
        type = "metric"
        properties = {
          title = "ECS Memory Utilization"
          metrics = [
            ["AWS/ECS", "MemoryUtilization", "ClusterName", var.cluster_name]
          ]
          view    = "timeSeries"
          stacked = false
          region  = "us-east-1"
          period  = 300
        }
      },
      {
        type = "metric"
        properties = {
          title = "RDS CPU Utilization"
          metrics = [
            ["AWS/RDS", "CPUUtilization", "DBInstanceIdentifier", var.rds_identifier]
          ]
          view    = "timeSeries"
          stacked = false
          region  = "us-east-1"
          period  = 300
        }
      },
      {
        type = "metric"
        properties = {
          title = "ALB Request Count"
          metrics = [
            ["AWS/ApplicationELB", "RequestCount", "LoadBalancer", "${var.cluster_name}-alb"]
          ]
          view    = "timeSeries"
          stacked = false
          region  = "us-east-1"
          period  = 60
          # Count metrics are only meaningful as a Sum; the default Average
          # of RequestCount is not a request volume.
          stat = "Sum"
        }
      },
      {
        type = "metric"
        properties = {
          title = "ALB 5xx Errors"
          metrics = [
            # Metric names are case-sensitive: "ELB" is upper-case.
            ["AWS/ApplicationELB", "HTTPCode_ELB_5XX_Count", "LoadBalancer", "${var.cluster_name}-alb"]
          ]
          view    = "timeSeries"
          stacked = false
          region  = "us-east-1"
          period  = 60
          stat    = "Sum"
        }
      },
      {
        type = "metric"
        properties = {
          title = "P99 Latency (Target Group)"
          metrics = [
            # Percentiles are selected with the per-metric `stat` option.
            ["AWS/ApplicationELB", "TargetResponseTime", "LoadBalancer", "${var.cluster_name}-alb", { stat = "p99" }],
            ["AWS/ApplicationELB", "TargetResponseTime", "LoadBalancer", "${var.cluster_name}-alb", { stat = "p95" }]
          ]
          view    = "timeSeries"
          stacked = false
          region  = "us-east-1"
          period  = 60
        }
      },
      {
        type = "metric"
        properties = {
          # NOTE(review): despite the title this plots raw 5xx and 4xx counts,
          # not a ratio; a metric-math expression would be needed for a rate.
          title = "Error Rate (5xx / Total)"
          metrics = [
            ["AWS/ApplicationELB", "HTTPCode_ELB_5XX_Count", "LoadBalancer", "${var.cluster_name}-alb"],
            ["AWS/ApplicationELB", "HTTPCode_ELB_4XX_Count", "LoadBalancer", "${var.cluster_name}-alb"]
          ]
          view    = "timeSeries"
          stacked = false
          region  = "us-east-1"
          period  = 60
          stat    = "Sum"
        }
      },
      {
        type = "metric"
        properties = {
          title = "Throughput (Request Count)"
          metrics = [
            ["AWS/ApplicationELB", "RequestCount", "LoadBalancer", "${var.cluster_name}-alb"]
          ]
          view    = "timeSeries"
          stacked = false
          region  = "us-east-1"
          period  = 60
          stat    = "Sum"
          yAxis = {
            left = {
              # Sum over a 60-second period is requests per minute.
              label = "Requests/min"
            }
          }
        }
      },
      {
        type = "metric"
        properties = {
          title = "API Latency Percentiles"
          metrics = [
            # Dimensions match the app_p99_latency_high alarm (service, percentile).
            ["ShieldAI", "api_latency", "service", "api", "percentile", "p99", { stat = "Average" }],
            ["ShieldAI", "api_latency", "service", "api", "percentile", "p95", { stat = "Average" }],
            ["ShieldAI", "api_latency", "service", "api", "percentile", "p50", { stat = "Average" }]
          ]
          view    = "timeSeries"
          stacked = false
          region  = "us-east-1"
          period  = 60
        }
      },
      {
        type = "metric"
        properties = {
          title = "API Error Rate"
          metrics = [
            ["ShieldAI", "api_errors", "service", "api", { stat = "Sum" }]
          ]
          view    = "timeSeries"
          stacked = false
          region  = "us-east-1"
          period  = 60
        }
      },
      {
        type = "metric"
        properties = {
          title = "API Throughput"
          metrics = [
            ["ShieldAI", "api_requests", "service", "api", { stat = "Sum" }]
          ]
          view    = "timeSeries"
          stacked = false
          region  = "us-east-1"
          period  = 60
        }
      },
      {
        type = "metric"
        properties = {
          title = "ECS Running Tasks"
          metrics = [
            # NOTE(review): RunningTaskCount is normally published by Container
            # Insights (namespace "ECS/ContainerInsights"), not "AWS/ECS" -
            # confirm this widget actually shows data.
            ["AWS/ECS", "RunningTaskCount", "ClusterName", var.cluster_name]
          ]
          view    = "timeSeries"
          stacked = false
          region  = "us-east-1"
          period  = 60
        }
      },
      {
        type = "metric"
        properties = {
          title = "RDS Read/Write IOPS"
          metrics = [
            ["AWS/RDS", "ReadIOPS", "DBInstanceIdentifier", var.rds_identifier],
            ["AWS/RDS", "WriteIOPS", "DBInstanceIdentifier", var.rds_identifier]
          ]
          view    = "timeSeries"
          stacked = false
          region  = "us-east-1"
          period  = 60
        }
      }
    ]
  })
}
|
|
|
|
# Alarm: average ECS cluster CPU above 80% for two consecutive 5-minute
# periods. Notifies the alerts SNS topic.
resource "aws_cloudwatch_metric_alarm" "ecs_cpu_high" {
  alarm_name        = format("%s-%s-ecs-cpu-high", var.project_name, var.environment)
  alarm_description = "ECS CPU utilization above 80%"

  # Metric being watched.
  namespace   = "AWS/ECS"
  metric_name = "CPUUtilization"
  dimensions = {
    ClusterName = var.cluster_name
  }

  # Evaluation rule.
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = 80

  alarm_actions = [aws_sns_topic.alerts.arn]
}
|
|
|
|
# Alarm: average ECS cluster memory above 85% for two consecutive 5-minute
# periods. Notifies the alerts SNS topic.
resource "aws_cloudwatch_metric_alarm" "ecs_memory_high" {
  alarm_name        = format("%s-%s-ecs-memory-high", var.project_name, var.environment)
  alarm_description = "ECS memory utilization above 85%"

  # Metric being watched.
  namespace   = "AWS/ECS"
  metric_name = "MemoryUtilization"
  dimensions = {
    ClusterName = var.cluster_name
  }

  # Evaluation rule.
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = 85

  alarm_actions = [aws_sns_topic.alerts.arn]
}
|
|
|
|
# Alarm: more than 10 load-balancer-generated 5xx responses per minute for
# three consecutive minutes. Notifies the alerts SNS topic.
#
# NOTE(review): the LoadBalancer dimension CloudWatch publishes is the ALB ARN
# suffix ("app/<name>/<id>"); confirm "${var.cluster_name}-alb" matches it.
resource "aws_cloudwatch_metric_alarm" "alb_5xx" {
  alarm_name          = "${var.project_name}-${var.environment}-alb-5xx"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 3
  # Metric names are case-sensitive; the ALB metric is spelled with an
  # upper-case "ELB". The previous "HTTPCode_Elb_5XX_Count" matched nothing,
  # so the alarm could never leave INSUFFICIENT_DATA.
  metric_name       = "HTTPCode_ELB_5XX_Count"
  namespace         = "AWS/ApplicationELB"
  period            = 60
  statistic         = "Sum"
  threshold         = 10
  alarm_description = "ALB 5xx errors above 10 per minute"
  alarm_actions     = [aws_sns_topic.alerts.arn]

  # The ALB only reports this metric when it is non-zero, so an absent
  # datapoint means "no errors" and must not hold the alarm state.
  treat_missing_data = "notBreaching"

  dimensions = {
    LoadBalancer = "${var.cluster_name}-alb"
  }
}
|
|
|
|
# Alarm: average RDS instance CPU above 75% for two consecutive 5-minute
# periods. Notifies the alerts SNS topic.
resource "aws_cloudwatch_metric_alarm" "rds_cpu_high" {
  alarm_name        = format("%s-%s-rds-cpu-high", var.project_name, var.environment)
  alarm_description = "RDS CPU utilization above 75%"

  # Metric being watched.
  namespace   = "AWS/RDS"
  metric_name = "CPUUtilization"
  dimensions = {
    DBInstanceIdentifier = var.rds_identifier
  }

  # Evaluation rule.
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = 75

  alarm_actions = [aws_sns_topic.alerts.arn]
}
|
|
|
|
# Alarm: average RDS free storage below the threshold for two consecutive
# 5-minute periods. Notifies the alerts SNS topic.
resource "aws_cloudwatch_metric_alarm" "rds_free_storage" {
  alarm_name        = format("%s-%s-rds-free-storage", var.project_name, var.environment)
  alarm_description = "RDS free storage below 500MB"

  # Metric being watched.
  namespace   = "AWS/RDS"
  metric_name = "FreeStorageSpace"
  dimensions = {
    DBInstanceIdentifier = var.rds_identifier
  }

  # Evaluation rule.
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "LessThanThreshold"
  # 500 * 1024 * 1024 = 524288000.
  threshold = 500 * 1024 * 1024

  alarm_actions = [aws_sns_topic.alerts.arn]
}
|
|
|
|
# Alarm: p99 of ALB TargetResponseTime above 2 for three consecutive 1-minute
# periods (per the description, the threshold is in seconds). Notifies the
# alerts SNS topic.
#
# NOTE(review): the LoadBalancer dimension CloudWatch publishes is the ALB ARN
# suffix ("app/<name>/<id>"); confirm "${var.cluster_name}-alb" matches it.
resource "aws_cloudwatch_metric_alarm" "p99_latency_high" {
  alarm_name          = "${var.project_name}-${var.environment}-p99-latency-high"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 3
  metric_name         = "TargetResponseTime"
  namespace           = "AWS/ApplicationELB"
  period              = 60
  # Percentiles must be given via `extended_statistic`; `statistic` accepts
  # only SampleCount, Average, Sum, Minimum and Maximum, so "p99" there is
  # rejected by the provider.
  extended_statistic = "p99"
  threshold          = 2
  alarm_description  = "P99 latency above 2 seconds"
  alarm_actions      = [aws_sns_topic.alerts.arn]

  dimensions = {
    LoadBalancer = "${var.cluster_name}-alb"
  }
}
|
|
|
|
# Alarm: more than 5 load-balancer-generated 5xx responses per minute for
# three consecutive minutes. Notifies the alerts SNS topic.
# Watches the same metric as the alb_5xx alarm, with a lower threshold.
#
# NOTE(review): the LoadBalancer dimension CloudWatch publishes is the ALB ARN
# suffix ("app/<name>/<id>"); confirm "${var.cluster_name}-alb" matches it.
resource "aws_cloudwatch_metric_alarm" "error_rate_high" {
  alarm_name          = "${var.project_name}-${var.environment}-error-rate-high"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 3
  # Metric names are case-sensitive; the ALB metric is spelled with an
  # upper-case "ELB". The previous "HTTPCode_Elb_5XX_Count" matched nothing.
  metric_name       = "HTTPCode_ELB_5XX_Count"
  namespace         = "AWS/ApplicationELB"
  period            = 60
  statistic         = "Sum"
  threshold         = 5
  alarm_description = "Error rate above 5 errors per minute"
  alarm_actions     = [aws_sns_topic.alerts.arn]

  # The ALB only reports this metric when it is non-zero, so an absent
  # datapoint means "no errors" and must not hold the alarm state.
  treat_missing_data = "notBreaching"

  dimensions = {
    LoadBalancer = "${var.cluster_name}-alb"
  }
}
|
|
|
|
# Alarm: fewer than 10 ALB requests per minute for five consecutive minutes.
# Notifies the alerts SNS topic.
#
# NOTE(review): the LoadBalancer dimension CloudWatch publishes is the ALB ARN
# suffix ("app/<name>/<id>"); confirm "${var.cluster_name}-alb" matches it.
resource "aws_cloudwatch_metric_alarm" "throughput_low" {
  alarm_name          = "${var.project_name}-${var.environment}-throughput-low"
  comparison_operator = "LessThanThreshold"
  evaluation_periods  = 5
  metric_name         = "RequestCount"
  namespace           = "AWS/ApplicationELB"
  period              = 60
  statistic           = "Sum"
  threshold           = 10
  alarm_description   = "Throughput below 10 requests per minute"
  alarm_actions       = [aws_sns_topic.alerts.arn]

  # With zero traffic the ALB emits no RequestCount datapoints at all. Under
  # the default missing-data handling the alarm would sit in
  # INSUFFICIENT_DATA during a total outage, which is the case it most needs
  # to catch, so missing data is treated as below the threshold.
  treat_missing_data = "breaching"

  dimensions = {
    LoadBalancer = "${var.cluster_name}-alb"
  }
}
|
|
|
|
# Log group for the "api" service; events are retained for 30 days.
resource "aws_cloudwatch_log_group" "api" {
  # Resolves to "/<project>/<environment>/api".
  name              = format("/%s/%s/api", var.project_name, var.environment)
  retention_in_days = 30

  tags = {
    Service     = "api"
    Project     = var.project_name
    Environment = var.environment
  }
}
|
|
|
|
# Log group for the "datadog" service; events are retained for 30 days.
resource "aws_cloudwatch_log_group" "datadog" {
  # Resolves to "/<project>/<environment>/datadog".
  name              = format("/%s/%s/datadog", var.project_name, var.environment)
  retention_in_days = 30

  tags = {
    Service     = "datadog"
    Project     = var.project_name
    Environment = var.environment
  }
}
|
|
|
|
# Log group for the "sentry" service; events are retained for 30 days.
resource "aws_cloudwatch_log_group" "sentry" {
  # Resolves to "/<project>/<environment>/sentry".
  name              = format("/%s/%s/sentry", var.project_name, var.environment)
  retention_in_days = 30

  tags = {
    Service     = "sentry"
    Project     = var.project_name
    Environment = var.environment
  }
}
|
|
|
|
# Alarm: average of the application-published api_latency metric for the
# percentile=p99 series above 2000 for three consecutive 1-minute periods
# (per the description, the value is in milliseconds). Notifies the alerts
# SNS topic.
resource "aws_cloudwatch_metric_alarm" "app_p99_latency_high" {
  alarm_name        = format("%s-%s-app-p99-latency-high", var.project_name, var.environment)
  alarm_description = "Application P99 latency above 2000ms"

  # Metric being watched (custom namespace).
  namespace   = "ShieldAI"
  metric_name = "api_latency"
  dimensions = {
    percentile = "p99"
    service    = "api"
  }

  # Evaluation rule.
  statistic           = "Average"
  period              = 60
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = 2000

  alarm_actions = [aws_sns_topic.alerts.arn]
}
|
|
|
|
# Alarm: sum of the application-published api_errors metric above 10 per
# minute for three consecutive minutes. Notifies the alerts SNS topic.
resource "aws_cloudwatch_metric_alarm" "app_error_rate_high" {
  alarm_name        = format("%s-%s-app-error-rate-high", var.project_name, var.environment)
  alarm_description = "Application error count above 10 per minute"

  # Metric being watched (custom namespace).
  namespace   = "ShieldAI"
  metric_name = "api_errors"
  dimensions = {
    service = "api"
  }

  # Evaluation rule.
  statistic           = "Sum"
  period              = 60
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = 10

  alarm_actions = [aws_sns_topic.alerts.arn]
}
|
|
|
|
# Alarm: sum of the application-published api_requests metric below 10 per
# minute for five consecutive minutes. Notifies the alerts SNS topic.
resource "aws_cloudwatch_metric_alarm" "app_throughput_low" {
  alarm_name          = "${var.project_name}-${var.environment}-app-throughput-low"
  comparison_operator = "LessThanThreshold"
  evaluation_periods  = 5
  metric_name         = "api_requests"
  namespace           = "ShieldAI"
  period              = 60
  statistic           = "Sum"
  threshold           = 10
  alarm_description   = "Application throughput below 10 requests per minute"
  alarm_actions       = [aws_sns_topic.alerts.arn]

  # If the application is down or receives no requests it publishes no
  # datapoints; treat that as below the threshold so a full outage raises
  # the alarm instead of leaving it in INSUFFICIENT_DATA.
  # NOTE(review): assumes the emitter does not publish explicit zeros - confirm.
  treat_missing_data = "breaching"

  dimensions = {
    service = "api"
  }
}
|
|
|
|
# Console link to the dashboard named "<project>-<environment>-dashboard".
# The us-east-1 console host is hard-coded.
output "dashboard_url" {
  description = "CloudWatch dashboard URL"
  value = format(
    "https://us-east-1.console.aws.amazon.com/cloudwatch/home#dashboards/dashboard/%s-%s-dashboard",
    var.project_name,
    var.environment,
  )
}
|
|
|
|
# Exposes the ARN of the alerts topic so that callers of this module can
# reference it.
output "sns_topic_arn" {
  value       = aws_sns_topic.alerts.arn
  description = "SNS topic ARN for alerts"
}
|