# -----------------------------------------------------------------------------
# CloudWatch monitoring: SNS alert topic, an overview dashboard, metric alarms
# for ECS / ALB / RDS / application metrics, and log groups.
# -----------------------------------------------------------------------------

# ---------------------------------------------------------------- Variables --

variable "environment" {
  description = "Deployment environment"
  type        = string
}

variable "cluster_name" {
  description = "ECS cluster name"
  type        = string
}

variable "project_name" {
  description = "Project name"
  type        = string
}

variable "rds_identifier" {
  description = "RDS instance identifier"
  type        = string
}

# NOTE(review): not referenced by any resource in this file; kept so existing
# callers that pass it keep working.
variable "cache_endpoint" {
  description = "ElastiCache endpoint"
  type        = string
}

variable "alert_email" {
  description = "Email address for alert notifications"
  type        = string
  default     = "ops@shieldai.com"
}

variable "aws_region" {
  description = "AWS region used for dashboard widgets and the console URL"
  type        = string
  default     = "us-east-1"
}

variable "alb_arn_suffix" {
  description = "ALB ARN suffix (app/<name>/<id>) used as the CloudWatch LoadBalancer dimension. Falls back to \"<cluster_name>-alb\" when unset."
  type        = string
  default     = null
}

# ------------------------------------------------------------------- Locals --

locals {
  # Shared prefix for every named resource in this file.
  name_prefix = "${var.project_name}-${var.environment}"

  # Value of the AWS/ApplicationELB "LoadBalancer" dimension. CloudWatch
  # documents this as the final portion of the load balancer ARN
  # (app/<name>/<id>); the legacy "<cluster>-alb" value is kept as the default
  # for backward compatibility.
  # NOTE(review): confirm callers pass alb_arn_suffix, otherwise ALB widgets
  # and alarms will most likely match no metric.
  alb_dimension = coalesce(var.alb_arn_suffix, "${var.cluster_name}-alb")

  common_tags = {
    Environment = var.environment
    Project     = var.project_name
  }
}

# -------------------------------------------------------------------- Alerts --

resource "aws_sns_topic" "alerts" {
  name = "${local.name_prefix}-alerts"
  tags = local.common_tags
}

resource "aws_sns_topic_subscription" "alerts_email" {
  topic_arn = aws_sns_topic.alerts.arn
  protocol  = "email"
  endpoint  = var.alert_email
}

# ----------------------------------------------------------------- Dashboard --

resource "aws_cloudwatch_dashboard" "main" {
  dashboard_name = "${local.name_prefix}-dashboard"

  # In a widget's metrics array, everything after namespace and metric name is
  # read as dimension name/value pairs. Per-metric options such as the
  # statistic therefore go in a trailing object ({ stat = ... }), never as
  # extra string pairs.
  dashboard_body = jsonencode({
    widgets = [
      {
        type = "metric"
        properties = {
          title = "ECS CPU Utilization"
          metrics = [
            ["AWS/ECS", "CPUUtilization", "ClusterName", var.cluster_name]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          period  = 300
        }
      },
      {
        type = "metric"
        properties = {
          title = "ECS Memory Utilization"
          metrics = [
            ["AWS/ECS", "MemoryUtilization", "ClusterName", var.cluster_name]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          period  = 300
        }
      },
      {
        type = "metric"
        properties = {
          title = "RDS CPU Utilization"
          metrics = [
            ["AWS/RDS", "CPUUtilization", "DBInstanceIdentifier", var.rds_identifier]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          period  = 300
        }
      },
      {
        type = "metric"
        properties = {
          title = "ALB Request Count"
          metrics = [
            ["AWS/ApplicationELB", "RequestCount", "LoadBalancer", local.alb_dimension]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          period  = 60
          # Count metrics are only meaningful as Sum; the widget default is Average.
          stat = "Sum"
        }
      },
      {
        type = "metric"
        properties = {
          title = "ALB 5xx Errors"
          metrics = [
            # Metric names are case-sensitive: "ELB", not "Elb".
            ["AWS/ApplicationELB", "HTTPCode_ELB_5XX_Count", "LoadBalancer", local.alb_dimension]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          period  = 60
          stat    = "Sum"
        }
      },
      {
        type = "metric"
        properties = {
          title = "P99 Latency (Target Group)"
          metrics = [
            ["AWS/ApplicationELB", "TargetResponseTime", "LoadBalancer", local.alb_dimension, { stat = "p99" }],
            ["AWS/ApplicationELB", "TargetResponseTime", "LoadBalancer", local.alb_dimension, { stat = "p95" }]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          period  = 60
        }
      },
      {
        type = "metric"
        properties = {
          title = "Error Rate (5xx / Total)"
          # Metric math turns the raw counts into percentages of all requests.
          # FILL(..., 0) keeps the rate at 0% when no error datapoint exists.
          metrics = [
            [{ expression = "100 * FILL(m5xx, 0) / requests", label = "5xx rate (%)", id = "rate5xx" }],
            [{ expression = "100 * FILL(m4xx, 0) / requests", label = "4xx rate (%)", id = "rate4xx" }],
            ["AWS/ApplicationELB", "HTTPCode_ELB_5XX_Count", "LoadBalancer", local.alb_dimension, { id = "m5xx", stat = "Sum", visible = false }],
            ["AWS/ApplicationELB", "HTTPCode_ELB_4XX_Count", "LoadBalancer", local.alb_dimension, { id = "m4xx", stat = "Sum", visible = false }],
            ["AWS/ApplicationELB", "RequestCount", "LoadBalancer", local.alb_dimension, { id = "requests", stat = "Sum", visible = false }]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          period  = 60
        }
      },
      {
        type = "metric"
        properties = {
          title = "Throughput (Request Count)"
          metrics = [
            ["AWS/ApplicationELB", "RequestCount", "LoadBalancer", local.alb_dimension]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          period  = 60
          stat    = "Sum"
          yAxis = {
            left = {
              # Sum over a 60-second period is requests per minute.
              label = "Requests/min"
            }
          }
        }
      },
      {
        type = "metric"
        properties = {
          title = "API Latency Percentiles"
          # Dimensions (service, percentile) mirror the app_p99_latency_high alarm.
          metrics = [
            ["ShieldAI", "api_latency", "service", "api", "percentile", "p99", { stat = "Average" }],
            ["ShieldAI", "api_latency", "service", "api", "percentile", "p95", { stat = "Average" }],
            ["ShieldAI", "api_latency", "service", "api", "percentile", "p50", { stat = "Average" }]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          period  = 60
        }
      },
      {
        type = "metric"
        properties = {
          title = "API Error Rate"
          metrics = [
            ["ShieldAI", "api_errors", "service", "api", { stat = "Sum" }]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          period  = 60
        }
      },
      {
        type = "metric"
        properties = {
          title = "API Throughput"
          metrics = [
            ["ShieldAI", "api_requests", "service", "api", { stat = "Sum" }]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          period  = 60
        }
      },
      {
        type = "metric"
        properties = {
          title = "ECS Running Tasks"
          # AWS/ECS publishes no task-count metric. The cluster-level count is
          # TaskCount in ECS/ContainerInsights.
          # NOTE(review): requires Container Insights to be enabled on the cluster.
          metrics = [
            ["ECS/ContainerInsights", "TaskCount", "ClusterName", var.cluster_name]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          period  = 60
        }
      },
      {
        type = "metric"
        properties = {
          title = "RDS Read/Write IOPS"
          metrics = [
            ["AWS/RDS", "ReadIOPS", "DBInstanceIdentifier", var.rds_identifier],
            ["AWS/RDS", "WriteIOPS", "DBInstanceIdentifier", var.rds_identifier]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          period  = 60
        }
      }
    ]
  })
}

# ------------------------------------------------------ Infrastructure alarms --

resource "aws_cloudwatch_metric_alarm" "ecs_cpu_high" {
  alarm_name          = "${local.name_prefix}-ecs-cpu-high"
  alarm_description   = "ECS CPU utilization above 80%"
  namespace           = "AWS/ECS"
  metric_name         = "CPUUtilization"
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = 80
  alarm_actions       = [aws_sns_topic.alerts.arn]

  dimensions = {
    ClusterName = var.cluster_name
  }
}

resource "aws_cloudwatch_metric_alarm" "ecs_memory_high" {
  alarm_name          = "${local.name_prefix}-ecs-memory-high"
  alarm_description   = "ECS memory utilization above 85%"
  namespace           = "AWS/ECS"
  metric_name         = "MemoryUtilization"
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = 85
  alarm_actions       = [aws_sns_topic.alerts.arn]

  dimensions = {
    ClusterName = var.cluster_name
  }
}

resource "aws_cloudwatch_metric_alarm" "alb_5xx" {
  alarm_name        = "${local.name_prefix}-alb-5xx"
  alarm_description = "ALB 5xx errors above 10 per minute"
  namespace         = "AWS/ApplicationELB"
  # Case-sensitive metric name ("ELB"); the previous "Elb" spelling matched nothing.
  metric_name         = "HTTPCode_ELB_5XX_Count"
  statistic           = "Sum"
  period              = 60
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = 10
  alarm_actions       = [aws_sns_topic.alerts.arn]

  dimensions = {
    LoadBalancer = local.alb_dimension
  }
}

resource "aws_cloudwatch_metric_alarm" "rds_cpu_high" {
  alarm_name          = "${local.name_prefix}-rds-cpu-high"
  alarm_description   = "RDS CPU utilization above 75%"
  namespace           = "AWS/RDS"
  metric_name         = "CPUUtilization"
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = 75
  alarm_actions       = [aws_sns_topic.alerts.arn]

  dimensions = {
    DBInstanceIdentifier = var.rds_identifier
  }
}

resource "aws_cloudwatch_metric_alarm" "rds_free_storage" {
  alarm_name          = "${local.name_prefix}-rds-free-storage"
  alarm_description   = "RDS free storage below 500MB"
  namespace           = "AWS/RDS"
  metric_name         = "FreeStorageSpace"
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "LessThanThreshold"
  # FreeStorageSpace is reported in bytes: 500 * 1024 * 1024.
  threshold     = 524288000
  alarm_actions = [aws_sns_topic.alerts.arn]

  dimensions = {
    DBInstanceIdentifier = var.rds_identifier
  }
}

resource "aws_cloudwatch_metric_alarm" "p99_latency_high" {
  alarm_name        = "${local.name_prefix}-p99-latency-high"
  alarm_description = "P99 latency above 2 seconds"
  namespace         = "AWS/ApplicationELB"
  metric_name       = "TargetResponseTime"
  # Percentiles must use extended_statistic; "statistic" only accepts
  # SampleCount, Average, Sum, Minimum and Maximum.
  extended_statistic  = "p99"
  period              = 60
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = 2
  alarm_actions       = [aws_sns_topic.alerts.arn]

  dimensions = {
    LoadBalancer = local.alb_dimension
  }
}

resource "aws_cloudwatch_metric_alarm" "error_rate_high" {
  alarm_name          = "${local.name_prefix}-error-rate-high"
  alarm_description   = "Error rate above 5 errors per minute"
  namespace           = "AWS/ApplicationELB"
  metric_name         = "HTTPCode_ELB_5XX_Count"
  statistic           = "Sum"
  period              = 60
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = 5
  alarm_actions       = [aws_sns_topic.alerts.arn]

  dimensions = {
    LoadBalancer = local.alb_dimension
  }
}

resource "aws_cloudwatch_metric_alarm" "throughput_low" {
  alarm_name          = "${local.name_prefix}-throughput-low"
  alarm_description   = "Throughput below 10 requests per minute"
  namespace           = "AWS/ApplicationELB"
  metric_name         = "RequestCount"
  statistic           = "Sum"
  period              = 60
  evaluation_periods  = 5
  comparison_operator = "LessThanThreshold"
  threshold           = 10
  # No datapoint is published when there are no requests, so missing data has
  # to count as breaching or a complete traffic outage never raises the alarm.
  treat_missing_data = "breaching"
  alarm_actions      = [aws_sns_topic.alerts.arn]

  dimensions = {
    LoadBalancer = local.alb_dimension
  }
}

# ---------------------------------------------------------------- Log groups --

resource "aws_cloudwatch_log_group" "api" {
  name              = "/${var.project_name}/${var.environment}/api"
  retention_in_days = 30
  tags              = merge(local.common_tags, { Service = "api" })
}

resource "aws_cloudwatch_log_group" "datadog" {
  name              = "/${var.project_name}/${var.environment}/datadog"
  retention_in_days = 30
  tags              = merge(local.common_tags, { Service = "datadog" })
}

resource "aws_cloudwatch_log_group" "sentry" {
  name              = "/${var.project_name}/${var.environment}/sentry"
  retention_in_days = 30
  tags              = merge(local.common_tags, { Service = "sentry" })
}

# --------------------------------------------------------- Application alarms --

resource "aws_cloudwatch_metric_alarm" "app_p99_latency_high" {
  alarm_name          = "${local.name_prefix}-app-p99-latency-high"
  alarm_description   = "Application P99 latency above 2000ms"
  namespace           = "ShieldAI"
  metric_name         = "api_latency"
  statistic           = "Average"
  period              = 60
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = 2000
  alarm_actions       = [aws_sns_topic.alerts.arn]

  # The percentile is a dimension of the custom metric; the alarm averages the
  # datapoints of that series.
  dimensions = {
    service    = "api"
    percentile = "p99"
  }
}

resource "aws_cloudwatch_metric_alarm" "app_error_rate_high" {
  alarm_name          = "${local.name_prefix}-app-error-rate-high"
  alarm_description   = "Application error count above 10 per minute"
  namespace           = "ShieldAI"
  metric_name         = "api_errors"
  statistic           = "Sum"
  period              = 60
  evaluation_periods  = 3
  comparison_operator = "GreaterThanThreshold"
  threshold           = 10
  alarm_actions       = [aws_sns_topic.alerts.arn]

  dimensions = {
    service = "api"
  }
}

resource "aws_cloudwatch_metric_alarm" "app_throughput_low" {
  alarm_name          = "${local.name_prefix}-app-throughput-low"
  alarm_description   = "Application throughput below 10 requests per minute"
  namespace           = "ShieldAI"
  metric_name         = "api_requests"
  statistic           = "Sum"
  period              = 60
  evaluation_periods  = 5
  comparison_operator = "LessThanThreshold"
  threshold           = 10
  # Missing datapoints count as breaching, so an application that stops
  # publishing api_requests still raises this alarm.
  treat_missing_data = "breaching"
  alarm_actions      = [aws_sns_topic.alerts.arn]

  dimensions = {
    service = "api"
  }
}

# ------------------------------------------------------------------- Outputs --

output "dashboard_url" {
  description = "CloudWatch dashboard URL"
  # Reads the name from the resource so the output tracks the real dashboard
  # and is ordered after its creation.
  value = "https://${var.aws_region}.console.aws.amazon.com/cloudwatch/home#dashboards/dashboard/${aws_cloudwatch_dashboard.main.dashboard_name}"
}

output "sns_topic_arn" {
  description = "SNS topic ARN for alerts"
  value       = aws_sns_topic.alerts.arn
}