Files
ShieldAI/infra/modules/ecs/main.tf
Michael Freno 7b925c89bd Fix 3 Code Review findings on FRE-4574
- P2: Replace wget with curl for ECS health check (Alpine lacks wget)
- P2: Add AWS credentials step to CI terraform-plan job for S3 backend auth
- P3: Remove unused GitHub provider from infra/main.tf

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-05-10 07:09:39 -04:00

520 lines
12 KiB
HCL

# Deployment environment name. Within this module the literal value
# "production" selects larger task counts, scaling bounds and log retention.
variable "environment" {
  type        = string
  description = "Deployment environment"
}
# Name of the ECS cluster; also used as the prefix for most resource names
# and tags created by this module.
variable "cluster_name" {
  type        = string
  description = "ECS cluster name"
}
# VPC in which the load balancer target groups are created.
variable "vpc_id" {
  type        = string
  description = "VPC ID"
}
# Subnets handed to the ECS services' network configuration.
variable "subnet_ids" {
  type        = list(string)
  description = "Private subnet IDs for ECS tasks"
}
# Subnets the application load balancer is placed in.
variable "public_subnet_ids" {
  type        = list(string)
  description = "Public subnet IDs for ALB"
}
# Security groups attached to the ECS services' task network interfaces.
variable "security_group_ids" {
  type        = list(string)
  description = "Security group IDs"
}
# Security group attached to the application load balancer.
variable "alb_security_group_id" {
  type        = string
  description = "ALB security group ID"
}
# Map of service name => sizing and port. One task definition, ECS service,
# target group, IAM role pair, log group and scaling target is created per key.
# The HTTPS listener's default action looks up the key "api" directly.
variable "services" {
  type = map(object({
    cpu    = number # CPU units for the task and its container
    memory = number # memory for the task and its container
    port   = number # container port, also used as the target group port
  }))
  description = "ECS services to deploy"
}
# Map of service name => image tag. It is indexed with the same keys as
# var.services, so every service key needs an entry here.
variable "container_images" {
  type        = map(string)
  description = "Container image tags"
}
# ARN of the Secrets Manager secret whose JSON keys are injected into the
# containers and which the task roles are allowed to read.
variable "secrets_arn" {
  type        = string
  description = "Secrets Manager ARN"
}
# ARN used as the resource of the task role's ElastiCache describe policy.
variable "cache_cluster_arn" {
  type        = string
  description = "ElastiCache replication group ARN"
}
# Name of the Route53 hosted zone that receives the ACM DNS validation records.
variable "domain_name" {
  type        = string
  default     = "shieldai.app"
  description = "Route53 hosted zone domain for ACM cert validation"
}
# ECS cluster that hosts every service in var.services, with CloudWatch
# Container Insights turned on.
resource "aws_ecs_cluster" "main" {
  name = var.cluster_name

  # The nested block is named `setting` (singular); `settings` is rejected
  # by the AWS provider as an unsupported block type.
  setting {
    name  = "containerInsights"
    value = "enabled"
  }

  tags = {
    Name = var.cluster_name
  }
}
# Registers FARGATE as the cluster's only capacity provider and makes it the
# default strategy.
resource "aws_ecs_cluster_capacity_providers" "main" {
  capacity_providers = ["FARGATE"]
  cluster_name       = aws_ecs_cluster.main.name

  default_capacity_provider_strategy {
    capacity_provider = "FARGATE"
    weight            = 100
    base              = 1
  }
}
# One Fargate task definition per entry in var.services.
#
# Each task runs a single essential container whose image tag comes from
# var.container_images[<service>]. Plain settings are passed as environment
# variables; sensitive values are pulled from JSON keys of the Secrets Manager
# secret in var.secrets_arn.
#
# Inside a for_each resource the element's object is `each.value`; the
# cpu/memory/port fields therefore have to be read as `each.value.<field>`
# (`each.cpu` etc. is not a valid reference).
resource "aws_ecs_task_definition" "services" {
  for_each = var.services

  family = "${var.cluster_name}-${each.key}"

  container_definitions = jsonencode([
    {
      name      = each.key
      image     = "ghcr.io/shieldai/shieldai-${each.key}:${var.container_images[each.key]}"
      cpu       = each.value.cpu
      memory    = each.value.memory
      essential = true

      portMappings = [
        {
          containerPort = each.value.port
          hostPort      = each.value.port
          protocol      = "tcp"
        }
      ]

      environment = [
        {
          name  = "NODE_ENV"
          value = var.environment
        },
        {
          name  = "PORT"
          value = tostring(each.value.port)
        },
        {
          name  = "DD_ENV"
          value = var.environment
        },
        {
          name  = "DD_SERVICE"
          value = "${var.cluster_name}-${each.key}"
        },
        {
          name  = "DD_VERSION"
          value = var.container_images[each.key]
        },
        {
          name  = "DD_TRACE_ENABLED"
          value = "true"
        },
        {
          name  = "DD_LOGS_INJECTION"
          value = "true"
        },
        {
          name  = "DD_AGENT_HOST"
          value = "localhost"
        },
        {
          name  = "DD_AGENT_PORT"
          value = "8126"
        },
        {
          name  = "SENTRY_ENVIRONMENT"
          value = var.environment
        },
        {
          name  = "SENTRY_RELEASE"
          value = var.container_images[each.key]
        },
        {
          name  = "AWS_REGION"
          value = "us-east-1"
        },
        {
          name  = "DD_SITE"
          value = "datadoghq.com"
        }
      ]

      # "<secret-arn>:<json-key>::" selects one JSON key of the secret.
      secrets = [
        {
          name      = "DATABASE_URL"
          valueFrom = "${var.secrets_arn}:DATABASE_URL::"
        },
        {
          name      = "REDIS_URL"
          valueFrom = "${var.secrets_arn}:REDIS_URL::"
        },
        {
          name      = "HIBP_API_KEY"
          valueFrom = "${var.secrets_arn}:HIBP_API_KEY::"
        },
        {
          name      = "RESEND_API_KEY"
          valueFrom = "${var.secrets_arn}:RESEND_API_KEY::"
        },
        {
          name      = "SENTRY_DSN"
          valueFrom = "${var.secrets_arn}:SENTRY_DSN::"
        },
        {
          name      = "DD_API_KEY"
          valueFrom = "${var.secrets_arn}:DD_API_KEY::"
        }
      ]

      logConfiguration = {
        logDriver = "awslogs"
        options = {
          # Referencing the log group resource (same name as before) makes
          # Terraform create the group before this task definition.
          "awslogs-group"         = aws_cloudwatch_log_group.services[each.key].name
          "awslogs-region"        = "us-east-1"
          "awslogs-stream-prefix" = each.key
        }
      }

      # Container-level health check; requires curl inside the image.
      healthCheck = {
        command     = ["CMD-SHELL", "curl -f http://localhost:${each.value.port}/health || exit 1"]
        interval    = 30
        timeout     = 5
        retries     = 3
        startPeriod = 60
      }
    }
  ])

  network_mode             = "awsvpc"
  memory                   = each.value.memory
  cpu                      = each.value.cpu
  requires_compatibilities = ["FARGATE"]
  execution_role_arn       = aws_iam_role.execution[each.key].arn
  task_role_arn            = aws_iam_role.task[each.key].arn

  tags = {
    Name = "${var.cluster_name}-${each.key}"
  }
}
# Task execution role, one per service. ECS assumes this role to pull the
# image, write logs and resolve the container definition's `secrets` before
# the container starts.
resource "aws_iam_role" "execution" {
  for_each = var.services

  name = "${var.cluster_name}-${each.key}-execution"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          Service = "ecs-tasks.amazonaws.com"
        }
      }
    ]
  })

  managed_policy_arns = [
    "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
  ]

  # Secrets referenced through `secrets`/`valueFrom` in the task definition
  # are fetched with the execution role, not the task role, and the managed
  # policy above does not grant Secrets Manager access. Without this
  # statement tasks fail to start when ECS tries to read the secret.
  # NOTE(review): if the secret is encrypted with a customer-managed KMS key,
  # kms:Decrypt on that key is needed as well — confirm.
  inline_policy {
    name = "secrets-manager-read"
    policy = jsonencode({
      Version = "2012-10-17"
      Statement = [
        {
          Effect   = "Allow"
          Action   = ["secretsmanager:GetSecretValue"]
          Resource = var.secrets_arn
        }
      ]
    })
  }
}
# Task role, one per service: the identity the application code runs as.
# It may read/describe the secret in var.secrets_arn and call two ElastiCache
# describe actions scoped to var.cache_cluster_arn.
resource "aws_iam_role" "task" {
  for_each = var.services

  name = "${var.cluster_name}-${each.key}-task"

  # Trust policy: only ECS tasks may assume this role.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect    = "Allow"
        Principal = { Service = "ecs-tasks.amazonaws.com" }
        Action    = "sts:AssumeRole"
      }
    ]
  })

  inline_policy {
    name = "secrets-manager-access"
    policy = jsonencode({
      Version = "2012-10-17"
      Statement = [
        {
          Effect   = "Allow"
          Resource = var.secrets_arn
          Action = [
            "secretsmanager:GetSecretValue",
            "secretsmanager:DescribeSecret"
          ]
        }
      ]
    })
  }

  # NOTE(review): confirm both Describe* actions accept the ARN passed in
  # var.cache_cluster_arn as their resource; if not, this grant has no effect.
  inline_policy {
    name = "elasticache-access"
    policy = jsonencode({
      Version = "2012-10-17"
      Statement = [
        {
          Effect   = "Allow"
          Resource = var.cache_cluster_arn
          Action = [
            "elasticache:DescribeCacheClusters",
            "elasticache:DescribeCacheSubnetGroups"
          ]
        }
      ]
    })
  }
}
# One Fargate ECS service per entry in var.services, registered with that
# service's ALB target group.
#
# Scaling bounds are not configured here: aws_ecs_service has no
# `auto_scaling` block. They are set by aws_appautoscaling_target.services.
resource "aws_ecs_service" "services" {
  for_each = var.services

  name            = "${var.cluster_name}-${each.key}"
  cluster         = aws_ecs_cluster.main.id
  task_definition = aws_ecs_task_definition.services[each.key].arn
  launch_type     = "FARGATE"

  # Initial task count only; see the lifecycle block below.
  desired_count = var.environment == "production" ? 3 : 1

  network_configuration {
    subnets          = var.subnet_ids
    security_groups  = var.security_group_ids
    assign_public_ip = false
  }

  load_balancer {
    target_group_arn = aws_lb_target_group.services[each.key].arn
    container_name   = each.key
    # The element's fields live under each.value (each.port is invalid).
    container_port = each.value.port
  }

  # Application Auto Scaling changes the running count after creation;
  # ignoring it stops every later apply from resetting the count.
  lifecycle {
    ignore_changes = [desired_count]
  }

  tags = {
    Name    = "${var.cluster_name}-${each.key}"
    Service = each.key
  }

  # A target group must be attached to the load balancer before a service can
  # use it. The "api" group is attached by the HTTPS listener's default
  # action, every other group by its listener rule.
  depends_on = [
    aws_lb_listener.https,
    aws_lb_listener_rule.services,
  ]
}
# Internet-facing application load balancer in front of all services.
resource "aws_lb" "main" {
  name               = format("%s-alb", var.cluster_name)
  load_balancer_type = "application"
  internal           = false

  subnets         = var.public_subnet_ids
  security_groups = [var.alb_security_group_id]

  tags = {
    Name = format("%s-alb", var.cluster_name)
  }
}
# DNS-validated ACM certificate for <cluster>.<environment>.<domain>.
resource "aws_acm_certificate" "main" {
  # Built from var.domain_name so the certificate name always falls inside
  # the hosted zone that receives the validation records. With the default
  # domain_name the result is identical to the previous hard-coded name.
  domain_name       = "${var.cluster_name}.${var.environment}.${var.domain_name}"
  validation_method = "DNS"

  # A certificate attached to a listener cannot be deleted first; create the
  # replacement before destroying the old one.
  lifecycle {
    create_before_destroy = true
  }

  tags = {
    Name = "${var.cluster_name}-cert"
  }
}
# Looks up the hosted zone named var.domain_name; the ACM validation
# records are written into it.
data "aws_route53_zone" "main" {
  name = var.domain_name
}
# DNS records that prove ownership of the certificate's domain(s).
# One record per validation option, keyed by the domain being validated;
# options without a record name are skipped.
resource "aws_route53_record" "acm_validation" {
  for_each = {
    for dvo in aws_acm_certificate.main.domain_validation_options :
    dvo.domain_name => dvo
    if dvo.resource_record_name != null
  }

  zone_id = data.aws_route53_zone.main.zone_id
  ttl     = 60

  name    = each.value.resource_record_name
  type    = each.value.resource_record_type
  records = [each.value.resource_record_value]
}
# Waits until ACM has validated the certificate through the DNS records.
resource "aws_acm_certificate_validation" "main" {
  certificate_arn = aws_acm_certificate.main.arn

  # aws_route53_record.acm_validation uses for_each, so it is a map of
  # objects: the [*] splat does not apply to it, and wrapping the result in
  # [...] would nest a list inside a list. A for expression yields the flat
  # list of FQDNs this argument expects.
  validation_record_fqdns = [
    for record in aws_route53_record.acm_validation : record.fqdn
  ]
}
# One HTTP target group per service, health-checked on GET /health.
resource "aws_lb_target_group" "services" {
  for_each = var.services

  name     = "${var.cluster_name}-${each.key}-tg"
  port     = each.value.port # fields live under each.value (each.port is invalid)
  protocol = "HTTP"
  vpc_id   = var.vpc_id

  # Tasks using the awsvpc network mode (required on Fargate) register by
  # ENI IP address. The default target type "instance" is rejected when the
  # ECS service is attached.
  target_type = "ip"

  health_check {
    enabled             = true
    healthy_threshold   = 3
    interval            = 30
    matcher             = "200"
    path                = "/health"
    port                = "traffic-port"
    protocol            = "HTTP"
    timeout             = 5
    unhealthy_threshold = 3
  }

  # Pin a client to the same task for 24 hours via an ALB-generated cookie.
  stickiness {
    type            = "lb_cookie"
    cookie_duration = 86400
  }
}
# HTTPS listener. Requests that match no listener rule go to the "api"
# service, so var.services must contain an "api" key.
resource "aws_lb_listener" "https" {
  load_balancer_arn = aws_lb.main.arn
  port              = 443
  protocol          = "HTTPS"

  # The argument is `certificate_arn`; `ssl_certificate_arn` is not part of
  # the aws_lb_listener schema. Reading the ARN from the validation resource
  # delays listener creation until the certificate has been issued.
  certificate_arn = aws_acm_certificate_validation.main.certificate_arn

  default_action {
    type             = "forward"
    target_group_arn = aws_lb_target_group.services["api"].arn
  }
}
# Path-based routing for every service except "api" (which is the listener's
# default action): /<service> and /<service>/* go to that service's target group.
resource "aws_lb_listener_rule" "services" {
  for_each = {
    for svc_name, svc in var.services : svc_name => svc
    if svc_name != "api"
  }

  listener_arn = aws_lb_listener.https.arn

  condition {
    path_pattern {
      values = [format("/%s/*", each.key), format("/%s", each.key)]
    }
  }

  action {
    target_group_arn = aws_lb_target_group.services[each.key].arn
    type             = "forward"
  }
}
# Plain-HTTP listener whose only job is a permanent (301) redirect to HTTPS.
resource "aws_lb_listener" "http_redirect" {
  load_balancer_arn = aws_lb.main.arn
  protocol          = "HTTP"
  port              = 80

  default_action {
    type = "redirect"

    redirect {
      status_code = "HTTP_301"
      protocol    = "HTTPS"
      port        = "443"
    }
  }
}
# Registers each ECS service's desired count as a scalable target.
# Bounds: 2-10 tasks in "production", 1-3 tasks in any other environment.
resource "aws_appautoscaling_target" "services" {
  for_each = var.services

  service_namespace  = "ecs"
  scalable_dimension = "ecs:service:DesiredCount"
  resource_id = join("/", [
    "service",
    aws_ecs_cluster.main.name,
    aws_ecs_service.services[each.key].name,
  ])

  max_capacity = var.environment == "production" ? 10 : 3
  min_capacity = var.environment == "production" ? 2 : 1
}
# Target-tracking policy that keeps each service's average CPU near 70%.
resource "aws_appautoscaling_policy" "cpu" {
  for_each = var.services

  name = "${var.cluster_name}-${each.key}-cpu-scaling"

  # policy_type defaults to "StepScaling", which does not accept a
  # target_tracking_scaling_policy_configuration block.
  policy_type = "TargetTrackingScaling"

  # Read the identifiers from the scalable target so the policy is created
  # only after the target has been registered.
  service_namespace  = aws_appautoscaling_target.services[each.key].service_namespace
  resource_id        = aws_appautoscaling_target.services[each.key].resource_id
  scalable_dimension = aws_appautoscaling_target.services[each.key].scalable_dimension

  target_tracking_scaling_policy_configuration {
    target_value       = 70.0
    scale_in_cooldown  = 60
    scale_out_cooldown = 30

    # The predefined per-service CPU metric replaces the custom metric, which
    # wrote `dimensions` as a list attribute (it is a nested block) and only
    # set ClusterName, so each service would have tracked cluster-wide CPU.
    predefined_metric_specification {
      predefined_metric_type = "ECSServiceAverageCPUUtilization"
    }
  }
}
# Account, region and partition of the current provider, used only to build
# the key policy below. The "logs_kms" names are specific to this key so they
# do not clash with similarly-typed data sources elsewhere in the module.
data "aws_caller_identity" "logs_kms" {}
data "aws_partition" "logs_kms" {}
# NOTE(review): newer AWS provider majors deprecate the `name` attribute of
# aws_region in favour of `region`; switch when the provider pin allows it.
data "aws_region" "logs_kms" {}

# KMS key used to encrypt the services' CloudWatch log groups.
#
# CloudWatch Logs can only use a customer-managed key when the key policy
# grants the regional logs service principal access. The default key policy
# does not, so creating a log group with kms_key_id set would be rejected.
resource "aws_kms_key" "logs" {
  description             = "${var.cluster_name} logs encryption key"
  deletion_window_in_days = 7
  enable_key_rotation     = true

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        # Same grant as the default key policy: keeps the key manageable
        # through IAM policies in the owning account.
        Sid    = "EnableAccountAdministration"
        Effect = "Allow"
        Principal = {
          AWS = "arn:${data.aws_partition.logs_kms.partition}:iam::${data.aws_caller_identity.logs_kms.account_id}:root"
        }
        Action   = "kms:*"
        Resource = "*"
      },
      {
        # Lets CloudWatch Logs use the key, restricted to this cluster's
        # /ecs/<cluster>-* log groups via the encryption context.
        Sid    = "AllowCloudWatchLogs"
        Effect = "Allow"
        Principal = {
          Service = "logs.${data.aws_region.logs_kms.name}.amazonaws.com"
        }
        Action = [
          "kms:Encrypt*",
          "kms:Decrypt*",
          "kms:ReEncrypt*",
          "kms:GenerateDataKey*",
          "kms:Describe*"
        ]
        Resource = "*"
        Condition = {
          ArnLike = {
            "kms:EncryptionContext:aws:logs:arn" = "arn:${data.aws_partition.logs_kms.partition}:logs:${data.aws_region.logs_kms.name}:${data.aws_caller_identity.logs_kms.account_id}:log-group:/ecs/${var.cluster_name}-*"
          }
        }
      }
    ]
  })

  tags = {
    Name = "${var.cluster_name}-logs-kms"
  }
}
# One KMS-encrypted log group per service, named /ecs/<cluster>-<service>.
# Retention is 30 days in "production" and 7 days in any other environment.
resource "aws_cloudwatch_log_group" "services" {
  for_each = var.services

  name              = format("/ecs/%s-%s", var.cluster_name, each.key)
  kms_key_id        = aws_kms_key.logs.arn
  retention_in_days = var.environment == "production" ? 30 : 7

  tags = {
    Name = format("%s-%s-logs", var.cluster_name, each.key)
  }
}
# ARN of the ECS cluster created by this module.
output "cluster_arn" {
  value       = aws_ecs_cluster.main.arn
  description = "ECS cluster ARN"
}
# DNS name assigned to the application load balancer.
output "alb_dns_name" {
  value       = aws_lb.main.dns_name
  description = "ALB DNS name"
}
# ARN of the KMS key that encrypts the services' log groups.
output "kms_key_arn" {
  value       = aws_kms_key.logs.arn
  description = "KMS key ARN for log encryption"
}