Files
ShieldAI/infra/modules/ecs/main.tf
Senior Engineer a0799c0647 Add Terraform AWS infrastructure and enhanced CI/CD pipeline (FRE-4574)
- Terraform modules: VPC, ECS Fargate, RDS PostgreSQL, ElastiCache Redis, S3, Secrets Manager, CloudWatch
- Multi-environment support: staging and production configs
- ECS auto-scaling: CPU-based scaling with configurable min/max
- CI/CD: pnpm caching, Docker Buildx, Trivy security scanning, Terraform plan on PR
- Deploy: ECS service updates with automatic rollback on health check failure
- Backup: automated RDS snapshots, S3 versioning, ElastiCache snapshots
- Monitoring: CloudWatch dashboards, CPU/memory/5xx alarms
- Rollback script for manual service rollback
- Infrastructure documentation with architecture overview
2026-05-08 02:54:39 -04:00

356 lines
7.9 KiB
HCL

variable "environment" {
description = "Deployment environment"
type = string
}
variable "cluster_name" {
description = "ECS cluster name"
type = string
}
variable "vpc_id" {
description = "VPC ID"
type = string
}
variable "subnet_ids" {
description = "Private subnet IDs"
type = list(string)
}
variable "security_group_ids" {
description = "Security group IDs"
type = list(string)
}
variable "services" {
description = "ECS services to deploy"
type = map(object({
cpu = number
memory = number
port = number
}))
}
variable "container_images" {
description = "Container image tags"
type = map(string)
}
variable "secrets_arn" {
description = "Secrets Manager ARN"
type = string
}
resource "aws_ecs_cluster" "main" {
name = var.cluster_name
settings {
name = "containerInsights"
value = "enabled"
}
tags = {
Name = var.cluster_name
}
}
resource "aws_ecs_cluster_capacity_providers" "main" {
cluster_name = aws_ecs_cluster.main.name
capacity_providers = ["FARGATE"]
default_capacity_provider_strategy {
base = 1
weight = 100
capacity_provider = "FARGATE"
}
}
resource "aws_ecs_task_definition" "services" {
for_each = var.services
family = "${var.cluster_name}-${each.key}"
container_definitions = jsonencode([
{
name = each.key
image = "ghcr.io/shieldai/shieldai-${each.key}:${var.container_images[each.key]}"
cpu = each.cpu
memory = each.memory
essential = true
portMappings = [
{
containerPort = each.port
hostPort = each.port
protocol = "tcp"
}
]
environment = [
{
name = "NODE_ENV"
value = var.environment
},
{
name = "PORT"
value = tostring(each.port)
}
]
secrets = [
{
name = "DATABASE_URL"
valueFrom = "${var.secrets_arn}:DATABASE_URL::"
},
{
name = "REDIS_URL"
valueFrom = "${var.secrets_arn}:REDIS_URL::"
},
{
name = "HIBP_API_KEY"
valueFrom = "${var.secrets_arn}:HIBP_API_KEY::"
},
{
name = "RESEND_API_KEY"
valueFrom = "${var.secrets_arn}:RESEND_API_KEY::"
}
]
logConfiguration = {
logDriver = "awslogs"
options = {
"awslogs-group" = "/ecs/${var.cluster_name}-${each.key}"
"awslogs-region" = "us-east-1"
"awslogs-stream-prefix" = each.key
}
}
healthCheck = {
command = ["CMD-SHELL", "wget -q --spider http://localhost:${each.port}/health || exit 1"]
interval = 30
timeout = 5
retries = 3
startPeriod = 60
}
}
])
network_mode = "awsvpc"
memory = each.memory
cpu = each.cpu
requires_compatibilities = ["FARGATE"]
execution_role_arn = aws_iam_role.execution[each.key].arn
task_role_arn = aws_iam_role.task[each.key].arn
tags = {
Name = "${var.cluster_name}-${each.key}"
}
}
resource "aws_iam_role" "execution" {
for_each = var.services
name = "${var.cluster_name}-${each.key}-execution"
assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Action = "sts:AssumeRole"
Effect = "Allow"
Principal = {
Service = "ecs-tasks.amazonaws.com"
}
}
]
})
managed_policy_arns = [
"arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
]
}
resource "aws_iam_role" "task" {
for_each = var.services
name = "${var.cluster_name}-${each.key}-task"
assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Action = "sts:AssumeRole"
Effect = "Allow"
Principal = {
Service = "ecs-tasks.amazonaws.com"
}
}
]
})
managed_policy_arns = [
"arn:aws:iam::aws:policy/SecretsManagerReadOnly"
]
inline_policy {
name = "elasticache-access"
policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Effect = "Allow"
Action = [
"elasticache:DescribeCacheClusters",
"elasticache:DescribeCacheSubnetGroups"
]
Resource = "*"
}
]
})
}
}
resource "aws_ecs_service" "services" {
for_each = var.services
name = "${var.cluster_name}-${each.key}"
cluster = aws_ecs_cluster.main.id
task_definition = aws_ecs_task_definition.services[each.key].arn
desired_count = var.environment == "production" ? 3 : 1
launch_desired_count = "FARGATE"
network_configuration {
subnets = var.subnet_ids
security_groups = var.security_group_ids
assign_public_ip = false
}
load_balancer {
target_group_arn = aws_lb_target_group.services[each.key].arn
container_name = each.key
container_port = each.port
}
auto_scaling {
max_capacity = var.environment == "production" ? 10 : 3
min_capacity = var.environment == "production" ? 2 : 1
}
tags = {
Name = "${var.cluster_name}-${each.key}"
Service = each.key
}
depends_on = [
aws_lb_listener.services
]
}
resource "aws_lb" "main" {
name = "${var.cluster_name}-alb"
internal = false
load_balancer_type = "application"
security_groups = var.security_group_ids
subnets = var.subnet_ids
tags = {
Name = "${var.cluster_name}-alb"
}
}
resource "aws_lb_target_group" "services" {
for_each = var.services
name = "${var.cluster_name}-${each.key}-tg"
port = each.port
protocol = "HTTP"
vpc_id = var.vpc_id
health_check {
enabled = true
healthy_threshold = 3
interval = 30
matcher = "200"
path = "/health"
port = "traffic-port"
protocol = "HTTP"
timeout = 5
unhealthy_threshold = 3
}
stickiness {
type = "lb_cookie"
cookie_duration = 86400
}
}
resource "aws_lb_listener" "services" {
for_each = var.services
load_balancer_arn = aws_lb.main.arn
port = 80
protocol = "HTTP"
default_action {
type = "forward"
target_group_arn = aws_lb_target_group.services[each.key].arn
}
}
resource "aws_appautoscaling_target" "services" {
for_each = var.services
service_namespace = "ecs"
resource_id = "service/${aws_ecs_cluster.main.name}/${aws_ecs_service.services[each.key].name}"
scalable_dimension = "ecs:service:DesiredCount"
min_capacity = var.environment == "production" ? 2 : 1
max_capacity = var.environment == "production" ? 10 : 3
}
resource "aws_appautoscaling_policy" "cpu" {
for_each = var.services
name = "${var.cluster_name}-${each.key}-cpu-scaling"
service_namespace = "ecs"
resource_id = "service/${aws_ecs_cluster.main.name}/${aws_ecs_service.services[each.key].name}"
scalable_dimension = "ecs:service:DesiredCount"
target_tracking_scaling_policy_configuration {
target_value = 70.0
scale_in_cooldown = 60
scale_out_cooldown = 30
customized_metric_specification {
metric_name = "CPUUtilization"
namespace = "AWS/ECS"
statistic = "Average"
dimensions = [{ name = "ClusterName", value = aws_ecs_cluster.main.name }]
}
}
}
resource "aws_cloudwatch_log_group" "services" {
for_each = var.services
name = "/ecs/${var.cluster_name}-${each.key}"
retention_in_days = var.environment == "production" ? 30 : 7
tags = {
Name = "${var.cluster_name}-${each.key}-logs"
}
}
output "cluster_arn" {
description = "ECS cluster ARN"
value = aws_ecs_cluster.main.arn
}
output "alb_dns_name" {
description = "ALB DNS name"
value = aws_lb.main.dns_name
}