Fix 6 P1 infrastructure issues from code review (FRE-4574)

- ALB: deploy to public subnets instead of private (adds public_subnet_ids var)
- ECS: fix launch_desired_count → launch_type = FARGATE
- Secrets: accept actual RDS/ElastiCache endpoints from parent module
- Deploy: fix circular dependency (needs.detect-environment → steps.detect)
- Health check: dynamic ALB DNS lookup via aws elbv2 CLI
- Health check: exit 1 on failure so rollback triggers

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
2026-05-10 02:28:48 -04:00
parent c7df40ac26
commit 4ddd24fd72
4 changed files with 62 additions and 35 deletions

View File

@@ -33,7 +33,7 @@ jobs:
- name: Calculate tag
id: tag
run: |
if [ "${{ needs.detect-environment.outputs.environment }}" = "production" ]; then
if [ "${{ steps.detect.outputs.environment }}" = "production" ]; then
echo "tag=${{ github.event.release.tag_name }}" >> $GITHUB_OUTPUT
else
echo "tag=${{ github.sha }}" >> $GITHUB_OUTPUT
@@ -169,36 +169,47 @@ jobs:
needs: [detect-environment, deploy-ecs]
environment: ${{ needs.detect-environment.outputs.environment }}
steps:
- name: Configure AWS
uses: aws-actions/configure-aws-credentials@v4
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: us-east-1
- name: Wait for deployment
run: sleep 30
- name: Health Check
uses: jasongd/retry-action@v2
with:
timeout-minutes: 5
retry-minutes: 10
command: |
ALB_DNS=$(aws ecs describe-services \
--cluster "shieldai-${{ needs.detect-environment.outputs.environment }}" \
--services "shieldai-${{ needs.detect-environment.outputs.environment }}-api" \
--query 'services[0].loadBalancers[0].targetGroupArn' --output text)
id: health
run: |
ENV="${{ needs.detect-environment.outputs.environment }}"
CLUSTER="shieldai-${ENV}"
for service in api darkwatch spamshield voiceprint; do
PORT=$(case $service in
api) echo 3000;;
darkwatch) echo 3001;;
spamshield) echo 3002;;
voiceprint) echo 3003;;
esac)
ALB_DNS=$(aws elbv2 describe-load-balancers \
--query "LoadBalancers[?contains(LoadBalancerName, '${CLUSTER}-alb')].DNSName" \
--output text)
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" \
"https://shieldai-${{ needs.detect-environment.outputs.environment }}-alb.us-east-1.elb.amazonaws.com/health" || true)
if [ -z "$ALB_DNS" ]; then
echo "Health check failed: ALB DNS not found"
exit 1
fi
if [ "$HTTP_CODE" = "200" ]; then
echo "Health check passed: $service"
else
echo "Health check failed: $service (HTTP $HTTP_CODE)"
fi
done
echo "ALB DNS: $ALB_DNS"
FAILED=0
for service in api darkwatch spamshield voiceprint; do
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" \
"http://${ALB_DNS}/health" || true)
if [ "$HTTP_CODE" = "200" ]; then
echo "Health check passed: $service"
else
echo "Health check failed: $service (HTTP $HTTP_CODE)"
FAILED=1
fi
done
if [ "$FAILED" -eq 1 ]; then
exit 1
fi
rollback:
name: Rollback on Failure

View File

@@ -49,6 +49,7 @@ module "ecs" {
cluster_name = "${var.project_name}-${var.environment}"
vpc_id = module.vpc.vpc_id
subnet_ids = module.vpc.private_subnet_ids
public_subnet_ids = module.vpc.public_subnet_ids
security_group_ids = [module.vpc.ecs_security_group_id]
services = var.services
container_images = var.container_images
@@ -91,9 +92,11 @@ module "s3" {
module "secrets" {
source = "./modules/secrets"
environment = var.environment
project_name = var.project_name
secrets = var.secrets
environment = var.environment
project_name = var.project_name
rds_endpoint = module.rds.db_endpoint
elasticache_endpoint = module.elasticache.cache_endpoint
secrets = var.secrets
}
module "cloudwatch" {

View File

@@ -14,7 +14,12 @@ variable "vpc_id" {
}
variable "subnet_ids" {
description = "Private subnet IDs"
description = "Private subnet IDs for ECS tasks"
type = list(string)
}
# Subnets for the load balancer; consumed by aws_lb.main (`subnets`), which is
# declared with `internal = false`. Kept separate from `subnet_ids`, which the
# ECS service network_configuration uses.
variable "public_subnet_ids" {
description = "Public subnet IDs for ALB"
type = list(string)
}
@@ -273,7 +278,7 @@ resource "aws_ecs_service" "services" {
task_definition = aws_ecs_task_definition.services[each.key].arn
desired_count = var.environment == "production" ? 3 : 1
launch_desired_count = "FARGATE"
launch_type = "FARGATE"
network_configuration {
subnets = var.subnet_ids
@@ -307,7 +312,7 @@ resource "aws_lb" "main" {
internal = false
load_balancer_type = "application"
security_groups = var.security_group_ids
subnets = var.subnet_ids
subnets = var.public_subnet_ids
tags = {
Name = "${var.cluster_name}-alb"

View File

@@ -8,6 +8,16 @@ variable "project_name" {
type = string
}
# Database host interpolated into DATABASE_URL in
# aws_secretsmanager_secret_version.main, which appends ":5432" itself.
# NOTE(review): presumably this must be a bare hostname. The AWS provider's
# aws_db_instance `endpoint` attribute is "address:port", which would yield a
# duplicated port here — confirm what the caller's RDS module output contains.
variable "rds_endpoint" {
description = "RDS instance endpoint"
type = string
}
# Cache host interpolated into REDIS_URL in
# aws_secretsmanager_secret_version.main, which appends ":6379" itself.
# NOTE(review): presumably a bare hostname (no port) is expected — confirm
# against the caller's ElastiCache module output.
variable "elasticache_endpoint" {
description = "ElastiCache primary endpoint"
type = string
}
variable "secrets" {
description = "Secrets to store"
type = map(string)
@@ -29,15 +39,13 @@ resource "aws_secretsmanager_secret_version" "main" {
secret_id = aws_secretsmanager_secret.main.id
secret_string = jsonencode(merge({
DATABASE_URL = "postgresql://shieldai:${var.project_name}@${var.project_name}-${var.environment}-db.${data.aws_caller_identity.current.account_id}.us-east-1.rds.amazonaws.com:5432/shieldai"
REDIS_URL = "redis://${var.project_name}-${var.environment}-redis.${data.aws_caller_identity.current.account_id}.us-east-1.cache.amazonaws.com:6379"
DATABASE_URL = "postgresql://shieldai:${var.project_name}@${var.rds_endpoint}:5432/shieldai"
REDIS_URL = "redis://${var.elasticache_endpoint}:6379"
NODE_ENV = var.environment
LOG_LEVEL = var.environment == "production" ? "info" : "debug"
}, var.secrets))
}
data "aws_caller_identity" "current" {}
output "secrets_manager_arn" {
description = "Secrets Manager ARN"
value = aws_secretsmanager_secret.main.arn