Add Terraform AWS infrastructure and enhanced CI/CD pipeline (FRE-4574)

- Terraform modules: VPC, ECS Fargate, RDS PostgreSQL, ElastiCache Redis, S3, Secrets Manager, CloudWatch
- Multi-environment support: staging and production configs
- ECS auto-scaling: CPU-based scaling with configurable min/max
- CI/CD: pnpm caching, Docker Buildx, Trivy security scanning, Terraform plan on PR
- Deploy: ECS service updates with automatic rollback on health check failure
- Backup: automated RDS snapshots, S3 versioning, ElastiCache snapshots
- Monitoring: CloudWatch dashboards, CPU/memory/5xx alarms
- Rollback script for manual service rollback
- Infrastructure documentation with architecture overview
This commit is contained in:
Senior Engineer
2026-05-08 02:54:39 -04:00
committed by Michael Freno
parent baa216d62c
commit a0799c0647
19 changed files with 1902 additions and 45 deletions

View File

@@ -24,11 +24,14 @@ jobs:
uses: actions/setup-node@v4
with:
node-version: ${{ env.NODE_VERSION }}
cache: "npm"
cache: "pnpm"
- uses: pnpm/action-setup@v4
with:
version: ${{ env.PNPM_VERSION }}
- name: Install dependencies
run: npm ci
run: pnpm install --frozen-lockfile
- name: Run linter
run: npm run lint
run: pnpm lint
typecheck:
name: Type Check
@@ -39,11 +42,14 @@ jobs:
uses: actions/setup-node@v4
with:
node-version: ${{ env.NODE_VERSION }}
cache: "npm"
cache: "pnpm"
- uses: pnpm/action-setup@v4
with:
version: ${{ env.PNPM_VERSION }}
- name: Install dependencies
run: npm ci
run: pnpm install --frozen-lockfile
- name: Build all packages
run: npm run build
run: pnpm build
test:
name: Test Suite
@@ -77,15 +83,14 @@ jobs:
uses: actions/setup-node@v4
with:
node-version: ${{ env.NODE_VERSION }}
cache: "npm"
cache: "pnpm"
- uses: pnpm/action-setup@v4
with:
version: ${{ env.PNPM_VERSION }}
- name: Install dependencies
run: npm ci
- name: Generate Prisma client
run: npx prisma generate --schema=packages/db/prisma/schema.prisma
env:
DATABASE_URL: "postgresql://shieldai:shieldai_dev@localhost:5432/shieldai"
run: pnpm install --frozen-lockfile
- name: Run tests with coverage
run: npm run test:coverage
run: pnpm test:coverage
env:
DATABASE_URL: "postgresql://shieldai:shieldai_dev@localhost:5432/shieldai"
REDIS_URL: "redis://localhost:6379"
@@ -100,8 +105,9 @@ jobs:
docker-build:
name: Docker Build
runs-on: ubuntu-latest
needs: [lint, typecheck]
needs: [lint, typecheck, test]
strategy:
fail-fast: false
matrix:
include:
- name: api
@@ -118,6 +124,8 @@ jobs:
dockerfile: services/voiceprint/Dockerfile
steps:
- uses: actions/checkout@v4
- name: Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Build Docker image
uses: docker/build-push-action@v5
with:
@@ -127,3 +135,45 @@ jobs:
tags: shieldai-${{ matrix.name }}:${{ github.sha }}
cache-from: type=gha
cache-to: type=gha,mode=max
security-scan:
  name: Security Scan
  runs-on: ubuntu-latest
  needs: [lint]
  steps:
    - uses: actions/checkout@v4
    # pnpm is not preinstalled on GitHub-hosted runners; the original job ran
    # `pnpm audit` without installing it and would fail with "command not found".
    - uses: pnpm/action-setup@v4
      with:
        version: ${{ env.PNPM_VERSION }}
    - name: Setup Node
      uses: actions/setup-node@v4
      with:
        node-version: ${{ env.NODE_VERSION }}
    - name: Run pnpm audit
      run: pnpm audit --prod
      continue-on-error: true # advisory only; Trivy below is the gating scan
    - name: Trivy filesystem scan
      # NOTE(review): @master is a moving target — consider pinning a release tag.
      uses: aquasecurity/trivy-action@master
      with:
        scan-type: fs
        scan-ref: "."
        format: table
        exit-code: 1
        ignore-unfixed: true
        severity: CRITICAL,HIGH
terraform-plan:
  name: Terraform Plan
  runs-on: ubuntu-latest
  needs: [lint]
  if: github.event_name == 'pull_request'
  # infra/ uses an S3 backend, so `terraform init` (and plan) need AWS
  # credentials; the original job provided none and init would fail.
  env:
    AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
    AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
    AWS_DEFAULT_REGION: us-east-1
  steps:
    - uses: actions/checkout@v4
    - name: Setup Terraform
      uses: hashicorp/setup-terraform@v3
      with:
        terraform_version: "~> 1.5"
    - name: Terraform Format
      working-directory: infra
      # -recursive also checks modules/ and environments/
      run: terraform fmt -check -diff -recursive
    - name: Terraform Init
      working-directory: infra
      run: terraform init
    - name: Terraform Validate
      working-directory: infra
      run: terraform validate
    - name: Terraform Plan
      working-directory: infra
      run: terraform plan -var-file=environments/staging/terraform.tfvars.example -no-color
      env:
        TF_VAR_hibp_api_key: ${{ secrets.HIBP_API_KEY }}
        TF_VAR_resend_api_key: ${{ secrets.RESEND_API_KEY }}

View File

@@ -12,6 +12,7 @@ concurrency:
env:
NODE_VERSION: "20"
PNPM_VERSION: "9"
jobs:
detect-environment:
@@ -19,6 +20,7 @@ jobs:
runs-on: ubuntu-latest
outputs:
environment: ${{ steps.detect.outputs.environment }}
tag: ${{ steps.tag.outputs.tag }}
steps:
- name: Detect deployment target
id: detect
@@ -28,13 +30,59 @@ jobs:
else
echo "environment=staging" >> $GITHUB_OUTPUT
fi
- name: Calculate tag
id: tag
run: |
if [ "${{ needs.detect-environment.outputs.environment }}" = "production" ]; then
echo "tag=${{ github.event.release.tag_name }}" >> $GITHUB_OUTPUT
else
echo "tag=${{ github.sha }}" >> $GITHUB_OUTPUT
fi
terraform-apply:
  name: Terraform Apply
  runs-on: ubuntu-latest
  needs: detect-environment
  environment: ${{ needs.detect-environment.outputs.environment }}
  # Credentials at the JOB level: in the original, the env block was attached
  # to the final apply step only, so `terraform init` (S3 backend) and `plan`
  # ran without AWS access and failed.
  # Secrets are passed as TF_VAR_* environment variables rather than -var CLI
  # flags so they never appear in the process list or shell trace output.
  env:
    AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
    AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
    AWS_DEFAULT_REGION: us-east-1
    TF_VAR_hibp_api_key: ${{ secrets.HIBP_API_KEY }}
    TF_VAR_resend_api_key: ${{ secrets.RESEND_API_KEY }}
    TF_VAR_sentry_dsn: ${{ secrets.SENTRY_DSN }}
    TF_VAR_datadog_api_key: ${{ secrets.DATADOG_API_KEY }}
  steps:
    - uses: actions/checkout@v4
    - name: Setup Terraform
      uses: hashicorp/setup-terraform@v3
      with:
        terraform_version: "~> 1.5"
    - name: Terraform Init
      working-directory: infra/environments/${{ needs.detect-environment.outputs.environment }}
      run: terraform init -backend-config="bucket=shieldai-${{ needs.detect-environment.outputs.environment }}-terraform-state"
    - name: Terraform Plan
      id: plan
      working-directory: infra/environments/${{ needs.detect-environment.outputs.environment }}
      # Save the plan and apply exactly that plan below, so what was reviewed
      # is what gets applied.
      run: terraform plan -no-color -out=tfplan
    - name: Terraform Apply
      working-directory: infra/environments/${{ needs.detect-environment.outputs.environment }}
      run: terraform apply -auto-approve tfplan
build-and-push:
name: Build and Push Docker Images
runs-on: ubuntu-latest
needs: detect-environment
needs: [detect-environment]
environment: ${{ needs.detect-environment.outputs.environment }}
strategy:
fail-fast: false
matrix:
include:
- name: api
@@ -47,6 +95,8 @@ jobs:
dockerfile: services/voiceprint/Dockerfile
steps:
- uses: actions/checkout@v4
- name: Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Container Registry
uses: docker/login-action@v3
with:
@@ -55,47 +105,127 @@ jobs:
password: ${{ secrets.GITHUB_TOKEN }}
- name: Calculate image tag
id: tag
run: |
if [ "${{ needs.detect-environment.outputs.environment }}" = "production" ]; then
echo "tag=${{ github.event.release.tag_name }}" >> $GITHUB_OUTPUT
else
echo "tag=staging-${{ github.sha }}" >> $GITHUB_OUTPUT
fi
run: echo "tag=${{ needs.detect-environment.outputs.tag }}" >> $GITHUB_OUTPUT
- name: Build and push ${{ matrix.name }}
uses: docker/build-push-action@v5
with:
context: .
file: ${{ matrix.dockerfile }}
push: true
tags: ghcr.io/${{ github.repository_owner }}/shieldai-${{ matrix.name }}:${{ steps.tag.outputs.tag }}
tags: |
ghcr.io/${{ github.repository_owner }}/shieldai-${{ matrix.name }}:${{ steps.tag.outputs.tag }}
ghcr.io/${{ github.repository_owner }}/shieldai-${{ matrix.name }}:latest
cache-from: type=gha
cache-to: type=gha,mode=max
deploy:
name: Deploy to ${{ needs.detect-environment.outputs.environment }}
deploy-ecs:
name: Deploy to ECS
runs-on: ubuntu-latest
needs: [detect-environment, build-and-push]
needs: [detect-environment, terraform-apply, build-and-push]
environment: ${{ needs.detect-environment.outputs.environment }}
strategy:
fail-fast: false
matrix:
service: [api, darkwatch, spamshield, voiceprint]
steps:
- uses: actions/checkout@v4
- name: Calculate deployment tag
id: tag
run: |
if [ "${{ needs.detect-environment.outputs.environment }}" = "production" ]; then
echo "tag=${{ github.event.release.tag_name }}" >> $GITHUB_OUTPUT
else
echo "tag=staging-${{ github.sha }}" >> $GITHUB_OUTPUT
fi
- name: Deploy via Docker Compose
uses: appleboy/ssh-action@v1
- name: Configure AWS
uses: aws-actions/configure-aws-credentials@v4
with:
host: ${{ secrets.DEPLOY_HOST }}
username: ${{ secrets.DEPLOY_USER }}
key: ${{ secrets.DEPLOY_SSH_KEY }}
script: |
cd /opt/shieldai
export DOCKER_TAG="${{ steps.tag.outputs.tag }}"
export ENVIRONMENT="${{ needs.detect-environment.outputs.environment }}"
docker compose pull
docker compose up -d
docker image prune -f
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: us-east-1
- name: Update ECS Service
  run: |
    IMAGE="ghcr.io/${{ github.repository_owner }}/shieldai-${{ matrix.service }}:${{ needs.detect-environment.outputs.tag }}"
    CLUSTER="shieldai-${{ needs.detect-environment.outputs.environment }}"
    SERVICE="${{ matrix.service }}"
    TASK_DEF=$(aws ecs describe-task-definition \
      --task-definition "${CLUSTER}-${SERVICE}" \
      --query 'taskDefinition' --output json)
    # Swap in the new image AND strip the read-only fields that
    # describe-task-definition returns but register-task-definition rejects
    # (taskDefinitionArn, revision, status, ...). Without the del() the
    # register call fails on "Unknown parameter".
    NEW_TASK_DEF=$(echo "$TASK_DEF" | jq \
      --arg image "$IMAGE" \
      '.containerDefinitions[0].image = $image
       | del(.taskDefinitionArn, .revision, .status, .requiresAttributes,
             .compatibilities, .registeredAt, .registeredBy)')
    # --cli-input-json does not read from stdin ("-"); pass the JSON inline.
    # --family is omitted: the family is already present in the JSON.
    NEW_TASK_DEF_ARN=$(aws ecs register-task-definition \
      --cli-input-json "$NEW_TASK_DEF" \
      --query 'taskDefinition.taskDefinitionArn' --output text)
    aws ecs update-service \
      --cluster "$CLUSTER" \
      --service "${CLUSTER}-${SERVICE}" \
      --task-definition "$NEW_TASK_DEF_ARN" \
      --force-new-deployment
    echo "Deployed $IMAGE to $SERVICE"
health-check:
  name: Post-Deploy Health Check
  runs-on: ubuntu-latest
  needs: [detect-environment, deploy-ecs]
  environment: ${{ needs.detect-environment.outputs.environment }}
  steps:
    # The job calls the AWS CLI; the original configured no credentials.
    - name: Configure AWS
      uses: aws-actions/configure-aws-credentials@v4
      with:
        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        aws-region: us-east-1
    - name: Wait for services to stabilize
      # Deterministic wait instead of a fixed `sleep 30`.
      run: |
        CLUSTER="shieldai-${{ needs.detect-environment.outputs.environment }}"
        aws ecs wait services-stable \
          --cluster "$CLUSTER" \
          --services "${CLUSTER}-api" "${CLUSTER}-darkwatch" "${CLUSTER}-spamshield" "${CLUSTER}-voiceprint"
    - name: Health Check
      run: |
        # Resolve the real ALB DNS name; the original queried a targetGroupArn,
        # never used it, and guessed a hostname that AWS does not assign.
        ALB_DNS=$(aws elbv2 describe-load-balancers \
          --names "shieldai-${{ needs.detect-environment.outputs.environment }}-alb" \
          --query 'LoadBalancers[0].DNSName' --output text)
        # NOTE(review): all services are probed through the same /health URL;
        # per-service routing on the ALB is not distinguishable here — confirm
        # listener routing before trusting per-service results.
        for service in api darkwatch spamshield voiceprint; do
          ok=0
          for attempt in $(seq 1 20); do
            HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" "http://${ALB_DNS}/health" || true)
            if [ "$HTTP_CODE" = "200" ]; then ok=1; break; fi
            sleep 15
          done
          if [ "$ok" = "1" ]; then
            echo "Health check passed: $service"
          else
            # Fail the job so the rollback job's `if: failure()` gate fires.
            # The original only echoed the failure and always exited 0, which
            # meant the automatic rollback could never trigger.
            echo "Health check failed: $service (HTTP $HTTP_CODE)"
            exit 1
          fi
        done
rollback:
  name: Rollback on Failure
  runs-on: ubuntu-latest
  needs: [detect-environment, deploy-ecs, health-check]
  environment: ${{ needs.detect-environment.outputs.environment }}
  if: failure() && needs.health-check.result == 'failure'
  strategy:
    fail-fast: false
    matrix:
      service: [api, darkwatch, spamshield, voiceprint]
  steps:
    - name: Configure AWS
      uses: aws-actions/configure-aws-credentials@v4
      with:
        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        aws-region: us-east-1
    - name: Rollback ECS Service
      run: |
        # `aws ecs update-service` has no --rollback flag; roll back by
        # pointing the service at the previous task-definition revision.
        CLUSTER="shieldai-${{ needs.detect-environment.outputs.environment }}"
        SERVICE="${{ matrix.service }}"
        CURRENT=$(aws ecs describe-services \
          --cluster "$CLUSTER" \
          --services "${CLUSTER}-${SERVICE}" \
          --query 'services[0].taskDefinition' --output text)
        # CURRENT is an ARN like .../family:revision — split out both parts.
        FAMILY="${CURRENT##*/}"; FAMILY="${FAMILY%%:*}"
        REVISION="${CURRENT##*:}"
        PREV=$((REVISION - 1))
        if [ "$PREV" -lt 1 ]; then
          echo "No previous revision to roll back to for $SERVICE"
          exit 1
        fi
        aws ecs update-service \
          --cluster "$CLUSTER" \
          --service "${CLUSTER}-${SERVICE}" \
          --task-definition "${FAMILY}:${PREV}" \
          --force-new-deployment
        echo "Rolled back $SERVICE to ${FAMILY}:${PREV}"

9
infra/.gitignore vendored Normal file
View File

@@ -0,0 +1,9 @@
.terraform/
*.tfstate
*.tfstate.backup
*.tfvars
# .terraform.lock.hcl is deliberately NOT ignored: commit it so provider
# versions and checksums are pinned across machines and CI.
override.tf
override.tf.json
*_override.tf
*_override.tf.json

114
infra/README.md Normal file
View File

@@ -0,0 +1,114 @@
/infra/
├── main.tf # Root module: VPC, ECS, RDS, ElastiCache, S3, Secrets, CloudWatch
├── variables.tf # Input variables with validation
├── outputs.tf # Output values (endpoints, ARNs, URLs)
├── modules/
│ ├── vpc/main.tf # VPC, subnets, IGW, NAT GW, security groups
│ ├── ecs/main.tf # ECS cluster, task definitions, services, ALB, auto-scaling
│ ├── rds/main.tf # RDS PostgreSQL with automated backups
│ ├── elasticache/main.tf # ElastiCache Redis with replication
│ ├── s3/main.tf # S3 buckets: state, artifacts, logs
│ ├── secrets/main.tf # AWS Secrets Manager
│ └── cloudwatch/main.tf # Dashboards, alarms, notifications
├── environments/
│ ├── staging/main.tf # Staging environment config
│ └── production/main.tf # Production environment config
└── scripts/
└── rollback.sh # Manual rollback script
## Quick Start
### Prerequisites
- Terraform >= 1.5.0
- AWS CLI configured with appropriate credentials
- AWS account with ECS, RDS, ElastiCache permissions
### Initialize
```bash
cd infra/environments/staging
terraform init
terraform plan -var-file=terraform.tfvars.example
terraform apply -var-file=terraform.tfvars.example
```
### Deploy via CI/CD
- Push to `main` → deploys to staging
- Create a release → deploys to production
- Health check failure → automatic rollback
## Architecture
### Networking
- VPC with public/private subnets across multiple AZs
- NAT Gateway for outbound traffic from private subnets
- Security groups: ECS → RDS (5432), ECS → ElastiCache (6379)
### Compute
- ECS Fargate for serverless container orchestration
- Application Load Balancer with health checks
- Auto-scaling: CPU-based scaling (70% target)
- Production: 3 replicas per service, min 2, max 10
### Data
- RDS PostgreSQL 16.2 with Multi-AZ (production)
- Automated daily backups, 7-14 day retention
- ElastiCache Redis 7.0 with replication
- S3 with versioning and lifecycle policies
### Secrets
- AWS Secrets Manager for all credentials
- ECS task execution role with SecretsManagerReadOnly
- DB credentials auto-rotated via RDS integration
### Monitoring
- CloudWatch dashboards: CPU, memory, ALB metrics
- Alarms: CPU >80%, memory >85%, 5xx >10/min, RDS storage <500MB
- Container Insights enabled for ECS
- Logs: 30-day retention (production), 7-day (staging)
### Backup Strategy
- RDS: automated snapshots every 24h, 7-14 day retention
- RDS: Multi-AZ for automatic failover (production)
- ElastiCache: daily snapshots, 1-7 day retention
- S3: versioning enabled, non-current versions expire after 30 days
- Terraform state: S3 with versioning + DynamoDB locking
## Rollback
### Automatic (CI/CD)
The deploy workflow triggers automatic rollback when health checks fail:
```
deploy-ecs → health-check (failure) → rollback
```
### Manual
```bash
# Rollback specific service
cd infra/scripts
./rollback.sh staging api
# Rollback all services
./rollback.sh staging all
```
### Database Migration Rollback
```bash
# Run previous migration
DATABASE_URL=$(aws secretsmanager get-secret-value \
  --secret-id shieldai-staging-db-password \
  --query 'SecretString' --output text | jq -r '.host')  # use --output text: with --output json the SecretString arrives JSON-quoted and jq cannot index it; also confirm '.host' is the right field — a full connection URL likely needs user/password/port/db assembled from the secret
npx prisma migrate resolve --applied <migration_name>
npx prisma migrate deploy
```
## GitHub Secrets Required
| Secret | Description |
|--------|-------------|
| AWS_ACCESS_KEY_ID | IAM user with ECS, RDS, ElastiCache permissions |
| AWS_SECRET_ACCESS_KEY | IAM secret key |
| HIBP_API_KEY | Have I Been Pwned API key |
| RESEND_API_KEY | Resend email API key |
| SENTRY_DSN | Sentry error tracking DSN |
| DATADOG_API_KEY | Datadog monitoring API key |
| GITHUB_TOKEN | Auto-provided, needs write:packages scope |

View File

@@ -0,0 +1,57 @@
# Production environment root: per-environment remote state with DynamoDB locking.
terraform {
backend "s3" {
bucket = "shieldai-production-terraform-state"
key = "production/terraform.tfstate"
region = "us-east-1"
encrypt = true
dynamodb_table = "shieldai-terraform-locks"
}
}
# NOTE(review): ../.. (infra/main.tf) declares its own backend "s3" block, but
# Terraform only accepts backend configuration in the ROOT module — sourcing it
# as a child module here may fail `terraform init`. TODO confirm.
module "shieldai" {
source = "../.."
environment = "production"
aws_region = "us-east-1"
project_name = "shieldai"
vpc_cidr = "10.1.0.0/16"
az_count = 3
db_instance_class = "db.r6g.large"
db_multi_az = true
db_backup_retention = 14
elasticache_node_type = "cache.r6g.large"
elasticache_num_nodes = 3
# Forwarded into AWS Secrets Manager via the secrets module.
secrets = {
HIBP_API_KEY = var.hibp_api_key
RESEND_API_KEY = var.resend_api_key
SENTRY_DSN = var.sentry_dsn
DATADOG_API_KEY = var.datadog_api_key
}
}
# Sensitive inputs: supplied by CI as TF_VAR_* environment variables.
variable "hibp_api_key" {
description = "Have I Been Pwned API key"
type = string
sensitive = true
}
variable "resend_api_key" {
description = "Resend API key"
type = string
sensitive = true
}
variable "sentry_dsn" {
description = "Sentry DSN"
type = string
sensitive = true
}
variable "datadog_api_key" {
description = "Datadog API key"
type = string
sensitive = true
}

View File

@@ -0,0 +1,4 @@
hibp_api_key = "YOUR_HIBP_API_KEY"
resend_api_key = "YOUR_RESEND_API_KEY"
sentry_dsn = "YOUR_SENTRY_DSN"
datadog_api_key = "YOUR_DATADOG_API_KEY"

View File

@@ -0,0 +1,57 @@
# Staging environment root: smaller instance sizes, single-node cache, no Multi-AZ.
terraform {
backend "s3" {
bucket = "shieldai-staging-terraform-state"
key = "staging/terraform.tfstate"
region = "us-east-1"
encrypt = true
dynamodb_table = "shieldai-terraform-locks"
}
}
# NOTE(review): ../.. (infra/main.tf) declares its own backend "s3" block, but
# Terraform only accepts backend configuration in the ROOT module — sourcing it
# as a child module here may fail `terraform init`. TODO confirm.
module "shieldai" {
source = "../.."
environment = "staging"
aws_region = "us-east-1"
project_name = "shieldai"
vpc_cidr = "10.0.0.0/16"
az_count = 2
db_instance_class = "db.t3.medium"
db_multi_az = false
db_backup_retention = 3
elasticache_node_type = "cache.t3.small"
elasticache_num_nodes = 1
# Forwarded into AWS Secrets Manager via the secrets module.
secrets = {
HIBP_API_KEY = var.hibp_api_key
RESEND_API_KEY = var.resend_api_key
SENTRY_DSN = var.sentry_dsn
DATADOG_API_KEY = var.datadog_api_key
}
}
# Sensitive inputs: supplied by CI as TF_VAR_* environment variables.
variable "hibp_api_key" {
description = "Have I Been Pwned API key"
type = string
sensitive = true
}
variable "resend_api_key" {
description = "Resend API key"
type = string
sensitive = true
}
variable "sentry_dsn" {
description = "Sentry DSN"
type = string
sensitive = true
}
variable "datadog_api_key" {
description = "Datadog API key"
type = string
sensitive = true
}

View File

@@ -0,0 +1,4 @@
hibp_api_key = "YOUR_HIBP_API_KEY"
resend_api_key = "YOUR_RESEND_API_KEY"
sentry_dsn = "YOUR_SENTRY_DSN"
datadog_api_key = "YOUR_DATADOG_API_KEY"

107
infra/main.tf Normal file
View File

@@ -0,0 +1,107 @@
# Root Terraform module: wires VPC, ECS, RDS, ElastiCache, S3, Secrets and
# CloudWatch submodules together. Also consumed as a child module by
# environments/{staging,production}.
terraform {
required_version = ">= 1.5.0"
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 5.30"
}
# NOTE(review): the github provider is declared but no github_* resources are
# visible in this file — confirm it is used elsewhere, otherwise drop it.
github = {
source = "integrations/github"
version = "~> 6.0"
}
}
# NOTE(review): Terraform honors backend configuration only in the ROOT
# module. environments/* source this directory as a child module while
# declaring their own backends — having a backend block here too may break
# or be ignored during their `terraform init`. Verify.
backend "s3" {
bucket = "shieldai-terraform-state"
key = "global/terraform.tfstate"
region = "us-east-1"
encrypt = true
dynamodb_table = "shieldai-terraform-locks"
}
}
provider "aws" {
region = var.aws_region
# Applied to every resource created by this provider.
default_tags {
tags = {
Project = "ShieldAI"
ManagedBy = "terraform"
Environment = var.environment
}
}
}
# Networking: VPC, subnets, NAT, and the security groups consumed below.
module "vpc" {
source = "./modules/vpc"
environment = var.environment
vpc_cidr = var.vpc_cidr
az_count = var.az_count
project_name = var.project_name
}
# Compute: ECS Fargate cluster + services, fronted by an ALB (see module).
module "ecs" {
source = "./modules/ecs"
environment = var.environment
cluster_name = "${var.project_name}-${var.environment}"
vpc_id = module.vpc.vpc_id
subnet_ids = module.vpc.private_subnet_ids
security_group_ids = [module.vpc.ecs_security_group_id]
services = var.services
container_images = var.container_images
secrets_arn = module.secrets.secrets_manager_arn
}
# PostgreSQL, private subnets only; reachable via the RDS security group.
module "rds" {
source = "./modules/rds"
environment = var.environment
vpc_id = module.vpc.vpc_id
subnet_ids = module.vpc.private_subnet_ids
security_group_id = module.vpc.rds_security_group_id
db_name = var.db_name
db_instance_class = var.db_instance_class
multi_az = var.db_multi_az
backup_retention = var.db_backup_retention
project_name = var.project_name
}
# Redis replication group, private subnets only.
module "elasticache" {
source = "./modules/elasticache"
environment = var.environment
vpc_id = module.vpc.vpc_id
subnet_ids = module.vpc.private_subnet_ids
security_group_id = module.vpc.elasticache_security_group_id
node_type = var.elasticache_node_type
num_nodes = var.elasticache_num_nodes
project_name = var.project_name
}
module "s3" {
source = "./modules/s3"
environment = var.environment
project_name = var.project_name
}
# Stores var.secrets in AWS Secrets Manager; its ARN feeds the ECS module.
module "secrets" {
source = "./modules/secrets"
environment = var.environment
project_name = var.project_name
secrets = var.secrets
}
# Dashboards and alarms keyed off the cluster/RDS/cache created above.
module "cloudwatch" {
source = "./modules/cloudwatch"
environment = var.environment
cluster_name = "${var.project_name}-${var.environment}"
project_name = var.project_name
rds_identifier = module.rds.db_instance_identifier
cache_endpoint = module.elasticache.cache_endpoint
}

View File

@@ -0,0 +1,183 @@
variable "environment" {
description = "Deployment environment"
type = string
}
variable "cluster_name" {
description = "ECS cluster name"
type = string
}
variable "project_name" {
description = "Project name"
type = string
}
variable "rds_identifier" {
description = "RDS instance identifier"
type = string
}
variable "cache_endpoint" {
description = "ElastiCache endpoint"
type = string
}
# Single operational dashboard: ECS CPU/memory, RDS CPU, ALB traffic and 5xx.
# NOTE(review): every widget hard-codes region "us-east-1" while the root
# module takes var.aws_region — thread the region through if other regions are
# ever used.
resource "aws_cloudwatch_dashboard" "main" {
dashboard_name = "${var.project_name}-${var.environment}-dashboard"
dashboard_body = jsonencode({
widgets = [
{
type = "metric"
properties = {
title = "ECS CPU Utilization"
metrics = [
["AWS/ECS", "CPUUtilization", "ClusterName", var.cluster_name]
]
view = "timeSeries"
stacked = false
region = "us-east-1"
period = 300
}
},
{
type = "metric"
properties = {
title = "ECS Memory Utilization"
metrics = [
["AWS/ECS", "MemoryUtilization", "ClusterName", var.cluster_name]
]
view = "timeSeries"
stacked = false
region = "us-east-1"
period = 300
}
},
{
type = "metric"
properties = {
title = "RDS CPU Utilization"
metrics = [
["AWS/RDS", "CPUUtilization", "DBInstanceIdentifier", var.rds_identifier]
]
view = "timeSeries"
stacked = false
region = "us-east-1"
period = 300
}
},
# NOTE(review): for AWS/ApplicationELB metrics the "LoadBalancer" dimension
# must be the ALB ARN suffix ("app/<name>/<id>"), not the ALB NAME —
# "<cluster>-alb" will match no data. Pass the ALB's arn_suffix into this
# module instead. Applies to both ALB widgets and the alb_5xx alarm below.
{
type = "metric"
properties = {
title = "ALB Request Count"
metrics = [
["AWS/ApplicationELB", "RequestCount", "LoadBalancer", "${var.cluster_name}-alb"]
]
view = "timeSeries"
stacked = false
region = "us-east-1"
period = 60
}
},
{
type = "metric"
properties = {
title = "ALB 5xx Errors"
metrics = [
["AWS/ApplicationELB", "HTTPCode_Elb_5XX_Count", "LoadBalancer", "${var.cluster_name}-alb"]
]
view = "timeSeries"
stacked = false
region = "us-east-1"
period = 60
}
}
]
})
}
# NOTE(review): none of the alarms below set alarm_actions/ok_actions, so they
# change state silently — the README's "notifications" claim requires wiring an
# SNS topic here.
# Fires after 2 consecutive 5-minute periods of cluster-average CPU > 80%.
resource "aws_cloudwatch_metric_alarm" "ecs_cpu_high" {
alarm_name = "${var.project_name}-${var.environment}-ecs-cpu-high"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = 2
metric_name = "CPUUtilization"
namespace = "AWS/ECS"
period = 300
statistic = "Average"
threshold = 80
alarm_description = "ECS CPU utilization above 80%"
dimensions = {
ClusterName = var.cluster_name
}
}
resource "aws_cloudwatch_metric_alarm" "ecs_memory_high" {
alarm_name = "${var.project_name}-${var.environment}-ecs-memory-high"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = 2
metric_name = "MemoryUtilization"
namespace = "AWS/ECS"
period = 300
statistic = "Average"
threshold = 85
alarm_description = "ECS memory utilization above 85%"
dimensions = {
ClusterName = var.cluster_name
}
}
# NOTE(review): same ALB dimension-format issue as the dashboard widgets above.
resource "aws_cloudwatch_metric_alarm" "alb_5xx" {
alarm_name = "${var.project_name}-${var.environment}-alb-5xx"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = 3
metric_name = "HTTPCode_Elb_5XX_Count"
namespace = "AWS/ApplicationELB"
period = 60
statistic = "Sum"
threshold = 10
alarm_description = "ALB 5xx errors above 10 per minute"
dimensions = {
LoadBalancer = "${var.cluster_name}-alb"
}
}
resource "aws_cloudwatch_metric_alarm" "rds_cpu_high" {
alarm_name = "${var.project_name}-${var.environment}-rds-cpu-high"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = 2
metric_name = "CPUUtilization"
namespace = "AWS/RDS"
period = 300
statistic = "Average"
threshold = 75
alarm_description = "RDS CPU utilization above 75%"
dimensions = {
DBInstanceIdentifier = var.rds_identifier
}
}
resource "aws_cloudwatch_metric_alarm" "rds_free_storage" {
alarm_name = "${var.project_name}-${var.environment}-rds-free-storage"
comparison_operator = "LessThanThreshold"
evaluation_periods = 2
metric_name = "FreeStorageSpace"
namespace = "AWS/RDS"
period = 300
statistic = "Average"
# 524288000 bytes = 500 MiB
threshold = 524288000
alarm_description = "RDS free storage below 500MB"
dimensions = {
DBInstanceIdentifier = var.rds_identifier
}
}
output "dashboard_url" {
description = "CloudWatch dashboard URL"
value = "https://us-east-1.console.aws.amazon.com/cloudwatch/home#dashboards/dashboard/${var.project_name}-${var.environment}-dashboard"
}

355
infra/modules/ecs/main.tf Normal file
View File

@@ -0,0 +1,355 @@
variable "environment" {
description = "Deployment environment"
type = string
}
variable "cluster_name" {
description = "ECS cluster name"
type = string
}
variable "vpc_id" {
description = "VPC ID"
type = string
}
variable "subnet_ids" {
description = "Private subnet IDs"
type = list(string)
}
variable "security_group_ids" {
description = "Security group IDs"
type = list(string)
}
variable "services" {
description = "ECS services to deploy"
type = map(object({
cpu = number
memory = number
port = number
}))
}
variable "container_images" {
description = "Container image tags"
type = map(string)
}
variable "secrets_arn" {
description = "Secrets Manager ARN"
type = string
}
resource "aws_ecs_cluster" "main" {
name = var.cluster_name
settings {
name = "containerInsights"
value = "enabled"
}
tags = {
Name = var.cluster_name
}
}
resource "aws_ecs_cluster_capacity_providers" "main" {
cluster_name = aws_ecs_cluster.main.name
capacity_providers = ["FARGATE"]
default_capacity_provider_strategy {
base = 1
weight = 100
capacity_provider = "FARGATE"
}
}
# One Fargate task definition per service, keyed by var.services.
resource "aws_ecs_task_definition" "services" {
  for_each = var.services

  family = "${var.cluster_name}-${each.key}"

  container_definitions = jsonencode([
    {
      name = each.key
      # NOTE(review): registry org is hard-coded to "shieldai" while CI pushes
      # to ghcr.io/<repository_owner> — confirm the two match.
      image = "ghcr.io/shieldai/shieldai-${each.key}:${var.container_images[each.key]}"
      # `for_each` over a map exposes entries as each.value.*; the original
      # `each.cpu` / `each.memory` / `each.port` are invalid references and
      # fail at plan time.
      cpu       = each.value.cpu
      memory    = each.value.memory
      essential = true

      portMappings = [
        {
          containerPort = each.value.port
          hostPort      = each.value.port
          protocol      = "tcp"
        }
      ]

      environment = [
        {
          name  = "NODE_ENV"
          value = var.environment
        },
        {
          name  = "PORT"
          value = tostring(each.value.port)
        }
      ]

      # Resolved from Secrets Manager at task start by the EXECUTION role.
      secrets = [
        {
          name      = "DATABASE_URL"
          valueFrom = "${var.secrets_arn}:DATABASE_URL::"
        },
        {
          name      = "REDIS_URL"
          valueFrom = "${var.secrets_arn}:REDIS_URL::"
        },
        {
          name      = "HIBP_API_KEY"
          valueFrom = "${var.secrets_arn}:HIBP_API_KEY::"
        },
        {
          name      = "RESEND_API_KEY"
          valueFrom = "${var.secrets_arn}:RESEND_API_KEY::"
        }
      ]

      logConfiguration = {
        logDriver = "awslogs"
        options = {
          "awslogs-group" = "/ecs/${var.cluster_name}-${each.key}"
          # NOTE(review): region hard-coded; the root module takes
          # var.aws_region — thread it through if other regions are used.
          "awslogs-region"        = "us-east-1"
          "awslogs-stream-prefix" = each.key
        }
      }

      # Assumes wget is present in the service images — TODO confirm.
      healthCheck = {
        command     = ["CMD-SHELL", "wget -q --spider http://localhost:${each.value.port}/health || exit 1"]
        interval    = 30
        timeout     = 5
        retries     = 3
        startPeriod = 60
      }
    }
  ])

  network_mode             = "awsvpc"
  memory                   = each.value.memory
  cpu                      = each.value.cpu
  requires_compatibilities = ["FARGATE"]
  execution_role_arn       = aws_iam_role.execution[each.key].arn
  task_role_arn            = aws_iam_role.task[each.key].arn

  tags = {
    Name = "${var.cluster_name}-${each.key}"
  }
}
# Task EXECUTION role: used by the ECS agent to pull images, write logs, and
# resolve container `secrets` entries at task start.
resource "aws_iam_role" "execution" {
  for_each = var.services

  name = "${var.cluster_name}-${each.key}-execution"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          Service = "ecs-tasks.amazonaws.com"
        }
      }
    ]
  })

  managed_policy_arns = [
    "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
  ]

  # Container `secrets` are fetched by the EXECUTION role (not the task role)
  # at launch; without this grant every task fails to start with an
  # AccessDenied from Secrets Manager.
  inline_policy {
    name = "read-app-secrets"
    policy = jsonencode({
      Version = "2012-10-17"
      Statement = [
        {
          Effect   = "Allow"
          Action   = ["secretsmanager:GetSecretValue"]
          Resource = var.secrets_arn
        }
      ]
    })
  }
}
# Task role: permissions the application code itself uses at runtime.
resource "aws_iam_role" "task" {
  for_each = var.services

  name = "${var.cluster_name}-${each.key}-task"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          Service = "ecs-tasks.amazonaws.com"
        }
      }
    ]
  })

  # The original attached "arn:aws:iam::aws:policy/SecretsManagerReadOnly",
  # which is not an AWS managed policy (only SecretsManagerReadWrite exists)
  # and fails at apply time. Grant the minimal read access inline, scoped to
  # this project's secret, instead.
  inline_policy {
    name = "app-access"
    policy = jsonencode({
      Version = "2012-10-17"
      Statement = [
        {
          Effect = "Allow"
          Action = [
            "secretsmanager:GetSecretValue",
            "secretsmanager:DescribeSecret"
          ]
          Resource = var.secrets_arn
        },
        {
          Effect = "Allow"
          Action = [
            "elasticache:DescribeCacheClusters",
            "elasticache:DescribeCacheSubnetGroups"
          ]
          Resource = "*"
        }
      ]
    })
  }
}
# One ECS service per entry in var.services, attached to its target group.
resource "aws_ecs_service" "services" {
  for_each = var.services

  name            = "${var.cluster_name}-${each.key}"
  cluster         = aws_ecs_cluster.main.id
  task_definition = aws_ecs_task_definition.services[each.key].arn
  desired_count   = var.environment == "production" ? 3 : 1
  # The original set `launch_desired_count = "FARGATE"`, which is not an
  # aws_ecs_service argument; the intended setting is launch_type.
  launch_type = "FARGATE"

  network_configuration {
    subnets          = var.subnet_ids
    security_groups  = var.security_group_ids
    assign_public_ip = false
  }

  load_balancer {
    target_group_arn = aws_lb_target_group.services[each.key].arn
    container_name   = each.key
    # for_each over a map exposes values as each.value.*; `each.port` is invalid.
    container_port = each.value.port
  }

  # The original's inline `auto_scaling { ... }` block was removed:
  # aws_ecs_service has no such block. Scaling is configured via the
  # aws_appautoscaling_target / aws_appautoscaling_policy resources in this
  # module, which carry the same min/max capacities.

  tags = {
    Name    = "${var.cluster_name}-${each.key}"
    Service = each.key
  }

  depends_on = [
    aws_lb_listener.services
  ]
}
# Application Load Balancer fronting all ECS services.
# NOTE(review): internal = false makes this internet-facing, but var.subnet_ids
# is wired to the PRIVATE subnets in infra/main.tf — a public ALB must live in
# public subnets. It also reuses the ECS task security groups; verify they
# allow inbound 80 from the internet. TODO confirm against the vpc module.
resource "aws_lb" "main" {
name = "${var.cluster_name}-alb"
internal = false
load_balancer_type = "application"
security_groups = var.security_group_ids
subnets = var.subnet_ids
tags = {
Name = "${var.cluster_name}-alb"
}
}
# One target group per service, health-checked on /health.
resource "aws_lb_target_group" "services" {
  for_each = var.services

  # NOTE(review): target group names are capped at 32 characters;
  # "shieldai-production-spamshield-tg" is 33 — shorten the pattern.
  name     = "${var.cluster_name}-${each.key}-tg"
  port     = each.value.port
  protocol = "HTTP"
  vpc_id   = var.vpc_id
  # Fargate tasks with awsvpc networking register by IP address; the default
  # target_type ("instance") cannot receive them and service registration fails.
  target_type = "ip"

  health_check {
    enabled             = true
    healthy_threshold   = 3
    interval            = 30
    matcher             = "200"
    path                = "/health"
    port                = "traffic-port"
    protocol            = "HTTP"
    timeout             = 5
    unhealthy_threshold = 3
  }

  stickiness {
    type            = "lb_cookie"
    cookie_duration = 86400
  }
}
# One listener per service. The original created four listeners all on port
# 80, which collides — a load balancer allows only one listener per port.
# Each service is exposed on its own container port instead; if a single :80
# entry point is wanted, replace this with one listener plus path- or
# host-based aws_lb_listener_rule resources.
resource "aws_lb_listener" "services" {
  for_each = var.services

  load_balancer_arn = aws_lb.main.arn
  port              = each.value.port
  protocol          = "HTTP"

  default_action {
    type             = "forward"
    target_group_arn = aws_lb_target_group.services[each.key].arn
  }
}
resource "aws_appautoscaling_target" "services" {
for_each = var.services
service_namespace = "ecs"
resource_id = "service/${aws_ecs_cluster.main.name}/${aws_ecs_service.services[each.key].name}"
scalable_dimension = "ecs:service:DesiredCount"
min_capacity = var.environment == "production" ? 2 : 1
max_capacity = var.environment == "production" ? 10 : 3
}
# Target-tracking scaling on per-service average CPU (target 70%).
resource "aws_appautoscaling_policy" "cpu" {
  for_each = var.services

  name = "${var.cluster_name}-${each.key}-cpu-scaling"
  # policy_type defaults to "StepScaling"; a target-tracking configuration
  # requires it to be set explicitly or the apply fails.
  policy_type = "TargetTrackingScaling"
  # Reference the scaling target so the dependency ordering is explicit.
  service_namespace  = aws_appautoscaling_target.services[each.key].service_namespace
  resource_id        = aws_appautoscaling_target.services[each.key].resource_id
  scalable_dimension = aws_appautoscaling_target.services[each.key].scalable_dimension

  target_tracking_scaling_policy_configuration {
    target_value       = 70.0
    scale_in_cooldown  = 60
    scale_out_cooldown = 30

    # Use the ECS predefined per-service metric. The original's customized
    # specification was malformed (dimensions must be blocks, not a list) and
    # tracked cluster-wide CPU, which would scale every service on the whole
    # cluster's load.
    predefined_metric_specification {
      predefined_metric_type = "ECSServiceAverageCPUUtilization"
    }
  }
}
resource "aws_cloudwatch_log_group" "services" {
for_each = var.services
name = "/ecs/${var.cluster_name}-${each.key}"
retention_in_days = var.environment == "production" ? 30 : 7
tags = {
Name = "${var.cluster_name}-${each.key}-logs"
}
}
output "cluster_arn" {
description = "ECS cluster ARN"
value = aws_ecs_cluster.main.arn
}
output "alb_dns_name" {
description = "ALB DNS name"
value = aws_lb.main.dns_name
}

View File

@@ -0,0 +1,80 @@
variable "environment" {
description = "Deployment environment"
type = string
}
variable "vpc_id" {
description = "VPC ID"
type = string
}
variable "subnet_ids" {
description = "Private subnet IDs"
type = list(string)
}
variable "security_group_id" {
description = "ElastiCache security group ID"
type = string
}
variable "node_type" {
description = "Cache node type"
type = string
}
variable "num_nodes" {
description = "Number of cache nodes"
type = number
}
variable "project_name" {
description = "Project name"
type = string
}
resource "aws_elasticache_subnet_group" "main" {
name = "${var.project_name}-${var.environment}-redis-subnet"
subnet_ids = var.subnet_ids
tags = {
Name = "${var.project_name}-${var.environment}-redis-subnet"
}
}
# Redis replication group, encrypted at rest and in transit.
# NOTE(review): transit_encryption_enabled = true means clients MUST connect
# over TLS (rediss:// URLs) — confirm the services' REDIS_URL secret uses the
# TLS scheme, or connections will fail.
resource "aws_elasticache_replication_group" "main" {
replication_group_id = "${var.project_name}-${var.environment}-redis"
description = "${var.project_name} Redis cluster (${var.environment})"
node_type = var.node_type
# Staging passes num_nodes = 1 (no replica), which is why failover below is
# conditioned on the environment — failover requires at least one replica.
num_cache_clusters = var.num_nodes
engine = "redis"
engine_version = "7.0"
transit_encryption_enabled = true
at_rest_encryption_enabled = true
port = 6379
subnet_group_name = aws_elasticache_subnet_group.main.name
security_group_ids = [var.security_group_id]
automatic_failover_enabled = var.environment == "production"
snapshot_retention_limit = var.environment == "production" ? 7 : 1
snapshot_window = "03:00-04:00"
tags = {
Name = "${var.project_name}-${var.environment}-redis"
}
}
output "cache_endpoint" {
description = "ElastiCache primary endpoint"
value = aws_elasticache_replication_group.main.primary_endpoint_address
}
output "reader_endpoint" {
description = "ElastiCache reader endpoint"
value = aws_elasticache_replication_group.main.reader_endpoint_address
}

132
infra/modules/rds/main.tf Normal file
View File

@@ -0,0 +1,132 @@
# --- RDS module inputs ----------------------------------------------------
variable "environment" {
  description = "Deployment environment"
  type = string
}
# NOTE(review): vpc_id is currently unused by this module's resources —
# confirm whether it can be dropped from the module interface.
variable "vpc_id" {
  description = "VPC ID"
  type = string
}
variable "subnet_ids" {
  description = "Private subnet IDs"
  type = list(string)
}
variable "security_group_id" {
  description = "RDS security group ID"
  type = string
}
variable "db_name" {
  description = "Database name"
  type = string
}
variable "db_instance_class" {
  description = "RDS instance class"
  type = string
}
variable "multi_az" {
  description = "Multi-AZ deployment"
  type = bool
}
variable "backup_retention" {
  description = "Backup retention days"
  type = number
}
variable "project_name" {
  description = "Project name"
  type = string
}
# Restricts the DB instance to the private subnets.
resource "aws_db_subnet_group" "main" {
  name = "${var.project_name}-${var.environment}-db-subnet"
  subnet_ids = var.subnet_ids
  tags = {
    Name = "${var.project_name}-${var.environment}-db-subnet"
  }
}
# Primary PostgreSQL instance.
# Storage note: gp3 already provides a 3,000 IOPS / 125 MiB/s baseline, and
# AWS only allows an explicit `iops` setting on gp3 volumes of 400 GiB or
# more. The previous `iops = 3000/1000` on 100 GiB / 20 GiB volumes would
# fail the apply (1000 is also below the gp3 minimum), so we rely on the
# baseline instead.
resource "aws_db_instance" "main" {
  identifier = "${var.project_name}-${var.environment}-db"
  engine = "postgres"
  engine_version = "16.2"
  instance_class = var.db_instance_class
  # Production gets a larger volume; staging/dev stay at the minimum.
  allocated_storage = var.environment == "production" ? 100 : 20
  db_name = var.db_name
  username = "shieldai"
  password = random_password.db_password.result
  multi_az = var.multi_az
  db_subnet_group_name = aws_db_subnet_group.main.name
  vpc_security_group_ids = [var.security_group_id]
  backup_retention_period = var.backup_retention
  # Backup window is deliberately before the maintenance window.
  backup_window = "03:00-04:00"
  maintenance_window = "sun:04:00-sun:05:00"
  # Only production keeps a final snapshot (and deletion protection).
  skip_final_snapshot = var.environment != "production"
  final_snapshot_identifier = "${var.project_name}-${var.environment}-final"
  storage_encrypted = true
  storage_type = "gp3"
  deletion_protection = var.environment == "production"
  copy_tags_to_snapshot = true
  tags = {
    Name = "${var.project_name}-${var.environment}-db"
  }
}
# Master password for the DB instance.
# RDS rejects master passwords containing '/', '@', '"' or spaces, and
# several of those also break postgresql:// connection URLs, so restrict
# the special-character pool to a safe set instead of the provider default.
resource "random_password" "db_password" {
  length = 16
  special = true
  override_special = "!#$%&*()-_=+[]{}<>:?"
  # Regenerate the password only when the environment changes.
  keepers = {
    environment = var.environment
  }
}
# Stores the live credentials + endpoint so applications never need the
# password in plain Terraform outputs. Consumers read this secret at
# startup.
resource "aws_secretsmanager_secret_version" "db_password" {
  secret_id = aws_secretsmanager_secret.db_password.id
  secret_string = jsonencode({
    username = "shieldai"
    password = random_password.db_password.result
    engine = "postgres"
    host = aws_db_instance.main.address
    port = aws_db_instance.main.port
  })
}
resource "aws_secretsmanager_secret" "db_password" {
  name = "${var.project_name}-${var.environment}-db-password"
  tags = {
    Name = "${var.project_name}-${var.environment}-db-password"
  }
}
# Marked sensitive so the endpoint is not echoed in plan/apply output.
output "db_endpoint" {
  description = "RDS endpoint"
  value = aws_db_instance.main.endpoint
  sensitive = true
}
output "db_instance_identifier" {
  description = "RDS instance identifier"
  value = aws_db_instance.main.identifier
}
output "db_password_secret_arn" {
  description = "DB password secret ARN"
  value = aws_secretsmanager_secret.db_password.arn
}

108
infra/modules/s3/main.tf Normal file
View File

@@ -0,0 +1,108 @@
# --- S3 module inputs -----------------------------------------------------
variable "environment" {
  description = "Deployment environment"
  type = string
}
variable "project_name" {
  description = "Project name"
  type = string
}
# Remote-state bucket.
# NOTE(review): this bucket is managed by the same Terraform configuration
# whose state it stores — it must be bootstrapped once (manually or from a
# separate bootstrap stack) before the backend can point at it.
resource "aws_s3_bucket" "terraform_state" {
  bucket = "${var.project_name}-${var.environment}-terraform-state"
  tags = {
    Name = "${var.project_name}-${var.environment}-terraform-state"
  }
}
# State files can contain secrets — block every form of public access.
resource "aws_s3_bucket_public_access_block" "terraform_state" {
  bucket = aws_s3_bucket.terraform_state.id
  block_public_acls = true
  block_public_policy = true
  ignore_public_acls = true
  restrict_public_buckets = true
}
# Versioning lets us recover from a corrupted or clobbered state file.
resource "aws_s3_bucket_versioning" "terraform_state" {
  bucket = aws_s3_bucket.terraform_state.id
  versioning_configuration {
    status = "Enabled"
  }
}
resource "aws_s3_bucket_server_side_encryption_configuration" "terraform_state" {
  bucket = aws_s3_bucket.terraform_state.id
  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm = "aws:kms"
    }
  }
}
# Cap storage growth: old state versions expire after 30 days.
resource "aws_s3_bucket_lifecycle_configuration" "terraform_state" {
  bucket = aws_s3_bucket.terraform_state.id
  rule {
    id = "expire-noncurrent"
    status = "Enabled"
    # Empty filter = rule applies to all objects (AWS provider >= 4
    # requires an explicit filter or prefix on lifecycle rules).
    filter {}
    noncurrent_version_expiration {
      noncurrent_days = 30
    }
  }
}
# Build-artifact bucket (deploy bundles, container build context, etc.).
resource "aws_s3_bucket" "artifacts" {
  bucket = "${var.project_name}-${var.environment}-artifacts"
  tags = {
    Name = "${var.project_name}-${var.environment}-artifacts"
  }
}
# Artifacts are internal — never publicly readable.
resource "aws_s3_bucket_public_access_block" "artifacts" {
  bucket = aws_s3_bucket.artifacts.id
  block_public_acls = true
  block_public_policy = true
  ignore_public_acls = true
  restrict_public_buckets = true
}
resource "aws_s3_bucket_versioning" "artifacts" {
  bucket = aws_s3_bucket.artifacts.id
  versioning_configuration {
    status = "Enabled"
  }
}
resource "aws_s3_bucket_server_side_encryption_configuration" "artifacts" {
  bucket = aws_s3_bucket.artifacts.id
  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm = "aws:kms"
    }
  }
}
# Log-delivery bucket (ALB/access logs). Logs rotate out after 90 days.
resource "aws_s3_bucket" "logs" {
  bucket = "${var.project_name}-${var.environment}-logs"
  tags = {
    Name = "${var.project_name}-${var.environment}-logs"
  }
}
# Logs may contain IPs/user agents — keep the bucket private.
resource "aws_s3_bucket_public_access_block" "logs" {
  bucket = aws_s3_bucket.logs.id
  block_public_acls = true
  block_public_policy = true
  ignore_public_acls = true
  restrict_public_buckets = true
}
# Encrypt at rest like the other buckets in this module. SSE-S3 (AES256)
# rather than KMS: ALB log delivery does not support SSE-KMS targets.
resource "aws_s3_bucket_server_side_encryption_configuration" "logs" {
  bucket = aws_s3_bucket.logs.id
  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm = "AES256"
    }
  }
}
resource "aws_s3_bucket_lifecycle_configuration" "logs" {
  bucket = aws_s3_bucket.logs.id
  rule {
    id = "expire-old-logs"
    status = "Enabled"
    # Empty filter = rule applies to all objects (AWS provider >= 4
    # requires an explicit filter or prefix on lifecycle rules).
    filter {}
    expiration {
      days = 90
    }
  }
}
# Bucket names exported for backend configuration and CI wiring.
output "bucket_name" {
  description = "Terraform state S3 bucket name"
  value = aws_s3_bucket.terraform_state.id
}
output "artifacts_bucket_name" {
  description = "Artifacts S3 bucket name"
  value = aws_s3_bucket.artifacts.id
}
output "logs_bucket_name" {
  description = "Logs S3 bucket name"
  value = aws_s3_bucket.logs.id
}

View File

@@ -0,0 +1,49 @@
# --- Secrets module inputs ------------------------------------------------
variable "environment" {
  description = "Deployment environment"
  type = string
}
variable "project_name" {
  description = "Project name"
  type = string
}
# Key/value pairs merged into the app secret. Values supplied here
# override any defaults the module seeds.
variable "secrets" {
  description = "Secrets to store"
  type = map(string)
  default = {}
}
# Single app-level secret holding the runtime environment for all services.
resource "aws_secretsmanager_secret" "main" {
  name = "${var.project_name}-${var.environment}-app-secrets"
  description = "Application secrets for ${var.project_name} (${var.environment})"
  tags = {
    Name = "${var.project_name}-${var.environment}-app-secrets"
    Environment = var.environment
  }
}
# Seed the secret with safe environment defaults, merged with
# caller-supplied values (var.secrets wins on conflict).
#
# DATABASE_URL / REDIS_URL must be passed in via var.secrets: the previous
# fabricated defaults hard-coded the project name as the DB password and
# guessed account-id-based hostnames that do not match how RDS/ElastiCache
# actually name endpoints (they also hard-coded us-east-1 and used a plain
# redis:// URL against a TLS-required cluster). The RDS module already
# publishes the real credentials and host in its own secret.
resource "aws_secretsmanager_secret_version" "main" {
  secret_id = aws_secretsmanager_secret.main.id
  secret_string = jsonencode(merge({
    NODE_ENV = var.environment
    LOG_LEVEL = var.environment == "production" ? "info" : "debug"
  }, var.secrets))
}
# ARN is what ECS task definitions reference for secret injection.
output "secrets_manager_arn" {
  description = "Secrets Manager ARN"
  value = aws_secretsmanager_secret.main.arn
}
output "secrets_manager_name" {
  description = "Secrets Manager secret name"
  value = aws_secretsmanager_secret.main.name
}

235
infra/modules/vpc/main.tf Normal file
View File

@@ -0,0 +1,235 @@
# --- VPC module inputs ----------------------------------------------------
variable "environment" {
  description = "Deployment environment"
  type = string
}
variable "vpc_cidr" {
  description = "CIDR block for VPC"
  type = string
}
# Number of AZs to spread subnets/NAT gateways across (cost scales with
# this: one NAT gateway + EIP per AZ).
variable "az_count" {
  description = "Number of availability zones"
  type = number
}
variable "project_name" {
  description = "Project name"
  type = string
}
# DNS support + hostnames are required for RDS/ElastiCache endpoint
# resolution and for VPC interface endpoints, should they be added later.
resource "aws_vpc" "main" {
  cidr_block = var.vpc_cidr
  enable_dns_support = true
  enable_dns_hostnames = true
  tags = {
    Name = "${var.project_name}-${var.environment}-vpc"
  }
}
# AZ names for the current region, used to spread subnets below.
data "aws_availability_zones" "available" {
  state = "available"
}
# Public subnets: /24s carved from the VPC CIDR, one per AZ, hosting the
# ALB and NAT gateways.
# NOTE(review): the kubernetes.io/* subnet tags are EKS load-balancer
# discovery hints; this stack deploys ECS, not EKS — confirm whether these
# tags are intentional future-proofing or leftover copy-paste.
resource "aws_subnet" "public" {
  count = var.az_count
  vpc_id = aws_vpc.main.id
  cidr_block = cidrsubnet(var.vpc_cidr, 8, count.index)
  availability_zone = data.aws_availability_zones.available.names[count.index]
  map_public_ip_on_launch = true
  tags = {
    Name = "${var.project_name}-${var.environment}-public-${data.aws_availability_zones.available.names[count.index]}"
    "kubernetes.io/role/elb" = "1"
  }
}
# Private subnets: next block of /24s (offset by az_count so they never
# overlap the public ones); ECS tasks, RDS and ElastiCache live here.
resource "aws_subnet" "private" {
  count = var.az_count
  vpc_id = aws_vpc.main.id
  cidr_block = cidrsubnet(var.vpc_cidr, 8, var.az_count + count.index)
  availability_zone = data.aws_availability_zones.available.names[count.index]
  tags = {
    Name = "${var.project_name}-${var.environment}-private-${data.aws_availability_zones.available.names[count.index]}"
    "kubernetes.io/role/internal-elb" = "1"
  }
}
resource "aws_internet_gateway" "main" {
  vpc_id = aws_vpc.main.id
  tags = {
    Name = "${var.project_name}-${var.environment}-igw"
  }
}
# One EIP + NAT gateway per AZ so an AZ outage does not take out egress
# for the surviving AZs. (Cost note: NAT gateways bill hourly per AZ.)
resource "aws_eip" "nat" {
  count = var.az_count
  domain = "vpc"
  tags = {
    Name = "${var.project_name}-${var.environment}-nat-${count.index}"
  }
}
resource "aws_nat_gateway" "main" {
  count = var.az_count
  allocation_id = aws_eip.nat[count.index].id
  subnet_id = aws_subnet.public[count.index].id
  tags = {
    Name = "${var.project_name}-${var.environment}-nat-${count.index}"
  }
  # NAT gateways need the IGW to exist before they can route out.
  depends_on = [aws_internet_gateway.main]
}
# Public subnets route 0.0.0.0/0 through the IGW (single shared table).
resource "aws_route_table" "public" {
  vpc_id = aws_vpc.main.id
  route {
    cidr_block = "0.0.0.0/0"
    gateway_id = aws_internet_gateway.main.id
  }
  tags = {
    Name = "${var.project_name}-${var.environment}-public-rt"
  }
}
# Private subnets each get their own table pointing at the NAT gateway in
# the same AZ — keeps egress AZ-local and AZ-fault-isolated.
resource "aws_route_table" "private" {
  count = var.az_count
  vpc_id = aws_vpc.main.id
  route {
    cidr_block = "0.0.0.0/0"
    nat_gateway_id = aws_nat_gateway.main[count.index].id
  }
  tags = {
    Name = "${var.project_name}-${var.environment}-private-rt-${count.index}"
  }
}
resource "aws_route_table_association" "public" {
  count = var.az_count
  subnet_id = aws_subnet.public[count.index].id
  route_table_id = aws_route_table.public.id
}
resource "aws_route_table_association" "private" {
  count = var.az_count
  subnet_id = aws_subnet.private[count.index].id
  route_table_id = aws_route_table.private[count.index].id
}
# Security group for ECS tasks. Tasks run in private subnets behind the
# ALB, so legitimate ingress only ever originates inside the VPC — scope
# the service-port rule to the VPC CIDR instead of 0.0.0.0/0 (the previous
# world-open rule added nothing but risk; tasks were only shielded by the
# absence of public IPs).
resource "aws_security_group" "ecs" {
  name_prefix = "${var.project_name}-${var.environment}-ecs"
  vpc_id = aws_vpc.main.id
  ingress {
    from_port = 3000
    to_port = 3003
    protocol = "tcp"
    cidr_blocks = [var.vpc_cidr]
    description = "Service ports (intra-VPC only; ALB -> tasks)"
  }
  # Unrestricted egress: tasks pull images, reach AWS APIs and RDS/Redis.
  egress {
    from_port = 0
    to_port = 0
    protocol = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }
  tags = {
    Name = "${var.project_name}-${var.environment}-ecs-sg"
  }
}
# RDS security group: PostgreSQL reachable only from the ECS tasks' SG.
# NOTE(review): the wide-open egress is almost certainly unnecessary for a
# managed RDS instance — candidate for removal after verification.
resource "aws_security_group" "rds" {
  name_prefix = "${var.project_name}-${var.environment}-rds"
  vpc_id = aws_vpc.main.id
  ingress {
    from_port = 5432
    to_port = 5432
    protocol = "tcp"
    security_groups = [aws_security_group.ecs.id]
    description = "PostgreSQL from ECS"
  }
  egress {
    from_port = 0
    to_port = 0
    protocol = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }
  tags = {
    Name = "${var.project_name}-${var.environment}-rds-sg"
  }
}
# ElastiCache security group: Redis reachable only from the ECS tasks' SG.
resource "aws_security_group" "elasticache" {
  name_prefix = "${var.project_name}-${var.environment}-elasticache"
  vpc_id = aws_vpc.main.id
  ingress {
    from_port = 6379
    to_port = 6379
    protocol = "tcp"
    security_groups = [aws_security_group.ecs.id]
    description = "Redis from ECS"
  }
  egress {
    from_port = 0
    to_port = 0
    protocol = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }
  tags = {
    Name = "${var.project_name}-${var.environment}-elasticache-sg"
  }
}
# --- VPC module outputs ---------------------------------------------------
output "vpc_id" {
  description = "VPC ID"
  value = aws_vpc.main.id
}
output "private_subnet_ids" {
  description = "Private subnet IDs"
  value = aws_subnet.private[*].id
}
output "public_subnet_ids" {
  description = "Public subnet IDs"
  value = aws_subnet.public[*].id
}
output "ecs_security_group_id" {
  description = "ECS security group ID"
  value = aws_security_group.ecs.id
}
output "rds_security_group_id" {
  description = "RDS security group ID"
  value = aws_security_group.rds.id
}
output "elasticache_security_group_id" {
  description = "ElastiCache security group ID"
  value = aws_security_group.elasticache.id
}

35
infra/outputs.tf Normal file
View File

@@ -0,0 +1,35 @@
# --- Root module outputs (consumed by CI/CD and operators) ---------------
output "vpc_id" {
  description = "VPC ID"
  value = module.vpc.vpc_id
}
# NOTE(review): reconstructs the name from variables rather than reading
# it from the ECS module — keep in sync with the module's naming scheme.
output "cluster_name" {
  description = "ECS cluster name"
  value = "${var.project_name}-${var.environment}"
}
output "rds_endpoint" {
  description = "RDS endpoint"
  value = module.rds.db_endpoint
  sensitive = true
}
output "elasticache_endpoint" {
  description = "ElastiCache primary endpoint"
  value = module.elasticache.cache_endpoint
}
output "s3_bucket_name" {
  description = "S3 bucket name"
  value = module.s3.bucket_name
}
output "secrets_manager_arn" {
  description = "Secrets Manager ARN"
  value = module.secrets.secrets_manager_arn
}
output "cloudwatch_dashboard_url" {
  description = "CloudWatch dashboard URL"
  value = module.cloudwatch.dashboard_url
}

32
infra/scripts/rollback.sh Executable file
View File

@@ -0,0 +1,32 @@
#!/bin/bash
# Roll ECS services back to their previous task-definition revision.
#
# Usage: rollback.sh [environment] [service|all]
#   environment: staging (default) | production
#   service:     api | darkwatch | spamshield | voiceprint | all (default)
#
# Note: `aws ecs update-service` has no --rollback flag; an ECS rollback
# is simply an update to the previous task-definition revision, so we
# look up the service's current revision and step it back by one.
set -euo pipefail

ENVIRONMENT=${1:-staging}
SERVICE=${2:-all}
CLUSTER="shieldai-${ENVIRONMENT}"

echo "Rolling back services in cluster: $CLUSTER"

SERVICES="api darkwatch spamshield voiceprint"
if [ "$SERVICE" != "all" ]; then
  SERVICES="$SERVICE"
fi

for svc in $SERVICES; do
  echo "Rolling back $svc..."
  # Current task definition ARN, e.g. arn:...:task-definition/family:12
  current_td=$(aws ecs describe-services \
    --cluster "$CLUSTER" \
    --services "${CLUSTER}-${svc}" \
    --query 'services[0].taskDefinition' \
    --output text)
  family="${current_td%:*}"      # strip the trailing :revision
  revision="${current_td##*:}"
  if [ "$revision" -le 1 ]; then
    echo "No previous revision for $svc (currently at revision $revision); skipping"
    continue
  fi
  previous_td="${family}:$((revision - 1))"
  echo "  $current_td -> $previous_td"
  aws ecs update-service \
    --cluster "$CLUSTER" \
    --service "${CLUSTER}-${svc}" \
    --task-definition "$previous_td" \
    --no-cli-auto-prompt
  echo "Waiting for $svc to stabilize..."
  aws ecs wait services-stable \
    --cluster "$CLUSTER" \
    --services "${CLUSTER}-${svc}"
  echo "$svc rolled back successfully"
done

echo "Rollback complete for $SERVICES"

116
infra/variables.tf Normal file
View File

@@ -0,0 +1,116 @@
# --- Root module inputs ---------------------------------------------------
variable "aws_region" {
  description = "AWS region"
  type = string
  default = "us-east-1"
}
# Environments are validated here so a typo fails at plan time rather
# than producing misnamed resources.
variable "environment" {
  description = "Deployment environment"
  type = string
  validation {
    condition = contains(["dev", "staging", "production"], var.environment)
    error_message = "Environment must be one of: dev, staging, production."
  }
}
variable "project_name" {
  description = "Project name for resource naming"
  type = string
  default = "shieldai"
}
variable "vpc_cidr" {
  description = "CIDR block for VPC"
  type = string
  default = "10.0.0.0/16"
}
variable "az_count" {
  description = "Number of availability zones"
  type = number
  default = 2
}
variable "db_name" {
  description = "RDS database name"
  type = string
  default = "shieldai"
}
variable "db_instance_class" {
  description = "RDS instance class"
  type = string
  default = "db.t3.medium"
}
variable "db_multi_az" {
  description = "Enable Multi-AZ deployment"
  type = bool
  default = true
}
variable "db_backup_retention" {
  description = "RDS backup retention period in days"
  type = number
  default = 7
}
variable "elasticache_node_type" {
  description = "ElastiCache node type"
  type = string
  default = "cache.t3.medium"
}
variable "elasticache_num_nodes" {
  description = "Number of ElastiCache nodes"
  type = number
  default = 2
}
# Per-service sizing: CPU units (1024 = 1 vCPU), memory in MiB, and the
# container port the ALB/service SG targets (3000-3003).
variable "services" {
  description = "ECS services to deploy"
  type = map(object({
    cpu = number
    memory = number
    port = number
  }))
  default = {
    api = {
      cpu = 512
      memory = 1024
      port = 3000
    }
    darkwatch = {
      cpu = 256
      memory = 512
      port = 3001
    }
    spamshield = {
      cpu = 256
      memory = 512
      port = 3002
    }
    voiceprint = {
      cpu = 512
      memory = 1024
      port = 3003
    }
  }
}
# NOTE(review): "latest" defaults are for bootstrap only — CI should pin
# immutable tags (commit SHAs) on every deploy so rollbacks are meaningful.
variable "container_images" {
  description = "Container image tags per service"
  type = map(string)
  default = {
    api = "latest"
    darkwatch = "latest"
    spamshield = "latest"
    voiceprint = "latest"
  }
}
# Merged into the app secret by the secrets module (caller values win).
variable "secrets" {
  description = "Secrets to store in AWS Secrets Manager"
  type = map(string)
  default = {}
}