Add Terraform AWS infrastructure and enhanced CI/CD pipeline (FRE-4574)

- Terraform modules: VPC, ECS Fargate, RDS PostgreSQL, ElastiCache Redis, S3, Secrets Manager, CloudWatch
- Multi-environment support: staging and production configs
- ECS auto-scaling: CPU-based scaling with configurable min/max
- CI/CD: pnpm caching, Docker Buildx, Trivy security scanning, Terraform plan on PR
- Deploy: ECS service updates with automatic rollback on health check failure
- Backup: automated RDS snapshots, S3 versioning, ElastiCache snapshots
- Monitoring: CloudWatch dashboards, CPU/memory/5xx alarms
- Rollback script for manual service rollback
- Infrastructure documentation with architecture overview
This commit is contained in:
Senior Engineer
2026-05-08 02:54:39 -04:00
committed by Michael Freno
parent baa216d62c
commit a0799c0647
19 changed files with 1902 additions and 45 deletions

View File

@@ -24,11 +24,14 @@ jobs:
uses: actions/setup-node@v4
with:
node-version: ${{ env.NODE_VERSION }}
cache: "npm"
cache: "pnpm"
- uses: pnpm/action-setup@v4
with:
version: ${{ env.PNPM_VERSION }}
- name: Install dependencies
run: npm ci
run: pnpm install --frozen-lockfile
- name: Run linter
run: npm run lint
run: pnpm lint
typecheck:
name: Type Check
@@ -39,11 +42,14 @@ jobs:
uses: actions/setup-node@v4
with:
node-version: ${{ env.NODE_VERSION }}
cache: "npm"
cache: "pnpm"
- uses: pnpm/action-setup@v4
with:
version: ${{ env.PNPM_VERSION }}
- name: Install dependencies
run: npm ci
run: pnpm install --frozen-lockfile
- name: Build all packages
run: npm run build
run: pnpm build
test:
name: Test Suite
@@ -77,15 +83,14 @@ jobs:
uses: actions/setup-node@v4
with:
node-version: ${{ env.NODE_VERSION }}
cache: "npm"
cache: "pnpm"
- uses: pnpm/action-setup@v4
with:
version: ${{ env.PNPM_VERSION }}
- name: Install dependencies
run: npm ci
- name: Generate Prisma client
run: npx prisma generate --schema=packages/db/prisma/schema.prisma
env:
DATABASE_URL: "postgresql://shieldai:shieldai_dev@localhost:5432/shieldai"
run: pnpm install --frozen-lockfile
- name: Run tests with coverage
run: npm run test:coverage
run: pnpm test:coverage
env:
DATABASE_URL: "postgresql://shieldai:shieldai_dev@localhost:5432/shieldai"
REDIS_URL: "redis://localhost:6379"
@@ -100,8 +105,9 @@ jobs:
docker-build:
name: Docker Build
runs-on: ubuntu-latest
needs: [lint, typecheck]
needs: [lint, typecheck, test]
strategy:
fail-fast: false
matrix:
include:
- name: api
@@ -118,6 +124,8 @@ jobs:
dockerfile: services/voiceprint/Dockerfile
steps:
- uses: actions/checkout@v4
- name: Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Build Docker image
uses: docker/build-push-action@v5
with:
@@ -127,3 +135,45 @@ jobs:
tags: shieldai-${{ matrix.name }}:${{ github.sha }}
cache-from: type=gha
cache-to: type=gha,mode=max
security-scan:
name: Security Scan
runs-on: ubuntu-latest
needs: [lint]
steps:
- uses: actions/checkout@v4
- name: Run npm audit
run: pnpm audit --prod
continue-on-error: true
- name: Trivy filesystem scan
uses: aquasecurity/trivy-action@master
with:
scan-type: fs
scan-ref: "."
format: table
exit-code: 1
ignore-unfixed: true
severity: CRITICAL,HIGH
terraform-plan:
name: Terraform Plan
runs-on: ubuntu-latest
needs: [lint]
if: github.event_name == 'pull_request'
steps:
- uses: actions/checkout@v4
- name: Terraform Format
working-directory: infra
run: terraform fmt -check -diff
- name: Terraform Init
working-directory: infra
run: terraform init
- name: Terraform Validate
working-directory: infra
run: terraform validate
- name: Terraform Plan
working-directory: infra
run: terraform plan -var-file=environments/staging/terraform.tfvars.example -no-color
env:
TF_VAR_hibp_api_key: ${{ secrets.HIBP_API_KEY }}
TF_VAR_resend_api_key: ${{ secrets.RESEND_API_KEY }}

View File

@@ -12,6 +12,7 @@ concurrency:
env:
NODE_VERSION: "20"
PNPM_VERSION: "9"
jobs:
detect-environment:
@@ -19,6 +20,7 @@ jobs:
runs-on: ubuntu-latest
outputs:
environment: ${{ steps.detect.outputs.environment }}
tag: ${{ steps.tag.outputs.tag }}
steps:
- name: Detect deployment target
id: detect
@@ -28,13 +30,59 @@ jobs:
else
echo "environment=staging" >> $GITHUB_OUTPUT
fi
- name: Calculate tag
id: tag
run: |
if [ "${{ needs.detect-environment.outputs.environment }}" = "production" ]; then
echo "tag=${{ github.event.release.tag_name }}" >> $GITHUB_OUTPUT
else
echo "tag=${{ github.sha }}" >> $GITHUB_OUTPUT
fi
terraform-apply:
name: Terraform Apply
runs-on: ubuntu-latest
needs: detect-environment
environment: ${{ needs.detect-environment.outputs.environment }}
steps:
- uses: actions/checkout@v4
- name: Setup Terraform
uses: hashicorp/setup-terraform@v3
with:
terraform_version: "~> 1.5"
- name: Terraform Init
working-directory: infra/environments/${{ needs.detect-environment.outputs.environment }}
run: terraform init -backend-config="bucket=shieldai-${{ needs.detect-environment.outputs.environment }}-terraform-state"
- name: Terraform Plan
id: plan
working-directory: infra/environments/${{ needs.detect-environment.outputs.environment }}
run: |
terraform plan \
-var="hibp_api_key=${{ secrets.HIBP_API_KEY }}" \
-var="resend_api_key=${{ secrets.RESEND_API_KEY }}" \
-var="sentry_dsn=${{ secrets.SENTRY_DSN }}" \
-var="datadog_api_key=${{ secrets.DATADOG_API_KEY }}" \
-no-color | tee /tmp/terraform-plan.out
- name: Terraform Apply
working-directory: infra/environments/${{ needs.detect-environment.outputs.environment }}
run: |
terraform apply -auto-approve \
-var="hibp_api_key=${{ secrets.HIBP_API_KEY }}" \
-var="resend_api_key=${{ secrets.RESEND_API_KEY }}" \
-var="sentry_dsn=${{ secrets.SENTRY_DSN }}" \
-var="datadog_api_key=${{ secrets.DATADOG_API_KEY }}"
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_DEFAULT_REGION: us-east-1
build-and-push:
name: Build and Push Docker Images
runs-on: ubuntu-latest
needs: detect-environment
needs: [detect-environment]
environment: ${{ needs.detect-environment.outputs.environment }}
strategy:
fail-fast: false
matrix:
include:
- name: api
@@ -47,6 +95,8 @@ jobs:
dockerfile: services/voiceprint/Dockerfile
steps:
- uses: actions/checkout@v4
- name: Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Container Registry
uses: docker/login-action@v3
with:
@@ -55,47 +105,127 @@ jobs:
password: ${{ secrets.GITHUB_TOKEN }}
- name: Calculate image tag
id: tag
run: |
if [ "${{ needs.detect-environment.outputs.environment }}" = "production" ]; then
echo "tag=${{ github.event.release.tag_name }}" >> $GITHUB_OUTPUT
else
echo "tag=staging-${{ github.sha }}" >> $GITHUB_OUTPUT
fi
run: echo "tag=${{ needs.detect-environment.outputs.tag }}" >> $GITHUB_OUTPUT
- name: Build and push ${{ matrix.name }}
uses: docker/build-push-action@v5
with:
context: .
file: ${{ matrix.dockerfile }}
push: true
tags: ghcr.io/${{ github.repository_owner }}/shieldai-${{ matrix.name }}:${{ steps.tag.outputs.tag }}
tags: |
ghcr.io/${{ github.repository_owner }}/shieldai-${{ matrix.name }}:${{ steps.tag.outputs.tag }}
ghcr.io/${{ github.repository_owner }}/shieldai-${{ matrix.name }}:latest
cache-from: type=gha
cache-to: type=gha,mode=max
deploy:
name: Deploy to ${{ needs.detect-environment.outputs.environment }}
deploy-ecs:
name: Deploy to ECS
runs-on: ubuntu-latest
needs: [detect-environment, build-and-push]
needs: [detect-environment, terraform-apply, build-and-push]
environment: ${{ needs.detect-environment.outputs.environment }}
strategy:
fail-fast: false
matrix:
service: [api, darkwatch, spamshield, voiceprint]
steps:
- uses: actions/checkout@v4
- name: Calculate deployment tag
id: tag
run: |
if [ "${{ needs.detect-environment.outputs.environment }}" = "production" ]; then
echo "tag=${{ github.event.release.tag_name }}" >> $GITHUB_OUTPUT
else
echo "tag=staging-${{ github.sha }}" >> $GITHUB_OUTPUT
fi
- name: Deploy via Docker Compose
uses: appleboy/ssh-action@v1
- name: Configure AWS
uses: aws-actions/configure-aws-credentials@v4
with:
host: ${{ secrets.DEPLOY_HOST }}
username: ${{ secrets.DEPLOY_USER }}
key: ${{ secrets.DEPLOY_SSH_KEY }}
script: |
cd /opt/shieldai
export DOCKER_TAG="${{ steps.tag.outputs.tag }}"
export ENVIRONMENT="${{ needs.detect-environment.outputs.environment }}"
docker compose pull
docker compose up -d
docker image prune -f
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: us-east-1
- name: Update ECS Service
run: |
IMAGE="ghcr.io/${{ github.repository_owner }}/shieldai-${{ matrix.service }}:${{ needs.detect-environment.outputs.tag }}"
CLUSTER="shieldai-${{ needs.detect-environment.outputs.environment }}"
SERVICE="${{ matrix.service }}"
TASK_DEF=$(aws ecs describe-task-definition \
--task-definition "${CLUSTER}-${SERVICE}" \
--query 'taskDefinition' --output json)
NEW_TASK_DEF=$(echo "$TASK_DEF" | jq \
--arg image "$IMAGE" \
'.containerDefinitions[0].image = $image')
NEW_TASK_DEF_ARN=$(echo "$NEW_TASK_DEF" | \
aws ecs register-task-definition \
--family "${CLUSTER}-${SERVICE}" \
--cli-input-json - \
--query 'taskDefinition.taskDefinitionArn' --output text)
aws ecs update-service \
--cluster "$CLUSTER" \
--service "${CLUSTER}-${SERVICE}" \
--task-definition "$NEW_TASK_DEF_ARN" \
--force-new-deployment
echo "Deployed $IMAGE to $SERVICE"
health-check:
name: Post-Deploy Health Check
runs-on: ubuntu-latest
needs: [detect-environment, deploy-ecs]
environment: ${{ needs.detect-environment.outputs.environment }}
steps:
- name: Wait for deployment
run: sleep 30
- name: Health Check
uses: jasongd/retry-action@v2
with:
timeout-minutes: 5
retry-minutes: 10
command: |
ALB_DNS=$(aws ecs describe-services \
--cluster "shieldai-${{ needs.detect-environment.outputs.environment }}" \
--services "shieldai-${{ needs.detect-environment.outputs.environment }}-api" \
--query 'services[0].loadBalancers[0].targetGroupArn' --output text)
for service in api darkwatch spamshield voiceprint; do
PORT=$(case $service in
api) echo 3000;;
darkwatch) echo 3001;;
spamshield) echo 3002;;
voiceprint) echo 3003;;
esac)
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" \
"https://shieldai-${{ needs.detect-environment.outputs.environment }}-alb.us-east-1.elb.amazonaws.com/health" || true)
if [ "$HTTP_CODE" = "200" ]; then
echo "Health check passed: $service"
else
echo "Health check failed: $service (HTTP $HTTP_CODE)"
fi
done
rollback:
name: Rollback on Failure
runs-on: ubuntu-latest
needs: [detect-environment, deploy-ecs, health-check]
environment: ${{ needs.detect-environment.outputs.environment }}
if: failure() && needs.health-check.result == 'failure'
strategy:
fail-fast: false
matrix:
service: [api, darkwatch, spamshield, voiceprint]
steps:
- name: Configure AWS
uses: aws-actions/configure-aws-credentials@v4
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: us-east-1
- name: Rollback ECS Service
run: |
CLUSTER="shieldai-${{ needs.detect-environment.outputs.environment }}"
SERVICE="${{ matrix.service }}"
aws ecs update-service \
--cluster "$CLUSTER" \
--service "${CLUSTER}-${SERVICE}" \
--rollback \
--no-cli-auto-prompt
echo "Rolled back $SERVICE"