diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 565e983..96c3cb3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,11 +24,14 @@ jobs: uses: actions/setup-node@v4 with: node-version: ${{ env.NODE_VERSION }} - cache: "npm" + cache: "pnpm" + - uses: pnpm/action-setup@v4 + with: + version: ${{ env.PNPM_VERSION }} - name: Install dependencies - run: npm ci + run: pnpm install --frozen-lockfile - name: Run linter - run: npm run lint + run: pnpm lint typecheck: name: Type Check @@ -39,11 +42,14 @@ jobs: uses: actions/setup-node@v4 with: node-version: ${{ env.NODE_VERSION }} - cache: "npm" + cache: "pnpm" + - uses: pnpm/action-setup@v4 + with: + version: ${{ env.PNPM_VERSION }} - name: Install dependencies - run: npm ci + run: pnpm install --frozen-lockfile - name: Build all packages - run: npm run build + run: pnpm build test: name: Test Suite @@ -77,15 +83,14 @@ jobs: uses: actions/setup-node@v4 with: node-version: ${{ env.NODE_VERSION }} - cache: "npm" + cache: "pnpm" + - uses: pnpm/action-setup@v4 + with: + version: ${{ env.PNPM_VERSION }} - name: Install dependencies - run: npm ci - - name: Generate Prisma client - run: npx prisma generate --schema=packages/db/prisma/schema.prisma - env: - DATABASE_URL: "postgresql://shieldai:shieldai_dev@localhost:5432/shieldai" + run: pnpm install --frozen-lockfile - name: Run tests with coverage - run: npm run test:coverage + run: pnpm test:coverage env: DATABASE_URL: "postgresql://shieldai:shieldai_dev@localhost:5432/shieldai" REDIS_URL: "redis://localhost:6379" @@ -100,8 +105,9 @@ jobs: docker-build: name: Docker Build runs-on: ubuntu-latest - needs: [lint, typecheck] + needs: [lint, typecheck, test] strategy: + fail-fast: false matrix: include: - name: api @@ -118,6 +124,8 @@ jobs: dockerfile: services/voiceprint/Dockerfile steps: - uses: actions/checkout@v4 + - name: Docker Buildx + uses: docker/setup-buildx-action@v3 - name: Build Docker image uses: docker/build-push-action@v5 
with: @@ -127,3 +135,45 @@ jobs: tags: shieldai-${{ matrix.name }}:${{ github.sha }} cache-from: type=gha cache-to: type=gha,mode=max + + security-scan: + name: Security Scan + runs-on: ubuntu-latest + needs: [lint] + steps: + - uses: actions/checkout@v4 + - name: Run npm audit + run: pnpm audit --prod + continue-on-error: true + - name: Trivy filesystem scan + uses: aquasecurity/trivy-action@master + with: + scan-type: fs + scan-ref: "." + format: table + exit-code: 1 + ignore-unfixed: true + severity: CRITICAL,HIGH + + terraform-plan: + name: Terraform Plan + runs-on: ubuntu-latest + needs: [lint] + if: github.event_name == 'pull_request' + steps: + - uses: actions/checkout@v4 + - name: Terraform Format + working-directory: infra + run: terraform fmt -check -diff + - name: Terraform Init + working-directory: infra + run: terraform init + - name: Terraform Validate + working-directory: infra + run: terraform validate + - name: Terraform Plan + working-directory: infra + run: terraform plan -var-file=environments/staging/terraform.tfvars.example -no-color + env: + TF_VAR_hibp_api_key: ${{ secrets.HIBP_API_KEY }} + TF_VAR_resend_api_key: ${{ secrets.RESEND_API_KEY }} diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 8687bc0..7cd3dd9 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -12,6 +12,7 @@ concurrency: env: NODE_VERSION: "20" + PNPM_VERSION: "9" jobs: detect-environment: @@ -19,6 +20,7 @@ jobs: runs-on: ubuntu-latest outputs: environment: ${{ steps.detect.outputs.environment }} + tag: ${{ steps.tag.outputs.tag }} steps: - name: Detect deployment target id: detect @@ -28,13 +30,59 @@ jobs: else echo "environment=staging" >> $GITHUB_OUTPUT fi + - name: Calculate tag + id: tag + run: | + if [ "${{ needs.detect-environment.outputs.environment }}" = "production" ]; then + echo "tag=${{ github.event.release.tag_name }}" >> $GITHUB_OUTPUT + else + echo "tag=${{ github.sha }}" >> $GITHUB_OUTPUT + fi + + 
terraform-apply:
+    name: Terraform Apply
+    runs-on: ubuntu-latest
+    needs: detect-environment
+    environment: ${{ needs.detect-environment.outputs.environment }}
+    # AWS credentials must be job-level: the S3 backend makes Terraform Init
+    # (and Plan) call AWS too. Originally this env block sat on the Apply
+    # step only, so Init failed before anything could be applied.
+    env:
+      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+      AWS_DEFAULT_REGION: us-east-1
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup Terraform
+        uses: hashicorp/setup-terraform@v3
+        with:
+          terraform_version: "~> 1.5"
+      - name: Terraform Init
+        working-directory: infra/environments/${{ needs.detect-environment.outputs.environment }}
+        run: terraform init -backend-config="bucket=shieldai-${{ needs.detect-environment.outputs.environment }}-terraform-state"
+      - name: Terraform Plan
+        id: plan
+        working-directory: infra/environments/${{ needs.detect-environment.outputs.environment }}
+        run: |
+          terraform plan \
+            -var="hibp_api_key=${{ secrets.HIBP_API_KEY }}" \
+            -var="resend_api_key=${{ secrets.RESEND_API_KEY }}" \
+            -var="sentry_dsn=${{ secrets.SENTRY_DSN }}" \
+            -var="datadog_api_key=${{ secrets.DATADOG_API_KEY }}" \
+            -no-color | tee /tmp/terraform-plan.out
+      - name: Terraform Apply
+        working-directory: infra/environments/${{ needs.detect-environment.outputs.environment }}
+        run: |
+          terraform apply -auto-approve \
+            -var="hibp_api_key=${{ secrets.HIBP_API_KEY }}" \
+            -var="resend_api_key=${{ secrets.RESEND_API_KEY }}" \
+            -var="sentry_dsn=${{ secrets.SENTRY_DSN }}" \
+            -var="datadog_api_key=${{ secrets.DATADOG_API_KEY }}"
 
   build-and-push:
     name: Build and Push Docker Images
     runs-on: ubuntu-latest
-    needs: detect-environment
+    needs: [detect-environment]
     environment: ${{ needs.detect-environment.outputs.environment }}
     strategy:
+      fail-fast: false
       matrix:
         include:
           - name: api
@@ -47,6 +95,8 @@ jobs:
             dockerfile: services/voiceprint/Dockerfile
     steps:
       - uses: actions/checkout@v4
+      - name: Docker Buildx
+        uses: docker/setup-buildx-action@v3
       - name: Login to Container Registry
         uses: docker/login-action@v3
         with:
@@ -55,47 +105,127 @@ jobs:
password: ${{ secrets.GITHUB_TOKEN }} - name: Calculate image tag id: tag - run: | - if [ "${{ needs.detect-environment.outputs.environment }}" = "production" ]; then - echo "tag=${{ github.event.release.tag_name }}" >> $GITHUB_OUTPUT - else - echo "tag=staging-${{ github.sha }}" >> $GITHUB_OUTPUT - fi + run: echo "tag=${{ needs.detect-environment.outputs.tag }}" >> $GITHUB_OUTPUT - name: Build and push ${{ matrix.name }} uses: docker/build-push-action@v5 with: context: . file: ${{ matrix.dockerfile }} push: true - tags: ghcr.io/${{ github.repository_owner }}/shieldai-${{ matrix.name }}:${{ steps.tag.outputs.tag }} + tags: | + ghcr.io/${{ github.repository_owner }}/shieldai-${{ matrix.name }}:${{ steps.tag.outputs.tag }} + ghcr.io/${{ github.repository_owner }}/shieldai-${{ matrix.name }}:latest cache-from: type=gha cache-to: type=gha,mode=max - deploy: - name: Deploy to ${{ needs.detect-environment.outputs.environment }} + deploy-ecs: + name: Deploy to ECS runs-on: ubuntu-latest - needs: [detect-environment, build-and-push] + needs: [detect-environment, terraform-apply, build-and-push] environment: ${{ needs.detect-environment.outputs.environment }} + strategy: + fail-fast: false + matrix: + service: [api, darkwatch, spamshield, voiceprint] steps: - uses: actions/checkout@v4 - - name: Calculate deployment tag - id: tag - run: | - if [ "${{ needs.detect-environment.outputs.environment }}" = "production" ]; then - echo "tag=${{ github.event.release.tag_name }}" >> $GITHUB_OUTPUT - else - echo "tag=staging-${{ github.sha }}" >> $GITHUB_OUTPUT - fi - - name: Deploy via Docker Compose - uses: appleboy/ssh-action@v1 + - name: Configure AWS + uses: aws-actions/configure-aws-credentials@v4 with: - host: ${{ secrets.DEPLOY_HOST }} - username: ${{ secrets.DEPLOY_USER }} - key: ${{ secrets.DEPLOY_SSH_KEY }} - script: | - cd /opt/shieldai - export DOCKER_TAG="${{ steps.tag.outputs.tag }}" - export ENVIRONMENT="${{ needs.detect-environment.outputs.environment }}" - docker 
compose pull - docker compose up -d - docker image prune -f + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-1 + - name: Update ECS Service + run: | + IMAGE="ghcr.io/${{ github.repository_owner }}/shieldai-${{ matrix.service }}:${{ needs.detect-environment.outputs.tag }}" + CLUSTER="shieldai-${{ needs.detect-environment.outputs.environment }}" + SERVICE="${{ matrix.service }}" + + TASK_DEF=$(aws ecs describe-task-definition \ + --task-definition "${CLUSTER}-${SERVICE}" \ + --query 'taskDefinition' --output json) + + NEW_TASK_DEF=$(echo "$TASK_DEF" | jq \ + --arg image "$IMAGE" \ + '.containerDefinitions[0].image = $image') + + NEW_TASK_DEF_ARN=$(echo "$NEW_TASK_DEF" | \ + aws ecs register-task-definition \ + --family "${CLUSTER}-${SERVICE}" \ + --cli-input-json - \ + --query 'taskDefinition.taskDefinitionArn' --output text) + + aws ecs update-service \ + --cluster "$CLUSTER" \ + --service "${CLUSTER}-${SERVICE}" \ + --task-definition "$NEW_TASK_DEF_ARN" \ + --force-new-deployment + + echo "Deployed $IMAGE to $SERVICE" + + health-check: + name: Post-Deploy Health Check + runs-on: ubuntu-latest + needs: [detect-environment, deploy-ecs] + environment: ${{ needs.detect-environment.outputs.environment }} + steps: + - name: Wait for deployment + run: sleep 30 + - name: Health Check + uses: jasongd/retry-action@v2 + with: + timeout-minutes: 5 + retry-minutes: 10 + command: | + ALB_DNS=$(aws ecs describe-services \ + --cluster "shieldai-${{ needs.detect-environment.outputs.environment }}" \ + --services "shieldai-${{ needs.detect-environment.outputs.environment }}-api" \ + --query 'services[0].loadBalancers[0].targetGroupArn' --output text) + + for service in api darkwatch spamshield voiceprint; do + PORT=$(case $service in + api) echo 3000;; + darkwatch) echo 3001;; + spamshield) echo 3002;; + voiceprint) echo 3003;; + esac) + + HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" \ + 
"https://shieldai-${{ needs.detect-environment.outputs.environment }}-alb.us-east-1.elb.amazonaws.com/health" || true)
+
+            if [ "$HTTP_CODE" = "200" ]; then
+              echo "Health check passed: $service"
+            else
+              echo "Health check failed: $service (HTTP $HTTP_CODE)"
+            fi
+          done
+
+  rollback:
+    name: Rollback on Failure
+    runs-on: ubuntu-latest
+    needs: [detect-environment, deploy-ecs, health-check]
+    environment: ${{ needs.detect-environment.outputs.environment }}
+    if: failure() && needs.health-check.result == 'failure'
+    strategy:
+      fail-fast: false
+      matrix:
+        service: [api, darkwatch, spamshield, voiceprint]
+    steps:
+      - name: Configure AWS
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: us-east-1
+      - name: Rollback ECS Service
+        # `aws ecs update-service` has no --rollback flag; roll back by
+        # pointing the service at the previous task-definition revision.
+        run: |
+          CLUSTER="shieldai-${{ needs.detect-environment.outputs.environment }}"
+          SERVICE="${{ matrix.service }}"
+
+          REV=$(aws ecs describe-task-definition --task-definition "${CLUSTER}-${SERVICE}" --query 'taskDefinition.revision' --output text)
+          aws ecs update-service \
+            --cluster "$CLUSTER" \
+            --service "${CLUSTER}-${SERVICE}" \
+            --task-definition "${CLUSTER}-${SERVICE}:$((REV - 1))" --force-new-deployment
+
+          echo "Rolled back $SERVICE"
diff --git a/infra/.gitignore b/infra/.gitignore
new file mode 100644
index 0000000..f601f47
--- /dev/null
+++ b/infra/.gitignore
@@ -0,0 +1,9 @@
+.terraform/
+*.tfstate
+*.tfstate.backup
+*.tfvars
+crash.log
+override.tf
+override.tf.json
+*_override.tf
+*_override.tf.json
diff --git a/infra/README.md b/infra/README.md
new file mode 100644
index 0000000..7ba03d7
--- /dev/null
+++ b/infra/README.md
@@ -0,0 +1,114 @@
+/infra/
+├── main.tf                  # Root module: VPC, ECS, RDS, ElastiCache, S3, Secrets, CloudWatch
+├── variables.tf             # Input variables with validation
+├── outputs.tf               # Output values (endpoints, ARNs, URLs)
+├── modules/
+│   ├── vpc/main.tf          # VPC, subnets, IGW, NAT GW, security groups
+│   ├── ecs/main.tf          # ECS cluster, task definitions, services, ALB, auto-scaling
+│   ├── rds/main.tf          # RDS PostgreSQL with automated backups
+│   
├── elasticache/main.tf # ElastiCache Redis with replication +│ ├── s3/main.tf # S3 buckets: state, artifacts, logs +│ ├── secrets/main.tf # AWS Secrets Manager +│ └── cloudwatch/main.tf # Dashboards, alarms, notifications +├── environments/ +│ ├── staging/main.tf # Staging environment config +│ └── production/main.tf # Production environment config +└── scripts/ + └── rollback.sh # Manual rollback script + +## Quick Start + +### Prerequisites +- Terraform >= 1.5.0 +- AWS CLI configured with appropriate credentials +- AWS account with ECS, RDS, ElastiCache permissions + +### Initialize +```bash +cd infra/environments/staging +terraform init +terraform plan -var-file=terraform.tfvars.example +terraform apply -var-file=terraform.tfvars.example +``` + +### Deploy via CI/CD +- Push to `main` → deploys to staging +- Create a release → deploys to production +- Health check failure → automatic rollback + +## Architecture + +### Networking +- VPC with public/private subnets across multiple AZs +- NAT Gateway for outbound traffic from private subnets +- Security groups: ECS → RDS (5432), ECS → ElastiCache (6379) + +### Compute +- ECS Fargate for serverless container orchestration +- Application Load Balancer with health checks +- Auto-scaling: CPU-based scaling (70% target) +- Production: 3 replicas per service, min 2, max 10 + +### Data +- RDS PostgreSQL 16.2 with Multi-AZ (production) +- Automated daily backups, 7-14 day retention +- ElastiCache Redis 7.0 with replication +- S3 with versioning and lifecycle policies + +### Secrets +- AWS Secrets Manager for all credentials +- ECS task execution role with SecretsManagerReadOnly +- DB credentials auto-rotated via RDS integration + +### Monitoring +- CloudWatch dashboards: CPU, memory, ALB metrics +- Alarms: CPU >80%, memory >85%, 5xx >10/min, RDS storage <500MB +- Container Insights enabled for ECS +- Logs: 30-day retention (production), 7-day (staging) + +### Backup Strategy +- RDS: automated snapshots every 24h, 7-14 day 
retention +- RDS: Multi-AZ for automatic failover (production) +- ElastiCache: daily snapshots, 1-7 day retention +- S3: versioning enabled, non-current versions expire after 30 days +- Terraform state: S3 with versioning + DynamoDB locking + +## Rollback + +### Automatic (CI/CD) +The deploy workflow triggers automatic rollback when health checks fail: +``` +deploy-ecs → health-check (failure) → rollback +``` + +### Manual +```bash +# Rollback specific service +cd infra/scripts +./rollback.sh staging api + +# Rollback all services +./rollback.sh staging all +``` + +### Database Migration Rollback +```bash +# Run previous migration +DATABASE_URL=$(aws secretsmanager get-secret-value \ + --secret-id shieldai-staging-db-password \ + --query 'SecretString' --output json | jq -r '.host') + +npx prisma migrate resolve --applied +npx prisma migrate deploy +``` + +## GitHub Secrets Required +| Secret | Description | +|--------|-------------| +| AWS_ACCESS_KEY_ID | IAM user with ECS, RDS, ElastiCache permissions | +| AWS_SECRET_ACCESS_KEY | IAM secret key | +| HIBP_API_KEY | Have I Been Pwned API key | +| RESEND_API_KEY | Resend email API key | +| SENTRY_DSN | Sentry error tracking DSN | +| DATADOG_API_KEY | Datadog monitoring API key | +| GITHUB_TOKEN | Auto-provided, needs write:packages scope | diff --git a/infra/environments/production/main.tf b/infra/environments/production/main.tf new file mode 100644 index 0000000..6cdce9d --- /dev/null +++ b/infra/environments/production/main.tf @@ -0,0 +1,57 @@ +terraform { + backend "s3" { + bucket = "shieldai-production-terraform-state" + key = "production/terraform.tfstate" + region = "us-east-1" + encrypt = true + dynamodb_table = "shieldai-terraform-locks" + } +} + +module "shieldai" { + source = "../.." 
+ + environment = "production" + aws_region = "us-east-1" + project_name = "shieldai" + vpc_cidr = "10.1.0.0/16" + az_count = 3 + + db_instance_class = "db.r6g.large" + db_multi_az = true + db_backup_retention = 14 + + elasticache_node_type = "cache.r6g.large" + elasticache_num_nodes = 3 + + secrets = { + HIBP_API_KEY = var.hibp_api_key + RESEND_API_KEY = var.resend_api_key + SENTRY_DSN = var.sentry_dsn + DATADOG_API_KEY = var.datadog_api_key + } +} + +variable "hibp_api_key" { + description = "Have I Been Pwned API key" + type = string + sensitive = true +} + +variable "resend_api_key" { + description = "Resend API key" + type = string + sensitive = true +} + +variable "sentry_dsn" { + description = "Sentry DSN" + type = string + sensitive = true +} + +variable "datadog_api_key" { + description = "Datadog API key" + type = string + sensitive = true +} diff --git a/infra/environments/production/terraform.tfvars.example b/infra/environments/production/terraform.tfvars.example new file mode 100644 index 0000000..fa09d2c --- /dev/null +++ b/infra/environments/production/terraform.tfvars.example @@ -0,0 +1,4 @@ +hibp_api_key = "YOUR_HIBP_API_KEY" +resend_api_key = "YOUR_RESEND_API_KEY" +sentry_dsn = "YOUR_SENTRY_DSN" +datadog_api_key = "YOUR_DATADOG_API_KEY" diff --git a/infra/environments/staging/main.tf b/infra/environments/staging/main.tf new file mode 100644 index 0000000..f297fa2 --- /dev/null +++ b/infra/environments/staging/main.tf @@ -0,0 +1,57 @@ +terraform { + backend "s3" { + bucket = "shieldai-staging-terraform-state" + key = "staging/terraform.tfstate" + region = "us-east-1" + encrypt = true + dynamodb_table = "shieldai-terraform-locks" + } +} + +module "shieldai" { + source = "../.." 
+ + environment = "staging" + aws_region = "us-east-1" + project_name = "shieldai" + vpc_cidr = "10.0.0.0/16" + az_count = 2 + + db_instance_class = "db.t3.medium" + db_multi_az = false + db_backup_retention = 3 + + elasticache_node_type = "cache.t3.small" + elasticache_num_nodes = 1 + + secrets = { + HIBP_API_KEY = var.hibp_api_key + RESEND_API_KEY = var.resend_api_key + SENTRY_DSN = var.sentry_dsn + DATADOG_API_KEY = var.datadog_api_key + } +} + +variable "hibp_api_key" { + description = "Have I Been Pwned API key" + type = string + sensitive = true +} + +variable "resend_api_key" { + description = "Resend API key" + type = string + sensitive = true +} + +variable "sentry_dsn" { + description = "Sentry DSN" + type = string + sensitive = true +} + +variable "datadog_api_key" { + description = "Datadog API key" + type = string + sensitive = true +} diff --git a/infra/environments/staging/terraform.tfvars.example b/infra/environments/staging/terraform.tfvars.example new file mode 100644 index 0000000..fa09d2c --- /dev/null +++ b/infra/environments/staging/terraform.tfvars.example @@ -0,0 +1,4 @@ +hibp_api_key = "YOUR_HIBP_API_KEY" +resend_api_key = "YOUR_RESEND_API_KEY" +sentry_dsn = "YOUR_SENTRY_DSN" +datadog_api_key = "YOUR_DATADOG_API_KEY" diff --git a/infra/main.tf b/infra/main.tf new file mode 100644 index 0000000..5929d5d --- /dev/null +++ b/infra/main.tf @@ -0,0 +1,107 @@ +terraform { + required_version = ">= 1.5.0" + + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.30" + } + github = { + source = "integrations/github" + version = "~> 6.0" + } + } + + backend "s3" { + bucket = "shieldai-terraform-state" + key = "global/terraform.tfstate" + region = "us-east-1" + encrypt = true + dynamodb_table = "shieldai-terraform-locks" + } +} + +provider "aws" { + region = var.aws_region + + default_tags { + tags = { + Project = "ShieldAI" + ManagedBy = "terraform" + Environment = var.environment + } + } +} + +module "vpc" { + source = 
"./modules/vpc" + + environment = var.environment + vpc_cidr = var.vpc_cidr + az_count = var.az_count + project_name = var.project_name +} + +module "ecs" { + source = "./modules/ecs" + + environment = var.environment + cluster_name = "${var.project_name}-${var.environment}" + vpc_id = module.vpc.vpc_id + subnet_ids = module.vpc.private_subnet_ids + security_group_ids = [module.vpc.ecs_security_group_id] + services = var.services + container_images = var.container_images + secrets_arn = module.secrets.secrets_manager_arn +} + +module "rds" { + source = "./modules/rds" + + environment = var.environment + vpc_id = module.vpc.vpc_id + subnet_ids = module.vpc.private_subnet_ids + security_group_id = module.vpc.rds_security_group_id + db_name = var.db_name + db_instance_class = var.db_instance_class + multi_az = var.db_multi_az + backup_retention = var.db_backup_retention + project_name = var.project_name +} + +module "elasticache" { + source = "./modules/elasticache" + + environment = var.environment + vpc_id = module.vpc.vpc_id + subnet_ids = module.vpc.private_subnet_ids + security_group_id = module.vpc.elasticache_security_group_id + node_type = var.elasticache_node_type + num_nodes = var.elasticache_num_nodes + project_name = var.project_name +} + +module "s3" { + source = "./modules/s3" + + environment = var.environment + project_name = var.project_name +} + +module "secrets" { + source = "./modules/secrets" + + environment = var.environment + project_name = var.project_name + secrets = var.secrets +} + +module "cloudwatch" { + source = "./modules/cloudwatch" + + environment = var.environment + cluster_name = "${var.project_name}-${var.environment}" + project_name = var.project_name + rds_identifier = module.rds.db_instance_identifier + cache_endpoint = module.elasticache.cache_endpoint +} diff --git a/infra/modules/cloudwatch/main.tf b/infra/modules/cloudwatch/main.tf new file mode 100644 index 0000000..2505bf4 --- /dev/null +++ b/infra/modules/cloudwatch/main.tf 
@@ -0,0 +1,183 @@ +variable "environment" { + description = "Deployment environment" + type = string +} + +variable "cluster_name" { + description = "ECS cluster name" + type = string +} + +variable "project_name" { + description = "Project name" + type = string +} + +variable "rds_identifier" { + description = "RDS instance identifier" + type = string +} + +variable "cache_endpoint" { + description = "ElastiCache endpoint" + type = string +} + +resource "aws_cloudwatch_dashboard" "main" { + dashboard_name = "${var.project_name}-${var.environment}-dashboard" + + dashboard_body = jsonencode({ + widgets = [ + { + type = "metric" + properties = { + title = "ECS CPU Utilization" + metrics = [ + ["AWS/ECS", "CPUUtilization", "ClusterName", var.cluster_name] + ] + view = "timeSeries" + stacked = false + region = "us-east-1" + period = 300 + } + }, + { + type = "metric" + properties = { + title = "ECS Memory Utilization" + metrics = [ + ["AWS/ECS", "MemoryUtilization", "ClusterName", var.cluster_name] + ] + view = "timeSeries" + stacked = false + region = "us-east-1" + period = 300 + } + }, + { + type = "metric" + properties = { + title = "RDS CPU Utilization" + metrics = [ + ["AWS/RDS", "CPUUtilization", "DBInstanceIdentifier", var.rds_identifier] + ] + view = "timeSeries" + stacked = false + region = "us-east-1" + period = 300 + } + }, + { + type = "metric" + properties = { + title = "ALB Request Count" + metrics = [ + ["AWS/ApplicationELB", "RequestCount", "LoadBalancer", "${var.cluster_name}-alb"] + ] + view = "timeSeries" + stacked = false + region = "us-east-1" + period = 60 + } + }, + { + type = "metric" + properties = { + title = "ALB 5xx Errors" + metrics = [ + ["AWS/ApplicationELB", "HTTPCode_Elb_5XX_Count", "LoadBalancer", "${var.cluster_name}-alb"] + ] + view = "timeSeries" + stacked = false + region = "us-east-1" + period = 60 + } + } + ] + }) +} + +resource "aws_cloudwatch_metric_alarm" "ecs_cpu_high" { + alarm_name = 
"${var.project_name}-${var.environment}-ecs-cpu-high" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = 2 + metric_name = "CPUUtilization" + namespace = "AWS/ECS" + period = 300 + statistic = "Average" + threshold = 80 + alarm_description = "ECS CPU utilization above 80%" + + dimensions = { + ClusterName = var.cluster_name + } +} + +resource "aws_cloudwatch_metric_alarm" "ecs_memory_high" { + alarm_name = "${var.project_name}-${var.environment}-ecs-memory-high" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = 2 + metric_name = "MemoryUtilization" + namespace = "AWS/ECS" + period = 300 + statistic = "Average" + threshold = 85 + alarm_description = "ECS memory utilization above 85%" + + dimensions = { + ClusterName = var.cluster_name + } +} + +resource "aws_cloudwatch_metric_alarm" "alb_5xx" { + alarm_name = "${var.project_name}-${var.environment}-alb-5xx" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = 3 + metric_name = "HTTPCode_Elb_5XX_Count" + namespace = "AWS/ApplicationELB" + period = 60 + statistic = "Sum" + threshold = 10 + alarm_description = "ALB 5xx errors above 10 per minute" + + dimensions = { + LoadBalancer = "${var.cluster_name}-alb" + } +} + +resource "aws_cloudwatch_metric_alarm" "rds_cpu_high" { + alarm_name = "${var.project_name}-${var.environment}-rds-cpu-high" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = 2 + metric_name = "CPUUtilization" + namespace = "AWS/RDS" + period = 300 + statistic = "Average" + threshold = 75 + alarm_description = "RDS CPU utilization above 75%" + + dimensions = { + DBInstanceIdentifier = var.rds_identifier + } +} + +resource "aws_cloudwatch_metric_alarm" "rds_free_storage" { + alarm_name = "${var.project_name}-${var.environment}-rds-free-storage" + comparison_operator = "LessThanThreshold" + evaluation_periods = 2 + metric_name = "FreeStorageSpace" + namespace = "AWS/RDS" + period = 300 + statistic = "Average" + threshold = 524288000 
+ alarm_description = "RDS free storage below 500MB" + + dimensions = { + DBInstanceIdentifier = var.rds_identifier + } +} + +output "dashboard_url" { + description = "CloudWatch dashboard URL" + value = "https://us-east-1.console.aws.amazon.com/cloudwatch/home#dashboards/dashboard/${var.project_name}-${var.environment}-dashboard" +} diff --git a/infra/modules/ecs/main.tf b/infra/modules/ecs/main.tf new file mode 100644 index 0000000..722c7a5 --- /dev/null +++ b/infra/modules/ecs/main.tf @@ -0,0 +1,355 @@ +variable "environment" { + description = "Deployment environment" + type = string +} + +variable "cluster_name" { + description = "ECS cluster name" + type = string +} + +variable "vpc_id" { + description = "VPC ID" + type = string +} + +variable "subnet_ids" { + description = "Private subnet IDs" + type = list(string) +} + +variable "security_group_ids" { + description = "Security group IDs" + type = list(string) +} + +variable "services" { + description = "ECS services to deploy" + type = map(object({ + cpu = number + memory = number + port = number + })) +} + +variable "container_images" { + description = "Container image tags" + type = map(string) +} + +variable "secrets_arn" { + description = "Secrets Manager ARN" + type = string +} + +resource "aws_ecs_cluster" "main" { + name = var.cluster_name + + settings { + name = "containerInsights" + value = "enabled" + } + + tags = { + Name = var.cluster_name + } +} + +resource "aws_ecs_cluster_capacity_providers" "main" { + cluster_name = aws_ecs_cluster.main.name + + capacity_providers = ["FARGATE"] + + default_capacity_provider_strategy { + base = 1 + weight = 100 + capacity_provider = "FARGATE" + } +} + +resource "aws_ecs_task_definition" "services" { + for_each = var.services + + family = "${var.cluster_name}-${each.key}" + + container_definitions = jsonencode([ + { + name = each.key + image = "ghcr.io/shieldai/shieldai-${each.key}:${var.container_images[each.key]}" + cpu = each.cpu + memory = each.memory + 
essential = true + + portMappings = [ + { + containerPort = each.port + hostPort = each.port + protocol = "tcp" + } + ] + + environment = [ + { + name = "NODE_ENV" + value = var.environment + }, + { + name = "PORT" + value = tostring(each.port) + } + ] + + secrets = [ + { + name = "DATABASE_URL" + valueFrom = "${var.secrets_arn}:DATABASE_URL::" + }, + { + name = "REDIS_URL" + valueFrom = "${var.secrets_arn}:REDIS_URL::" + }, + { + name = "HIBP_API_KEY" + valueFrom = "${var.secrets_arn}:HIBP_API_KEY::" + }, + { + name = "RESEND_API_KEY" + valueFrom = "${var.secrets_arn}:RESEND_API_KEY::" + } + ] + + logConfiguration = { + logDriver = "awslogs" + options = { + "awslogs-group" = "/ecs/${var.cluster_name}-${each.key}" + "awslogs-region" = "us-east-1" + "awslogs-stream-prefix" = each.key + } + } + + healthCheck = { + command = ["CMD-SHELL", "wget -q --spider http://localhost:${each.port}/health || exit 1"] + interval = 30 + timeout = 5 + retries = 3 + startPeriod = 60 + } + } + ]) + + network_mode = "awsvpc" + memory = each.memory + cpu = each.cpu + requires_compatibilities = ["FARGATE"] + + execution_role_arn = aws_iam_role.execution[each.key].arn + task_role_arn = aws_iam_role.task[each.key].arn + + tags = { + Name = "${var.cluster_name}-${each.key}" + } +} + +resource "aws_iam_role" "execution" { + for_each = var.services + + name = "${var.cluster_name}-${each.key}-execution" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = "ecs-tasks.amazonaws.com" + } + } + ] + }) + + managed_policy_arns = [ + "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy" + ] +} + +resource "aws_iam_role" "task" { + for_each = var.services + + name = "${var.cluster_name}-${each.key}-task" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = 
"ecs-tasks.amazonaws.com"
+        }
+      }
+    ]
+  })
+
+  managed_policy_arns = [
+    "arn:aws:iam::aws:policy/SecretsManagerReadWrite"
+  ]
+
+  inline_policy {
+    name = "elasticache-access"
+    policy = jsonencode({
+      Version = "2012-10-17"
+      Statement = [
+        {
+          Effect = "Allow"
+          Action = [
+            "elasticache:DescribeCacheClusters",
+            "elasticache:DescribeCacheSubnetGroups"
+          ]
+          Resource = "*"
+        }
+      ]
+    })
+  }
+}
+
+resource "aws_ecs_service" "services" {
+  for_each = var.services
+
+  name            = "${var.cluster_name}-${each.key}"
+  cluster         = aws_ecs_cluster.main.id
+  task_definition = aws_ecs_task_definition.services[each.key].arn
+  desired_count   = var.environment == "production" ? 3 : 1
+
+  launch_type = "FARGATE"
+
+  network_configuration {
+    subnets          = var.subnet_ids
+    security_groups  = var.security_group_ids
+    assign_public_ip = false
+  }
+
+  load_balancer {
+    target_group_arn = aws_lb_target_group.services[each.key].arn
+    container_name   = each.key
+    container_port   = each.value.port
+  }
+
+  # NOTE(review): aws_ecs_service has no "auto_scaling" block; desired-count
+  # scaling is configured via the aws_appautoscaling_target / policy
+  # resources below, which already carry the production-vs-staging min/max
+  # capacities, so the invalid block was removed.
+
+  tags = {
+    Name    = "${var.cluster_name}-${each.key}"
+    Service = each.key
+  }
+
+  depends_on = [
+    aws_lb_listener.services
+  ]
+}
+
+resource "aws_lb" "main" {
+  name               = "${var.cluster_name}-alb"
+  internal           = false
+  load_balancer_type = "application"
+  security_groups    = var.security_group_ids
+  subnets            = var.subnet_ids
+
+  tags = {
+    Name = "${var.cluster_name}-alb"
+  }
+}
+
+resource "aws_lb_target_group" "services" {
+  for_each = var.services
+
+  name     = "${var.cluster_name}-${each.key}-tg"
+  port     = each.value.port
+  protocol = "HTTP"
+  vpc_id   = var.vpc_id
+
+  health_check {
+    enabled             = true
+    healthy_threshold   = 3
+    interval            = 30
+    matcher             = "200"
+    path                = "/health"
+    port                = "traffic-port"
+    protocol            = "HTTP"
+    timeout             = 5
+    unhealthy_threshold = 3
+  }
+
+  stickiness {
+    type            = "lb_cookie"
+    cookie_duration = 86400
+  }
+}
+
+resource "aws_lb_listener" "services" {
+  for_each = var.services
+
+  load_balancer_arn = aws_lb.main.arn
+  port              = each.value.port
+  protocol          = "HTTP"
+
+  default_action {
+    type             = "forward"
+    target_group_arn = aws_lb_target_group.services[each.key].arn
+  }
+}
+
+resource "aws_appautoscaling_target" "services" {
+  for_each = var.services
+
+  service_namespace  = "ecs"
+  resource_id        = "service/${aws_ecs_cluster.main.name}/${aws_ecs_service.services[each.key].name}"
+  scalable_dimension = "ecs:service:DesiredCount"
+  min_capacity       = var.environment == "production" ? 2 : 1
+  max_capacity       = var.environment == "production" ? 
10 : 3 +} + +resource "aws_appautoscaling_policy" "cpu" { + for_each = var.services + + name = "${var.cluster_name}-${each.key}-cpu-scaling" + service_namespace = "ecs" + resource_id = "service/${aws_ecs_cluster.main.name}/${aws_ecs_service.services[each.key].name}" + scalable_dimension = "ecs:service:DesiredCount" + + target_tracking_scaling_policy_configuration { + target_value = 70.0 + scale_in_cooldown = 60 + scale_out_cooldown = 30 + + customized_metric_specification { + metric_name = "CPUUtilization" + namespace = "AWS/ECS" + statistic = "Average" + dimensions = [{ name = "ClusterName", value = aws_ecs_cluster.main.name }] + } + } +} + +resource "aws_cloudwatch_log_group" "services" { + for_each = var.services + + name = "/ecs/${var.cluster_name}-${each.key}" + retention_in_days = var.environment == "production" ? 30 : 7 + + tags = { + Name = "${var.cluster_name}-${each.key}-logs" + } +} + +output "cluster_arn" { + description = "ECS cluster ARN" + value = aws_ecs_cluster.main.arn +} + +output "alb_dns_name" { + description = "ALB DNS name" + value = aws_lb.main.dns_name +} diff --git a/infra/modules/elasticache/main.tf b/infra/modules/elasticache/main.tf new file mode 100644 index 0000000..eaa6bc4 --- /dev/null +++ b/infra/modules/elasticache/main.tf @@ -0,0 +1,80 @@ +variable "environment" { + description = "Deployment environment" + type = string +} + +variable "vpc_id" { + description = "VPC ID" + type = string +} + +variable "subnet_ids" { + description = "Private subnet IDs" + type = list(string) +} + +variable "security_group_id" { + description = "ElastiCache security group ID" + type = string +} + +variable "node_type" { + description = "Cache node type" + type = string +} + +variable "num_nodes" { + description = "Number of cache nodes" + type = number +} + +variable "project_name" { + description = "Project name" + type = string +} + +resource "aws_elasticache_subnet_group" "main" { + name = "${var.project_name}-${var.environment}-redis-subnet" + 
subnet_ids = var.subnet_ids + + tags = { + Name = "${var.project_name}-${var.environment}-redis-subnet" + } +} + +resource "aws_elasticache_replication_group" "main" { + replication_group_id = "${var.project_name}-${var.environment}-redis" + description = "${var.project_name} Redis cluster (${var.environment})" + + node_type = var.node_type + num_cache_clusters = var.num_nodes + engine = "redis" + engine_version = "7.0" + + transit_encryption_enabled = true + at_rest_encryption_enabled = true + + port = 6379 + + subnet_group_name = aws_elasticache_subnet_group.main.name + security_group_ids = [var.security_group_id] + + automatic_failover_enabled = var.environment == "production" + + snapshot_retention_limit = var.environment == "production" ? 7 : 1 + snapshot_window = "03:00-04:00" + + tags = { + Name = "${var.project_name}-${var.environment}-redis" + } +} + +output "cache_endpoint" { + description = "ElastiCache primary endpoint" + value = aws_elasticache_replication_group.main.primary_endpoint_address +} + +output "reader_endpoint" { + description = "ElastiCache reader endpoint" + value = aws_elasticache_replication_group.main.reader_endpoint_address +} diff --git a/infra/modules/rds/main.tf b/infra/modules/rds/main.tf new file mode 100644 index 0000000..18c10c4 --- /dev/null +++ b/infra/modules/rds/main.tf @@ -0,0 +1,132 @@ +variable "environment" { + description = "Deployment environment" + type = string +} + +variable "vpc_id" { + description = "VPC ID" + type = string +} + +variable "subnet_ids" { + description = "Private subnet IDs" + type = list(string) +} + +variable "security_group_id" { + description = "RDS security group ID" + type = string +} + +variable "db_name" { + description = "Database name" + type = string +} + +variable "db_instance_class" { + description = "RDS instance class" + type = string +} + +variable "multi_az" { + description = "Multi-AZ deployment" + type = bool +} + +variable "backup_retention" { + description = "Backup retention 
variable "backup_retention" {
  description = "Backup retention days"
  type        = number
}

variable "project_name" {
  description = "Project name"
  type        = string
}

resource "aws_db_subnet_group" "main" {
  name       = "${var.project_name}-${var.environment}-db-subnet"
  subnet_ids = var.subnet_ids

  tags = {
    Name = "${var.project_name}-${var.environment}-db-subnet"
  }
}

# PostgreSQL instance; production gets Multi-AZ (via var.multi_az), more
# storage, deletion protection, and a final snapshot on destroy.
resource "aws_db_instance" "main" {
  identifier = "${var.project_name}-${var.environment}-db"

  engine            = "postgres"
  engine_version    = "16.2"
  instance_class    = var.db_instance_class
  allocated_storage = var.environment == "production" ? 100 : 20

  db_name  = var.db_name
  username = "shieldai"
  password = random_password.db_password.result

  multi_az               = var.multi_az
  db_subnet_group_name   = aws_db_subnet_group.main.name
  vpc_security_group_ids = [var.security_group_id]

  backup_retention_period = var.backup_retention
  backup_window           = "03:00-04:00"
  maintenance_window      = "sun:04:00-sun:05:00"

  skip_final_snapshot       = var.environment != "production"
  final_snapshot_identifier = "${var.project_name}-${var.environment}-final"

  storage_encrypted = true
  storage_type      = "gp3"
  # FIX: the explicit iops attribute was removed. gp3 volumes below 400 GiB
  # cannot take a custom IOPS value at all, and the previous non-production
  # value (1000) is below the gp3 baseline of 3000, so the original
  # configuration failed to apply. gp3 provides 3000 IOPS baseline for free.

  deletion_protection   = var.environment == "production"
  copy_tags_to_snapshot = true

  tags = {
    Name = "${var.project_name}-${var.environment}-db"
  }
}

resource "random_password" "db_password" {
  length  = 16
  special = true
  # FIX: RDS rejects '/', '@', '"' and spaces in master passwords; restrict
  # the special-character set so every generated password is accepted.
  override_special = "!#$%&*()-_=+[]{}<>:?"

  keepers = {
    environment = var.environment
  }
}

resource "aws_secretsmanager_secret_version" "db_password" {
  secret_id = aws_secretsmanager_secret.db_password.id
  secret_string = jsonencode({
    username = "shieldai"
    password = random_password.db_password.result
    engine   = "postgres"
    host     = aws_db_instance.main.address
    port     = aws_db_instance.main.port
  })
}

resource "aws_secretsmanager_secret" "db_password" {
  name = "${var.project_name}-${var.environment}-db-password"

  tags = {
    Name = "${var.project_name}-${var.environment}-db-password"
  }
}

output "db_endpoint" {
  description = "RDS endpoint"
  value       = aws_db_instance.main.endpoint
  sensitive   = true
}

output "db_instance_identifier" {
  description = "RDS instance identifier"
  value       = aws_db_instance.main.identifier
}

output "db_password_secret_arn" {
  description = "DB password secret ARN"
  value       = aws_secretsmanager_secret.db_password.arn
}

# --- infra/modules/s3/main.tf ---

variable "environment" {
  description = "Deployment environment"
  type        = string
}

variable "project_name" {
  description = "Project name"
  type        = string
}

resource "aws_s3_bucket" "terraform_state" {
  bucket = "${var.project_name}-${var.environment}-terraform-state"

  tags = {
    Name = "${var.project_name}-${var.environment}-terraform-state"
  }
}

resource "aws_s3_bucket_versioning" "terraform_state" {
  bucket = aws_s3_bucket.terraform_state.id
  versioning_configuration {
    status = "Enabled"
  }
}
resource "aws_s3_bucket_server_side_encryption_configuration" "terraform_state" {
  bucket = aws_s3_bucket.terraform_state.id

  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm = "aws:kms"
    }
  }
}

resource "aws_s3_bucket_lifecycle_configuration" "terraform_state" {
  bucket = aws_s3_bucket.terraform_state.id

  rule {
    id     = "expire-noncurrent"
    status = "Enabled"

    # FIX: AWS provider >= 4.x requires an explicit filter (or prefix) on
    # each lifecycle rule; an empty filter applies the rule to all objects.
    filter {}

    noncurrent_version_expiration {
      noncurrent_days = 30
    }
  }
}

resource "aws_s3_bucket" "artifacts" {
  bucket = "${var.project_name}-${var.environment}-artifacts"

  tags = {
    Name = "${var.project_name}-${var.environment}-artifacts"
  }
}

resource "aws_s3_bucket_versioning" "artifacts" {
  bucket = aws_s3_bucket.artifacts.id
  versioning_configuration {
    status = "Enabled"
  }
}

resource "aws_s3_bucket_server_side_encryption_configuration" "artifacts" {
  bucket = aws_s3_bucket.artifacts.id

  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm = "aws:kms"
    }
  }
}

resource "aws_s3_bucket" "logs" {
  bucket = "${var.project_name}-${var.environment}-logs"

  tags = {
    Name = "${var.project_name}-${var.environment}-logs"
  }
}

resource "aws_s3_bucket_lifecycle_configuration" "logs" {
  bucket = aws_s3_bucket.logs.id

  rule {
    id     = "expire-old-logs"
    status = "Enabled"

    # FIX: explicit (empty) filter required by AWS provider >= 4.x.
    filter {}

    expiration {
      days = 90
    }
  }
}

# SECURITY FIX (added): none of the buckets blocked public access. The state
# bucket in particular holds the full Terraform state, including any values
# providers mark sensitive.
resource "aws_s3_bucket_public_access_block" "all" {
  for_each = {
    terraform_state = aws_s3_bucket.terraform_state.id
    artifacts       = aws_s3_bucket.artifacts.id
    logs            = aws_s3_bucket.logs.id
  }

  bucket                  = each.value
  block_public_acls       = true
  block_public_policy     = true
  ignore_public_acls      = true
  restrict_public_buckets = true
}

output "bucket_name" {
  description = "Terraform state S3 bucket name"
  value       = aws_s3_bucket.terraform_state.id
}

output "artifacts_bucket_name" {
  description = "Artifacts S3 bucket name"
  value       = aws_s3_bucket.artifacts.id
}

output "logs_bucket_name" {
  description = "Logs S3 bucket name"
  value       = aws_s3_bucket.logs.id
}

# --- infra/modules/secrets/main.tf ---

variable "environment" {
  description = "Deployment environment"
  type        = string
}

variable "project_name" {
  description = "Project name"
  type        = string
}

variable "secrets" {
  description = "Secrets to store"
  type        = map(string)
  default     = {}
}

resource "aws_secretsmanager_secret" "main" {
  name = "${var.project_name}-${var.environment}-app-secrets"

  description = "Application secrets for ${var.project_name} (${var.environment})"

  tags = {
    Name        = "${var.project_name}-${var.environment}-app-secrets"
    Environment = var.environment
  }
}

resource "aws_secretsmanager_secret_version" "main" {
  secret_id = aws_secretsmanager_secret.main.id

  # NOTE(review): DATABASE_URL and REDIS_URL below are hand-assembled guesses.
  # Real RDS/ElastiCache hostnames carry a per-resource random suffix, not the
  # account id, and the password segment here is var.project_name rather than
  # the generated master password from the rds module. Callers should override
  # both keys via var.secrets (merge() gives var.secrets precedence) using
  # module.rds.db_endpoint / module.elasticache.cache_endpoint — confirm the
  # root module does so.
  secret_string = jsonencode(merge({
    DATABASE_URL = "postgresql://shieldai:${var.project_name}@${var.project_name}-${var.environment}-db.${data.aws_caller_identity.current.account_id}.${data.aws_region.current.name}.rds.amazonaws.com:5432/shieldai"
    REDIS_URL    = "redis://${var.project_name}-${var.environment}-redis.${data.aws_caller_identity.current.account_id}.${data.aws_region.current.name}.cache.amazonaws.com:6379"
    NODE_ENV     = var.environment
    LOG_LEVEL    = var.environment == "production" ? "info" : "debug"
  }, var.secrets))
}

data "aws_caller_identity" "current" {}

# FIX: the region was hard-coded to us-east-1 in both URLs above; derive it
# from the provider configuration instead so other regions work.
data "aws_region" "current" {}

output "secrets_manager_arn" {
  description = "Secrets Manager ARN"
  value       = aws_secretsmanager_secret.main.arn
}

output "secrets_manager_name" {
  description = "Secrets Manager secret name"
  value       = aws_secretsmanager_secret.main.name
}

# --- infra/modules/vpc/main.tf ---

variable "environment" {
  description = "Deployment environment"
  type        = string
}

variable "vpc_cidr" {
  description = "CIDR block for VPC"
  type        = string
}

variable "az_count" {
  description = "Number of availability zones"
  type        = number
}

variable "project_name" {
  description = "Project name"
  type        = string
}

resource "aws_vpc" "main" {
  cidr_block           = var.vpc_cidr
  enable_dns_support   = true
  enable_dns_hostnames = true

  tags = {
    Name = "${var.project_name}-${var.environment}-vpc"
  }
}
data "aws_availability_zones" "available" {
  state = "available"
}

# Public subnets: one per AZ, low /24s carved from the VPC CIDR.
resource "aws_subnet" "public" {
  count = var.az_count

  vpc_id                  = aws_vpc.main.id
  cidr_block              = cidrsubnet(var.vpc_cidr, 8, count.index)
  availability_zone       = data.aws_availability_zones.available.names[count.index]
  map_public_ip_on_launch = true

  tags = {
    Name                     = "${var.project_name}-${var.environment}-public-${data.aws_availability_zones.available.names[count.index]}"
    "kubernetes.io/role/elb" = "1"
  }
}

# Private subnets: offset by az_count so CIDRs never collide with public.
resource "aws_subnet" "private" {
  count = var.az_count

  vpc_id            = aws_vpc.main.id
  cidr_block        = cidrsubnet(var.vpc_cidr, 8, var.az_count + count.index)
  availability_zone = data.aws_availability_zones.available.names[count.index]

  tags = {
    Name                              = "${var.project_name}-${var.environment}-private-${data.aws_availability_zones.available.names[count.index]}"
    "kubernetes.io/role/internal-elb" = "1"
  }
}

resource "aws_internet_gateway" "main" {
  vpc_id = aws_vpc.main.id

  tags = {
    Name = "${var.project_name}-${var.environment}-igw"
  }
}

resource "aws_eip" "nat" {
  count = var.az_count

  domain = "vpc"

  tags = {
    Name = "${var.project_name}-${var.environment}-nat-${count.index}"
  }
}

# One NAT gateway per AZ for AZ-independent egress from private subnets.
resource "aws_nat_gateway" "main" {
  count = var.az_count

  allocation_id = aws_eip.nat[count.index].id
  subnet_id     = aws_subnet.public[count.index].id

  tags = {
    Name = "${var.project_name}-${var.environment}-nat-${count.index}"
  }

  depends_on = [aws_internet_gateway.main]
}

resource "aws_route_table" "public" {
  vpc_id = aws_vpc.main.id

  route {
    cidr_block = "0.0.0.0/0"
    gateway_id = aws_internet_gateway.main.id
  }

  tags = {
    Name = "${var.project_name}-${var.environment}-public-rt"
  }
}

resource "aws_route_table" "private" {
  count = var.az_count

  vpc_id = aws_vpc.main.id

  route {
    cidr_block     = "0.0.0.0/0"
    nat_gateway_id = aws_nat_gateway.main[count.index].id
  }

  tags = {
    Name = "${var.project_name}-${var.environment}-private-rt-${count.index}"
  }
}

resource "aws_route_table_association" "public" {
  count = var.az_count

  subnet_id      = aws_subnet.public[count.index].id
  route_table_id = aws_route_table.public.id
}

resource "aws_route_table_association" "private" {
  count = var.az_count

  subnet_id      = aws_subnet.private[count.index].id
  route_table_id = aws_route_table.private[count.index].id
}

# NOTE(review): the service-port ingress is open to 0.0.0.0/0. If the ALB is
# the only intended client, this should be restricted to the ALB's security
# group — but the ALB SG is supplied by the root module (var.security_group_ids
# in the ecs module), which is not visible here; confirm the wiring before
# tightening.
resource "aws_security_group" "ecs" {
  name_prefix = "${var.project_name}-${var.environment}-ecs"
  vpc_id      = aws_vpc.main.id

  ingress {
    from_port   = 3000
    to_port     = 3003
    protocol    = "tcp"
    cidr_blocks = ["0.0.0.0/0"]
    description = "Service ports"
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  tags = {
    Name = "${var.project_name}-${var.environment}-ecs-sg"
  }
}

resource "aws_security_group" "rds" {
  name_prefix = "${var.project_name}-${var.environment}-rds"
  vpc_id      = aws_vpc.main.id

  ingress {
    from_port       = 5432
    to_port         = 5432
    protocol        = "tcp"
    security_groups = [aws_security_group.ecs.id]
    description     = "PostgreSQL from ECS"
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  tags = {
    Name = "${var.project_name}-${var.environment}-rds-sg"
  }
}

resource "aws_security_group" "elasticache" {
  name_prefix = "${var.project_name}-${var.environment}-elasticache"
  vpc_id      = aws_vpc.main.id

  ingress {
    from_port       = 6379
    to_port         = 6379
    protocol        = "tcp"
    security_groups = [aws_security_group.ecs.id]
    description     = "Redis from ECS"
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  tags = {
    Name = "${var.project_name}-${var.environment}-elasticache-sg"
  }
}

output "vpc_id" {
  description = "VPC ID"
  value       = aws_vpc.main.id
}

output "private_subnet_ids" {
  description = "Private subnet IDs"
  value       = aws_subnet.private[*].id
}

output "public_subnet_ids" {
  description = "Public subnet IDs"
  value       = aws_subnet.public[*].id
}

output "ecs_security_group_id" {
  description = "ECS security group ID"
  value       = aws_security_group.ecs.id
}

output "rds_security_group_id" {
  description = "RDS security group ID"
  value       = aws_security_group.rds.id
}

output "elasticache_security_group_id" {
  description = "ElastiCache security group ID"
  value       = aws_security_group.elasticache.id
}

# --- infra/outputs.tf ---

output "vpc_id" {
  description = "VPC ID"
  value       = module.vpc.vpc_id
}

output "cluster_name" {
  description = "ECS cluster name"
  value       = "${var.project_name}-${var.environment}"
}

output "rds_endpoint" {
  description = "RDS endpoint"
  value       = module.rds.db_endpoint
  sensitive   = true
}

output "elasticache_endpoint" {
  description = "ElastiCache primary endpoint"
  value       = module.elasticache.cache_endpoint
}

output "s3_bucket_name" {
  description = "S3 bucket name"
  value       = module.s3.bucket_name
}

output "secrets_manager_arn" {
  description = "Secrets Manager ARN"
  value       = module.secrets.secrets_manager_arn
}

output "cloudwatch_dashboard_url" {
  description = "CloudWatch dashboard URL"
  value       = module.cloudwatch.dashboard_url
}

# --- infra/scripts/rollback.sh ---

#!/bin/bash
# Roll ECS services back to their previous task-definition revision.
# Usage: rollback.sh [environment] [service|all]
set -euo pipefail

ENVIRONMENT=${1:-staging}
SERVICE=${2:-all}

CLUSTER="shieldai-${ENVIRONMENT}"

echo "Rolling back services in cluster: $CLUSTER"

SERVICES="api darkwatch spamshield voiceprint"
if [ "$SERVICE" != "all" ]; then
  SERVICES="$SERVICE"
fi

for svc in $SERVICES; do
  echo "Rolling back $svc..."

  # FIX: "aws ecs update-service --rollback" is not a valid AWS CLI option;
  # the original script failed on every invocation. An ECS rollback is
  # performed by redeploying the previous task-definition revision.
  CURRENT_DEF=$(aws ecs describe-services \
    --cluster "$CLUSTER" \
    --services "${CLUSTER}-${svc}" \
    --query 'services[0].taskDefinition' \
    --output text \
    --no-cli-auto-prompt)
  FAMILY="${CURRENT_DEF%:*}"    # ARN minus the trailing :revision
  REVISION="${CURRENT_DEF##*:}" # numeric revision

  if [ "$REVISION" -le 1 ]; then
    echo "No previous revision for $svc (currently at revision $REVISION); skipping"
    continue
  fi

  aws ecs update-service \
    --cluster "$CLUSTER" \
    --service "${CLUSTER}-${svc}" \
    --task-definition "${FAMILY}:$((REVISION - 1))" \
    --no-cli-auto-prompt

  echo "Waiting for $svc to stabilize..."
  aws ecs wait services-stable \
    --cluster "$CLUSTER" \
    --services "${CLUSTER}-${svc}"

  echo "$svc rolled back successfully"
done

echo "Rollback complete for $SERVICES"

# --- infra/variables.tf ---

variable "aws_region" {
  description = "AWS region"
  type        = string
  default     = "us-east-1"
}

variable "environment" {
  description = "Deployment environment"
  type        = string
  validation {
    condition     = contains(["dev", "staging", "production"], var.environment)
    error_message = "Environment must be one of: dev, staging, production."
  }
}

variable "project_name" {
  description = "Project name for resource naming"
  type        = string
  default     = "shieldai"
}

variable "vpc_cidr" {
  description = "CIDR block for VPC"
  type        = string
  default     = "10.0.0.0/16"
}

variable "az_count" {
  description = "Number of availability zones"
  type        = number
  default     = 2
}

variable "db_name" {
  description = "RDS database name"
  type        = string
  default     = "shieldai"
}

variable "db_instance_class" {
  description = "RDS instance class"
  type        = string
  default     = "db.t3.medium"
}

variable "db_multi_az" {
  description = "Enable Multi-AZ deployment"
  type        = bool
  default     = true
}

variable "db_backup_retention" {
  description = "RDS backup retention period in days"
  type        = number
  default     = 7
}

variable "elasticache_node_type" {
  description = "ElastiCache node type"
  type        = string
  default     = "cache.t3.medium"
}

variable "elasticache_num_nodes" {
  description = "Number of ElastiCache nodes"
  type        = number
  default     = 2
}
variable "services" {
  description = "ECS services to deploy"
  type = map(object({
    cpu    = number
    memory = number
    port   = number
  }))
  default = {
    api = {
      cpu    = 512
      memory = 1024
      port   = 3000
    }
    darkwatch = {
      cpu    = 256
      memory = 512
      port   = 3001
    }
    spamshield = {
      cpu    = 256
      memory = 512
      port   = 3002
    }
    voiceprint = {
      cpu    = 512
      memory = 1024
      port   = 3003
    }
  }
}

# NOTE(review): "latest" defaults mean deploys are not reproducible; CI should
# always pass immutable tags (e.g. the git SHA) for these.
variable "container_images" {
  description = "Container image tags per service"
  type        = map(string)
  default = {
    api        = "latest"
    darkwatch  = "latest"
    spamshield = "latest"
    voiceprint = "latest"
  }
}

variable "secrets" {
  description = "Secrets to store in AWS Secrets Manager"
  type        = map(string)
  default     = {}
  # FIX: this map carries secret material; mark it sensitive so its values are
  # redacted from plan/apply output and state diffs shown in CI logs.
  sensitive = true
}