re-init
This commit is contained in:
405
tasks/production-ml-pipeline/08-production-hardening.md
Normal file
405
tasks/production-ml-pipeline/08-production-hardening.md
Normal file
@@ -0,0 +1,405 @@
|
||||
# 08. Production Hardening and Observability
|
||||
|
||||
meta:
|
||||
id: production-ml-pipeline-08
|
||||
feature: production-ml-pipeline
|
||||
priority: P1
|
||||
depends_on: [production-ml-pipeline-07]
|
||||
tags: [implementation, production, observability]
|
||||
|
||||
objective:
|
||||
|
||||
- Add comprehensive error handling at every layer of the pipeline
|
||||
- Implement structured logging for observability
|
||||
- Add rate limiting to prevent abuse
|
||||
- Create a health endpoint that reports model status and inference metrics
|
||||
- Ensure the system is production-ready with monitoring, cleanup, and resilience
|
||||
|
||||
deliverables:
|
||||
|
||||
- `src/app/api/health/route.ts` — enhanced health endpoint with model status
|
||||
- `src/lib/middleware/rate-limit.ts` — rate limiting middleware
|
||||
- `src/lib/middleware/error-handler.ts` — global error handler
|
||||
- `src/lib/observability/logger.ts` — structured logger
|
||||
- `src/lib/observability/metrics.ts` — inference metrics tracker
|
||||
- Updated API routes with error handling and logging
|
||||
- Updated `next.config.ts` with rate limiting configuration
|
||||
|
||||
steps:
|
||||
|
||||
1. **Create structured logger** `src/lib/observability/logger.ts`:
|
||||
|
||||
```typescript
|
||||
export interface LogEntry {
|
||||
timestamp: string;
|
||||
level: "debug" | "info" | "warn" | "error";
|
||||
event: string;
|
||||
data?: Record<string, any>;
|
||||
error?: { message: string; stack?: string };
|
||||
}
|
||||
|
||||
export function log(level: LogEntry["level"], event: string, data?: Record<string, any>) {
|
||||
const entry: LogEntry = {
|
||||
timestamp: new Date().toISOString(),
|
||||
level,
|
||||
event,
|
||||
data,
|
||||
};
|
||||
|
||||
if (level === "error" && data?.error) {
|
||||
entry.error = {
|
||||
message: data.error.message,
|
||||
stack: data.error.stack,
|
||||
};
|
||||
}
|
||||
|
||||
console.log(JSON.stringify(entry));
|
||||
}
|
||||
|
||||
export const logger = {
|
||||
debug: (event: string, data?: any) => log("debug", event, data),
|
||||
info: (event: string, data?: any) => log("info", event, data),
|
||||
warn: (event: string, data?: any) => log("warn", event, data),
|
||||
error: (event: string, data?: any) => log("error", event, data),
|
||||
};
|
||||
```
|
||||
|
||||
2. **Create metrics tracker** `src/lib/observability/metrics.ts`:
|
||||
|
||||
```typescript
|
||||
interface InferenceMetrics {
|
||||
totalInferences: number;
|
||||
totalErrors: number;
|
||||
avgInferenceTimeMs: number;
|
||||
lastInferenceAt: string | null;
|
||||
modelLoaded: boolean;
|
||||
modelLoadTimeMs: number | null;
|
||||
}
|
||||
|
||||
class MetricsTracker {
|
||||
private metrics: InferenceMetrics = {
|
||||
totalInferences: 0,
|
||||
totalErrors: 0,
|
||||
avgInferenceTimeMs: 0,
|
||||
lastInferenceAt: null,
|
||||
modelLoaded: false,
|
||||
modelLoadTimeMs: null,
|
||||
};
|
||||
|
||||
recordInference(inferenceTimeMs: number) {
|
||||
this.metrics.totalInferences++;
|
||||
this.metrics.lastInferenceAt = new Date().toISOString();
|
||||
// Running average
|
||||
this.metrics.avgInferenceTimeMs =
|
||||
(this.metrics.avgInferenceTimeMs * (this.metrics.totalInferences - 1) + inferenceTimeMs) /
|
||||
this.metrics.totalInferences;
|
||||
}
|
||||
|
||||
recordError() {
|
||||
this.metrics.totalErrors++;
|
||||
}
|
||||
|
||||
setModelStatus(loaded: boolean, loadTimeMs?: number) {
|
||||
this.metrics.modelLoaded = loaded;
|
||||
if (loadTimeMs !== undefined) {
|
||||
this.metrics.modelLoadTimeMs = loadTimeMs;
|
||||
}
|
||||
}
|
||||
|
||||
getMetrics(): InferenceMetrics {
|
||||
return { ...this.metrics };
|
||||
}
|
||||
}
|
||||
|
||||
export const metrics = new MetricsTracker();
|
||||
```
|
||||
|
||||
3. **Enhance health endpoint** `src/app/api/health/route.ts`:
|
||||
|
||||
```typescript
|
||||
import { NextResponse } from "next/server";
|
||||
import { getModel } from "@/lib/ml/model-loader";
|
||||
import { metrics } from "@/lib/observability/metrics";
|
||||
|
||||
export async function GET() {
|
||||
const model = await getModel();
|
||||
const modelStatus = model.getStatus();
|
||||
|
||||
return NextResponse.json({
|
||||
status: "ok",
|
||||
timestamp: new Date().toISOString(),
|
||||
model: {
|
||||
loaded: modelStatus.loaded,
|
||||
backend: modelStatus.backend,
|
||||
modelId: modelStatus.modelId,
|
||||
numClasses: modelStatus.numClasses,
|
||||
error: modelStatus.error,
|
||||
},
|
||||
metrics: metrics.getMetrics(),
|
||||
uptime: process.uptime(),
|
||||
});
|
||||
}
|
||||
```
|
||||
|
||||
4. **Create rate limiting middleware** `src/lib/middleware/rate-limit.ts`:
|
||||
|
||||
```typescript
|
||||
import { NextRequest, NextResponse } from "next/server";
|
||||
|
||||
// Simple in-memory rate limiter (for production, use Redis or similar)
|
||||
const requestCounts = new Map<string, { count: number; resetAt: number }>();
|
||||
|
||||
const RATE_LIMIT = {
|
||||
maxRequests: 10, // 10 requests per window
|
||||
windowMs: 60 * 1000, // 1 minute window
|
||||
};
|
||||
|
||||
export function rateLimit(request: NextRequest): NextResponse | null {
|
||||
const ip = request.headers.get("x-forwarded-for") || "unknown";
|
||||
const now = Date.now();
|
||||
|
||||
let record = requestCounts.get(ip);
|
||||
|
||||
if (!record || now > record.resetAt) {
|
||||
record = { count: 0, resetAt: now + RATE_LIMIT.windowMs };
|
||||
requestCounts.set(ip, record);
|
||||
}
|
||||
|
||||
record.count++;
|
||||
|
||||
if (record.count > RATE_LIMIT.maxRequests) {
|
||||
return NextResponse.json(
|
||||
{ error: "Rate limit exceeded", message: "Too many requests. Please try again later." },
|
||||
{ status: 429 },
|
||||
);
|
||||
}
|
||||
|
||||
return null; // No rate limit hit
|
||||
}
|
||||
```
|
||||
|
||||
5. **Create global error handler** `src/lib/middleware/error-handler.ts`:
|
||||
|
||||
```typescript
|
||||
import { NextResponse } from "next/server";
|
||||
import { logger } from "@/lib/observability/logger";
|
||||
|
||||
export function handleError(error: unknown, context: string): NextResponse {
|
||||
logger.error("unhandled_error", {
|
||||
context,
|
||||
error:
|
||||
error instanceof Error
|
||||
? { message: error.message, stack: error.stack }
|
||||
: { message: String(error) },
|
||||
});
|
||||
|
||||
return NextResponse.json(
|
||||
{
|
||||
error: "Internal server error",
|
||||
message: "An unexpected error occurred. Please try again later.",
|
||||
context,
|
||||
},
|
||||
{ status: 500 },
|
||||
);
|
||||
}
|
||||
```
|
||||
|
||||
6. **Add error handling to `/api/upload`**:
|
||||
|
||||
```typescript
|
||||
import { rateLimit } from "@/lib/middleware/rate-limit";
|
||||
import { handleError } from "@/lib/middleware/error-handler";
|
||||
import { logger } from "@/lib/observability/logger";
|
||||
|
||||
export async function POST(request: NextRequest) {
|
||||
// Rate limiting
|
||||
const rateLimitError = rateLimit(request);
|
||||
if (rateLimitError) return rateLimitError;
|
||||
|
||||
try {
|
||||
logger.info("upload_start", { ip: request.headers.get("x-forwarded-for") });
|
||||
|
||||
// ... existing upload logic ...
|
||||
|
||||
logger.info("upload_success", { imageId, fileSize: buffer.length });
|
||||
return NextResponse.json({ imageId, tensorShape, previewUrl });
|
||||
} catch (error) {
|
||||
return handleError(error, "upload");
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
7. **Add error handling to `/api/identify`**:
|
||||
|
||||
```typescript
|
||||
export async function POST(request: NextRequest) {
|
||||
const rateLimitError = rateLimit(request);
|
||||
if (rateLimitError) return rateLimitError;
|
||||
|
||||
try {
|
||||
logger.info("identify_start", { imageId, plantId });
|
||||
|
||||
const startTime = Date.now();
|
||||
|
||||
// ... existing identify logic ...
|
||||
|
||||
const inferenceTimeMs = Date.now() - startTime;
|
||||
metrics.recordInference(inferenceTimeMs);
|
||||
|
||||
logger.info("identify_success", {
|
||||
imageId,
|
||||
inferenceTimeMs,
|
||||
topPrediction: predictions[0]?.diseaseId,
|
||||
confidence: predictions[0]?.confidence.adjusted,
|
||||
});
|
||||
|
||||
return NextResponse.json({ predictions, metadata });
|
||||
} catch (error) {
|
||||
metrics.recordError();
|
||||
|
||||
if (error instanceof Error && error.message.includes("not loaded")) {
|
||||
return NextResponse.json(
|
||||
{
|
||||
error: "Model not available",
|
||||
message: "ML model failed to load. Please try again later.",
|
||||
},
|
||||
{ status: 503 },
|
||||
);
|
||||
}
|
||||
|
||||
return handleError(error, "identify");
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
8. **Add model status tracking to `model-loader.ts`**:
|
||||
|
||||
```typescript
|
||||
import { metrics } from "@/lib/observability/metrics";
|
||||
|
||||
async function loadModel(): Promise<PlantDiseaseModel> {
|
||||
const startTime = Date.now();
|
||||
|
||||
try {
|
||||
const model = await tryLoadTFJS();
|
||||
if (model) {
|
||||
const loadTimeMs = Date.now() - startTime;
|
||||
metrics.setModelStatus(true, loadTimeMs);
|
||||
logger.info("model_loaded", { backend: "tfjs", loadTimeMs });
|
||||
return model;
|
||||
}
|
||||
} catch (error) {
|
||||
logger.warn("model_load_failed", { backend: "tfjs", error });
|
||||
}
|
||||
|
||||
// ... fallback to mock ...
|
||||
metrics.setModelStatus(false);
|
||||
return createMockModel();
|
||||
}
|
||||
```
|
||||
|
||||
9. **Add cleanup for old uploads**:
|
||||
|
||||
```typescript
|
||||
// src/lib/cleanup.ts
|
||||
import fs from "fs/promises";
|
||||
import path from "path";
|
||||
|
||||
const UPLOADS_DIR = path.join(process.cwd(), "public", "uploads");
|
||||
const MAX_AGE_MS = 24 * 60 * 60 * 1000; // 24 hours
|
||||
|
||||
export async function cleanupOldUploads() {
|
||||
const files = await fs.readdir(UPLOADS_DIR);
|
||||
const now = Date.now();
|
||||
|
||||
for (const file of files) {
|
||||
const filePath = path.join(UPLOADS_DIR, file);
|
||||
const stat = await fs.stat(filePath);
|
||||
|
||||
if (now - stat.mtimeMs > MAX_AGE_MS) {
|
||||
await fs.unlink(filePath);
|
||||
logger.info("upload_cleaned", { file, ageMs: now - stat.mtimeMs });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Run cleanup on server start and periodically
|
||||
if (process.env.NODE_ENV === "production") {
|
||||
cleanupOldUploads();
|
||||
setInterval(cleanupOldUploads, 60 * 60 * 1000); // Every hour
|
||||
}
|
||||
```
|
||||
|
||||
10. **Update `next.config.ts`** with security headers and rate limiting:
|
||||
|
||||
```typescript
|
||||
const nextConfig = {
|
||||
// ... existing config ...
|
||||
async headers() {
|
||||
return [
|
||||
{
|
||||
source: "/api/:path*",
|
||||
headers: [
|
||||
{ key: "X-Content-Type-Options", value: "nosniff" },
|
||||
{ key: "X-Frame-Options", value: "DENY" },
|
||||
{ key: "X-XSS-Protection", value: "1; mode=block" },
|
||||
],
|
||||
},
|
||||
];
|
||||
},
|
||||
};
|
||||
```
|
||||
|
||||
11. **Add monitoring dashboard** (optional) `src/app/admin/metrics/page.tsx`:
|
||||
- Simple page showing inference metrics
|
||||
- Model status
|
||||
- Recent inference times
|
||||
- Error rate
|
||||
- Protected by authentication (admin only)
|
||||
|
||||
12. **Document production checklist** in `docs/production-checklist.md`:
|
||||
- Environment variables needed
|
||||
- Model deployment steps
|
||||
- Monitoring setup
|
||||
- Backup strategy
|
||||
- Rollback procedure
|
||||
|
||||
tests:
|
||||
|
||||
- Unit: rate limiter blocks after max requests
|
||||
- Unit: rate limiter resets after window
|
||||
- Unit: metrics tracker records inference correctly
|
||||
- Unit: metrics tracker computes running average
|
||||
- Unit: logger produces valid JSON output
|
||||
- Integration: health endpoint returns model status and metrics
|
||||
- Integration: rate limit returns 429 after max requests
|
||||
- Integration: error handler catches unhandled errors and returns 500
|
||||
|
||||
acceptance_criteria:
|
||||
|
||||
- All API routes have rate limiting (10 requests per minute per IP)
|
||||
- All API routes have structured logging (JSON format)
|
||||
- Health endpoint reports model status, inference metrics, uptime
|
||||
- Error handler catches all unhandled errors and returns 500 with clear message
|
||||
- Old uploads are cleaned up automatically (24-hour TTL)
|
||||
- Metrics tracker records inference time, error rate, model status
|
||||
- Security headers are set (X-Content-Type-Options, X-Frame-Options, X-XSS-Protection)
|
||||
- Production checklist is documented
|
||||
|
||||
validation:
|
||||
|
||||
- `npx vitest run src/lib/middleware/rate-limit.test.ts`
|
||||
- `npx vitest run src/lib/observability/metrics.test.ts`
|
||||
- `curl http://localhost:3000/api/health` — returns model status and metrics
|
||||
- `curl -X POST http://localhost:3000/api/identify ...` (11 times) — 11th request returns 429
|
||||
- Check server logs: JSON-formatted log entries for all requests
|
||||
- Wait 25 minutes: old uploads are cleaned up
|
||||
|
||||
notes:
|
||||
|
||||
- Rate limiter uses in-memory storage — for multi-instance deployments, use Redis or similar
|
||||
- Metrics are in-memory — for persistent metrics, use a time-series database
|
||||
- Health endpoint should be monitored by uptime monitoring service (e.g., Pingdom, UptimeRobot)
|
||||
- Cleanup runs every hour in production — adjust frequency based on upload volume
|
||||
- Security headers are basic — consider adding CSP, HSTS for full security hardening
|
||||
- Production checklist should be reviewed before each deployment
|
||||
Reference in New Issue
Block a user