406 lines
12 KiB
Markdown
406 lines
12 KiB
Markdown
# 08. Production Hardening and Observability
|
|
|
|
meta:
|
|
id: production-ml-pipeline-08
|
|
feature: production-ml-pipeline
|
|
priority: P1
|
|
depends_on: [production-ml-pipeline-07]
|
|
tags: [implementation, production, observability]
|
|
|
|
objective:
|
|
|
|
- Add comprehensive error handling at every layer of the pipeline
|
|
- Implement structured logging for observability
|
|
- Add rate limiting to prevent abuse
|
|
- Create a health endpoint that reports model status and inference metrics
|
|
- Ensure the system is production-ready with monitoring, cleanup, and resilience
|
|
|
|
deliverables:
|
|
|
|
- `src/app/api/health/route.ts` — enhanced health endpoint with model status
|
|
- `src/lib/middleware/rate-limit.ts` — rate limiting middleware
|
|
- `src/lib/middleware/error-handler.ts` — global error handler
|
|
- `src/lib/observability/logger.ts` — structured logger
|
|
- `src/lib/observability/metrics.ts` — inference metrics tracker
|
|
- Updated API routes with error handling and logging
|
|
- Updated `next.config.ts` with security headers (rate limiting itself is applied per route via `rateLimit`)
|
|
|
|
steps:
|
|
|
|
1. **Create structured logger** `src/lib/observability/logger.ts`:
|
|
|
|
```typescript
|
|
/**
 * Shape of one structured log line. Each entry is written as a single JSON
 * object so that log aggregators can parse it without custom rules.
 */
export interface LogEntry {
  /** ISO-8601 timestamp taken when the entry is created. */
  timestamp: string;
  level: "debug" | "info" | "warn" | "error";
  /** Short machine-readable event name, e.g. "upload_start". */
  event: string;
  /** Caller-supplied context, minus the `error` key (reported separately). */
  // `any` is kept so existing callers can keep passing arbitrary object types.
  data?: Record<string, any>;
  /** Normalised form of `data.error`, present at any level when one was passed. */
  error?: { message: string; stack?: string };
}

/**
 * Serialises a value to JSON without throwing on circular references or
 * BigInt values, so that a bad payload cannot turn logging into a crash.
 */
function stringifyForLog(value: unknown): string {
  try {
    return JSON.stringify(value);
  } catch {
    // Fallback path only: replace objects that were already visited and
    // BigInts, the two inputs on which JSON.stringify throws.
    const seen = new WeakSet<object>();
    return JSON.stringify(value, (_key: string, current: unknown) => {
      if (typeof current === "bigint") return current.toString();
      if (typeof current === "object" && current !== null) {
        if (seen.has(current)) return "[Circular]";
        seen.add(current);
      }
      return current;
    });
  }
}

/**
 * Converts whatever was passed as `data.error` into a JSON-safe object.
 * `Error` instances need this because their `message` and `stack` are not
 * enumerable, so JSON.stringify would otherwise emit `{}`.
 */
function describeError(value: unknown): { message: string; stack?: string } {
  if (value instanceof Error) {
    return { message: value.message, stack: value.stack };
  }
  if (typeof value === "object" && value !== null) {
    const candidate = value as { message?: unknown; stack?: unknown };
    if (typeof candidate.message === "string") {
      return typeof candidate.stack === "string"
        ? { message: candidate.message, stack: candidate.stack }
        : { message: candidate.message };
    }
    return { message: stringifyForLog(value) };
  }
  return { message: String(value) };
}

/**
 * Writes one structured log entry to stdout as a single line of JSON.
 *
 * @param level Severity of the entry.
 * @param event Machine-readable event name.
 * @param data  Optional context. If it has an `error` key, that value is
 *              normalised into the top-level `error` field at every level
 *              (not only "error") and removed from `data`, so the details
 *              are never lost or duplicated.
 */
export function log(level: LogEntry["level"], event: string, data?: Record<string, any>) {
  const entry: LogEntry = {
    timestamp: new Date().toISOString(),
    level,
    event,
  };

  if (data !== undefined && data !== null) {
    const { error, ...rest } = data;
    if (error !== undefined && error !== null) {
      entry.error = describeError(error);
    }
    if (Object.keys(rest).length > 0) {
      entry.data = rest;
    }
  }

  console.log(stringifyForLog(entry));
}

/** Convenience wrappers around `log`, one per severity level. */
export const logger = {
  debug: (event: string, data?: any) => log("debug", event, data),
  info: (event: string, data?: any) => log("info", event, data),
  warn: (event: string, data?: any) => log("warn", event, data),
  error: (event: string, data?: any) => log("error", event, data),
};
|
|
```
|
|
|
|
2. **Create metrics tracker** `src/lib/observability/metrics.ts`:
|
|
|
|
```typescript
|
|
/** Snapshot of the in-memory inference counters exposed by the tracker. */
interface InferenceMetrics {
  totalInferences: number;
  totalErrors: number;
  avgInferenceTimeMs: number;
  lastInferenceAt: string | null;
  modelLoaded: boolean;
  modelLoadTimeMs: number | null;
}

/**
 * Process-local accumulator for inference statistics. All state lives in
 * memory on this instance.
 */
class MetricsTracker {
  private metrics: InferenceMetrics = {
    totalInferences: 0,
    totalErrors: 0,
    avgInferenceTimeMs: 0,
    lastInferenceAt: null,
    modelLoaded: false,
    modelLoadTimeMs: null,
  };

  /** Counts one inference and folds its duration into the running mean. */
  recordInference(inferenceTimeMs: number) {
    const state = this.metrics;
    const previousCount = state.totalInferences;
    // Rebuild the sum of earlier samples from the stored mean.
    const previousSum = state.avgInferenceTimeMs * previousCount;

    state.totalInferences = previousCount + 1;
    state.lastInferenceAt = new Date().toISOString();
    state.avgInferenceTimeMs = (previousSum + inferenceTimeMs) / state.totalInferences;
  }

  /** Counts one failed request. */
  recordError() {
    this.metrics.totalErrors += 1;
  }

  /**
   * Stores whether the model is loaded. The load time is only overwritten
   * when a value is supplied.
   */
  setModelStatus(loaded: boolean, loadTimeMs?: number) {
    this.metrics.modelLoaded = loaded;
    if (loadTimeMs === undefined) {
      return;
    }
    this.metrics.modelLoadTimeMs = loadTimeMs;
  }

  /** Returns a shallow copy so callers cannot mutate the tracker's state. */
  getMetrics(): InferenceMetrics {
    return Object.assign({}, this.metrics);
  }
}

/** Shared tracker instance used by the API routes and the model loader. */
export const metrics = new MetricsTracker();
|
|
```
|
|
|
|
3. **Enhance health endpoint** `src/app/api/health/route.ts`:
|
|
|
|
```typescript
|
|
import { NextResponse } from "next/server";
|
|
import { getModel } from "@/lib/ml/model-loader";
|
|
import { metrics } from "@/lib/observability/metrics";
|
|
|
|
// A health probe must reflect live state on every call. In Next.js versions
// that cache GET route handlers by default, a handler that never reads the
// request can be evaluated once at build time; this opts out of that.
// NOTE(review): confirm against the Next.js version and caching mode in use.
export const dynamic = "force-dynamic";

/**
 * Health endpoint: reports model status, inference metrics and process uptime.
 *
 * - `status: "ok"`       — model reports loaded and no error (HTTP 200)
 * - `status: "degraded"` — model status is available but not loaded, or it
 *                          carries an error (HTTP 200, so the body stays readable)
 * - `status: "error"`    — the model could not be obtained at all (HTTP 503)
 */
export async function GET() {
  const timestamp = new Date().toISOString();

  try {
    const model = await getModel();
    const modelStatus = model.getStatus();
    const healthy = Boolean(modelStatus.loaded) && !modelStatus.error;

    return NextResponse.json({
      status: healthy ? "ok" : "degraded",
      timestamp,
      model: {
        loaded: modelStatus.loaded,
        backend: modelStatus.backend,
        modelId: modelStatus.modelId,
        numClasses: modelStatus.numClasses,
        error: modelStatus.error,
      },
      metrics: metrics.getMetrics(),
      uptime: process.uptime(),
    });
  } catch (error) {
    // getModel() rejected: still answer with a structured body so monitors
    // get a diagnosable response instead of a bare framework 500.
    return NextResponse.json(
      {
        status: "error",
        timestamp,
        model: {
          loaded: false,
          error: error instanceof Error ? error.message : String(error),
        },
        metrics: metrics.getMetrics(),
        uptime: process.uptime(),
      },
      { status: 503 },
    );
  }
}
|
|
```
|
|
|
|
4. **Create rate limiting middleware** `src/lib/middleware/rate-limit.ts`:
|
|
|
|
```typescript
|
|
import { NextRequest, NextResponse } from "next/server";
|
|
|
|
// Simple in-memory rate limiter (for production, use Redis or similar)
|
|
/** Per-client counters, keyed by client address. Process-local only. */
const requestCounts = new Map<string, { count: number; resetAt: number }>();

const RATE_LIMIT = {
  maxRequests: 10, // 10 requests per window
  windowMs: 60 * 1000, // 1 minute window
};

/** Earliest time (epoch ms) at which the next eviction pass may run. */
let nextSweepAt = 0;

/**
 * Derives the rate-limit key from an `x-forwarded-for` header value.
 * The header may hold a comma-separated chain ("client, proxy1, proxy2");
 * only the first entry is used so that the same client is not counted under
 * several keys when the proxy chain differs. Returns "unknown" when absent.
 *
 * NOTE(review): this header is client-controlled unless a trusted proxy
 * overwrites it — confirm the deployment's proxy setup.
 */
function clientKeyFromForwardedFor(headerValue: string | null): string {
  const first = headerValue?.split(",")[0]?.trim();
  return first ? first : "unknown";
}

/**
 * Drops counters whose window has expired. Without this the map keeps one
 * entry per client ever seen. Runs at most once per window to keep the
 * per-request cost low.
 */
function sweepExpired(now: number): void {
  if (now < nextSweepAt) return;
  for (const [key, record] of requestCounts) {
    if (now > record.resetAt) requestCounts.delete(key);
  }
  nextSweepAt = now + RATE_LIMIT.windowMs;
}

/**
 * Fixed-window rate limiter.
 *
 * @param request Incoming request; the client is identified by `x-forwarded-for`.
 * @returns A 429 response (with `Retry-After`) when the client exceeded
 *          `RATE_LIMIT.maxRequests` in the current window, otherwise `null`.
 */
export function rateLimit(request: NextRequest): NextResponse | null {
  const now = Date.now();
  sweepExpired(now);

  const ip = clientKeyFromForwardedFor(request.headers.get("x-forwarded-for"));

  let record = requestCounts.get(ip);
  if (!record || now > record.resetAt) {
    record = { count: 0, resetAt: now + RATE_LIMIT.windowMs };
    requestCounts.set(ip, record);
  }

  record.count++;

  if (record.count > RATE_LIMIT.maxRequests) {
    // Tell well-behaved clients when the window reopens.
    const retryAfterSeconds = Math.max(1, Math.ceil((record.resetAt - now) / 1000));
    return NextResponse.json(
      { error: "Rate limit exceeded", message: "Too many requests. Please try again later." },
      { status: 429, headers: { "Retry-After": String(retryAfterSeconds) } },
    );
  }

  return null; // No rate limit hit
}
|
|
```
|
|
|
|
5. **Create global error handler** `src/lib/middleware/error-handler.ts`:
|
|
|
|
```typescript
|
|
import { NextResponse } from "next/server";
|
|
import { logger } from "@/lib/observability/logger";
|
|
|
|
/**
 * Last-resort handler for route failures: logs the failure as
 * "unhandled_error" and builds a generic HTTP 500 JSON response.
 *
 * @param error   Whatever was thrown; non-Error values are stringified.
 * @param context Label for where the failure happened; it is logged and
 *                also echoed in the response body.
 */
export function handleError(error: unknown, context: string): NextResponse {
  let details: { message: string; stack?: string };
  if (error instanceof Error) {
    details = { message: error.message, stack: error.stack };
  } else {
    details = { message: String(error) };
  }

  logger.error("unhandled_error", { context, error: details });

  // The client only ever sees a generic message, never the error itself.
  const body = {
    error: "Internal server error",
    message: "An unexpected error occurred. Please try again later.",
    context,
  };
  return NextResponse.json(body, { status: 500 });
}
|
|
```
|
|
|
|
6. **Add error handling to `/api/upload`**:
|
|
|
|
```typescript
|
|
import { rateLimit } from "@/lib/middleware/rate-limit";
|
|
import { handleError } from "@/lib/middleware/error-handler";
|
|
import { logger } from "@/lib/observability/logger";
|
|
|
|
/**
 * Upload route: applies the rate limiter, logs start and success, and sends
 * any thrown error to the shared error handler.
 */
export async function POST(request: NextRequest) {
  // Reject early when the caller is over the limit.
  const throttled = rateLimit(request);
  if (throttled) {
    return throttled;
  }

  try {
    logger.info("upload_start", {
      ip: request.headers.get("x-forwarded-for"),
    });

    // ... existing upload logic ...

    logger.info("upload_success", {
      imageId,
      fileSize: buffer.length,
    });
    return NextResponse.json({
      imageId,
      tensorShape,
      previewUrl,
    });
  } catch (error) {
    // Anything thrown above becomes a logged, generic 500.
    return handleError(error, "upload");
  }
}
|
|
```
|
|
|
|
7. **Add error handling to `/api/identify`**:
|
|
|
|
```typescript
|
|
/**
 * Identify route: applies the rate limiter, times the identify step, records
 * metrics, and maps failures to either 503 (model unavailable) or 500.
 */
export async function POST(request: NextRequest) {
  const rateLimitError = rateLimit(request);
  if (rateLimitError) return rateLimitError;

  try {
    // NOTE(review): imageId / plantId are not defined in this excerpt. If the
    // existing logic declares them further down, move this log call below
    // that point, otherwise it reads them before they are initialised.
    logger.info("identify_start", { imageId, plantId });

    const startTime = Date.now();

    // ... existing identify logic ...

    const inferenceTimeMs = Date.now() - startTime;
    metrics.recordInference(inferenceTimeMs);

    logger.info("identify_success", {
      imageId,
      inferenceTimeMs,
      topPrediction: predictions[0]?.diseaseId,
      // Chain through `confidence` too, so a prediction without confidence
      // data cannot turn a successful inference into a 500.
      confidence: predictions[0]?.confidence?.adjusted,
    });

    return NextResponse.json({ predictions, metadata });
  } catch (error) {
    metrics.recordError();

    if (error instanceof Error && error.message.includes("not loaded")) {
      // Log before answering, otherwise 503s are counted in metrics but
      // never appear in the logs.
      logger.error("identify_model_unavailable", {
        error: { message: error.message, stack: error.stack },
      });
      return NextResponse.json(
        {
          error: "Model not available",
          message: "ML model failed to load. Please try again later.",
        },
        { status: 503 },
      );
    }

    return handleError(error, "identify");
  }
}
|
|
```
|
|
|
|
8. **Add model status tracking to `model-loader.ts`**:
|
|
|
|
```typescript
|
|
import { metrics } from "@/lib/observability/metrics";
|
|
|
|
/**
 * Loads the TFJS model and records the outcome in the metrics tracker.
 * Falls back to the mock model when TFJS loading throws or yields no model.
 */
async function loadModel(): Promise<PlantDiseaseModel> {
  const startTime = Date.now();

  try {
    const model = await tryLoadTFJS();
    if (model) {
      const loadTimeMs = Date.now() - startTime;
      metrics.setModelStatus(true, loadTimeMs);
      logger.info("model_loaded", { backend: "tfjs", loadTimeMs });
      return model;
    }
    // No exception, but no model either: record why we fall back.
    logger.warn("model_load_failed", {
      backend: "tfjs",
      error: { message: "tryLoadTFJS returned no model" },
    });
  } catch (error) {
    // Pass plain fields rather than the Error itself: an Error's message and
    // stack are not enumerable, so JSON serialisation would log it as {}.
    logger.warn("model_load_failed", {
      backend: "tfjs",
      error:
        error instanceof Error
          ? { message: error.message, stack: error.stack }
          : { message: String(error) },
    });
  }

  // ... fallback to mock ...
  metrics.setModelStatus(false);
  return createMockModel();
}
|
|
```
|
|
|
|
9. **Add cleanup for old uploads**:
|
|
|
|
```typescript
|
|
// src/lib/cleanup.ts
|
|
import fs from "fs/promises";
import path from "path";

import { logger } from "@/lib/observability/logger";
|
|
|
|
const UPLOADS_DIR = path.join(process.cwd(), "public", "uploads");
const MAX_AGE_MS = 24 * 60 * 60 * 1000; // 24 hours
const CLEANUP_INTERVAL_MS = 60 * 60 * 1000; // Every hour

/** Reduces an unknown thrown value to the fields that are safe to log. */
function toLoggableError(error: unknown): { message: string; stack?: string } {
  return error instanceof Error
    ? { message: error.message, stack: error.stack }
    : { message: String(error) };
}

/**
 * Deletes regular files in the uploads directory whose modification time is
 * older than MAX_AGE_MS.
 *
 * - A missing uploads directory is treated as "nothing to clean".
 * - Failures on a single entry are logged and skipped so that one bad file
 *   cannot abort the rest of the sweep.
 * - Entries that are not regular files are left alone.
 */
export async function cleanupOldUploads() {
  let files: string[];
  try {
    files = await fs.readdir(UPLOADS_DIR);
  } catch (error) {
    if ((error as { code?: unknown } | null)?.code === "ENOENT") {
      return;
    }
    throw error;
  }

  const now = Date.now();

  for (const file of files) {
    const filePath = path.join(UPLOADS_DIR, file);
    try {
      const stat = await fs.stat(filePath);
      if (!stat.isFile()) continue;

      const ageMs = now - stat.mtimeMs;
      if (ageMs > MAX_AGE_MS) {
        await fs.unlink(filePath);
        logger.info("upload_cleaned", { file, ageMs });
      }
    } catch (error) {
      logger.warn("upload_cleanup_failed", { file, error: toLoggableError(error) });
    }
  }
}

/** Runs one sweep and logs, rather than leaks, any rejection. */
function runCleanup(): void {
  cleanupOldUploads().catch((error: unknown) => {
    logger.error("upload_cleanup_error", { error: toLoggableError(error) });
  });
}

// Run cleanup when this module is first loaded and then periodically.
if (process.env.NODE_ENV === "production") {
  runCleanup();
  const timer: unknown = setInterval(runCleanup, CLEANUP_INTERVAL_MS);
  // Do not let the cleanup timer alone keep the process alive.
  if (typeof timer === "object" && timer !== null && "unref" in timer) {
    (timer as { unref(): void }).unref();
  }
}
|
|
```
|
|
|
|
10. **Update `next.config.ts`** with security headers (rate limiting is enforced in the route handlers via `rateLimit`, not in this file):
|
|
|
|
```typescript
|
|
const nextConfig = {
  // ... existing config ...

  /** Adds baseline security headers to every API response. */
  async headers() {
    return [
      {
        source: "/api/:path*",
        headers: [
          // Stop browsers from MIME-sniffing responses.
          { key: "X-Content-Type-Options", value: "nosniff" },
          // Disallow rendering responses inside frames.
          { key: "X-Frame-Options", value: "DENY" },
          // The legacy XSS auditor is deprecated and "1; mode=block" can
          // itself introduce vulnerabilities; "0" explicitly disables it.
          // Rely on a Content-Security-Policy for XSS defence instead.
          { key: "X-XSS-Protection", value: "0" },
        ],
      },
    ];
  },
};
|
|
```
|
|
|
|
11. **Add monitoring dashboard** (optional) `src/app/admin/metrics/page.tsx`:
|
|
- Simple page showing inference metrics
|
|
- Model status
|
|
- Recent inference times
|
|
- Error rate
|
|
- Protected by authentication (admin only)
|
|
|
|
12. **Document production checklist** in `docs/production-checklist.md`:
|
|
- Environment variables needed
|
|
- Model deployment steps
|
|
- Monitoring setup
|
|
- Backup strategy
|
|
- Rollback procedure
|
|
|
|
tests:
|
|
|
|
- Unit: rate limiter blocks after max requests
|
|
- Unit: rate limiter resets after window
|
|
- Unit: metrics tracker records inference correctly
|
|
- Unit: metrics tracker computes running average
|
|
- Unit: logger produces valid JSON output
|
|
- Integration: health endpoint returns model status and metrics
|
|
- Integration: rate limit returns 429 after max requests
|
|
- Integration: error handler catches unhandled errors and returns 500
|
|
|
|
acceptance_criteria:
|
|
|
|
- All API routes have rate limiting (10 requests per minute per IP)
|
|
- All API routes have structured logging (JSON format)
|
|
- Health endpoint reports model status, inference metrics, uptime
|
|
- Error handler catches all unhandled errors and returns 500 with clear message
|
|
- Old uploads are cleaned up automatically (24-hour TTL)
|
|
- Metrics tracker records inference time, error count (the error rate can be derived from `totalErrors` and `totalInferences`), and model status
|
|
- Security headers are set (X-Content-Type-Options, X-Frame-Options, X-XSS-Protection)
|
|
- Production checklist is documented
|
|
|
|
validation:
|
|
|
|
- `npx vitest run src/lib/middleware/rate-limit.test.ts`
|
|
- `npx vitest run src/lib/observability/metrics.test.ts`
|
|
- `curl http://localhost:3000/api/health` — returns model status and metrics
|
|
- `curl -X POST http://localhost:3000/api/identify ...` (11 times) — 11th request returns 429
|
|
- Check server logs: JSON-formatted log entries for all requests
|
|
- Wait 25 hours (24-hour TTL plus up to one hourly cleanup interval): uploads older than 24 hours are cleaned up
|
|
|
|
notes:
|
|
|
|
- Rate limiter uses in-memory storage — for multi-instance deployments, use Redis or similar
|
|
- Metrics are in-memory — for persistent metrics, use a time-series database
|
|
- Health endpoint should be monitored by uptime monitoring service (e.g., Pingdom, UptimeRobot)
|
|
- Cleanup runs every hour in production — adjust frequency based on upload volume
|
|
- Security headers are basic — consider adding CSP, HSTS for full security hardening
|
|
- Production checklist should be reviewed before each deployment
|