daily work
This commit is contained in:
869
analysis/fre4806_datadog_sentry_integration.md
Normal file
869
analysis/fre4806_datadog_sentry_integration.md
Normal file
@@ -0,0 +1,869 @@
|
||||
# FRE-4806: Datadog APM + Sentry Integration Implementation Plan
|
||||
|
||||
## Overview
|
||||
|
||||
This document outlines the implementation approach for integrating Datadog APM and Sentry into the FrenoCorp platform. This integration provides comprehensive observability, error tracking, and performance monitoring across all services.
|
||||
|
||||
## Architecture Decision Record (ADR)
|
||||
|
||||
### ADR-0042: Observability Stack Selection
|
||||
|
||||
**Decision:** Integrate Datadog APM for distributed tracing and performance monitoring, combined with Sentry for error tracking and release management.
|
||||
|
||||
**Context:**
|
||||
- Current monitoring relies on basic logging and metrics
|
||||
- No centralized error tracking or distributed tracing
|
||||
- Multiple microservices require coordinated observability
|
||||
- Need to support debugging production issues efficiently
|
||||
|
||||
**Alternatives Considered:**
|
||||
|
||||
| Option | Pros | Cons |
|
||||
|--------|------|------|
|
||||
| Datadog + Sentry | Industry standard, rich ecosystem, excellent DX | Cost at scale |
|
||||
| OpenTelemetry + ELK | Open source, flexible | Higher operational overhead |
|
||||
| New Relic | Good APM, unified platform | Less flexible error tracking |
|
||||
|
||||
**Decision Rationale:**
|
||||
- Datadog APM provides best-in-class distributed tracing
|
||||
- Sentry offers superior developer experience for error tracking
|
||||
- Both have excellent Node.js, TypeScript, and Go support
|
||||
- Integration with existing CI/CD pipelines
|
||||
|
||||
---
|
||||
|
||||
## Implementation Plan
|
||||
|
||||
### Phase 1: Datadog APM Integration
|
||||
|
||||
#### 1.1 Install and Configure Datadog SDK
|
||||
|
||||
**Node.js Services:**
|
||||
```typescript
|
||||
// package.json
|
||||
"dependencies": {
|
||||
"@datadog/pprof": "^1.0.0",
|
||||
"dd-trace": "^5.19.0",
|
||||
}
|
||||
|
||||
// datadog.config.js
|
||||
// `dd-trace.init(...)` is not valid JavaScript (it parses as `dd - trace.init(...)`),
// so bind the package to an identifier first. Load this file before any module
// that should be instrumented.
import tracer from 'dd-trace';

tracer.init({
  service: 'freno-corpservice',
  version: '1.0.0',
  env: process.env.NODE_ENV,
  // Keep every trace (was `sampling: 1.0`, which is not a tracer option).
  sampleRate: 1.0,
  // Propagate context with W3C Trace Context headers (was a `headers` map,
  // which is not a tracer option).
  tracePropagationStyle: ['tracecontext'],
});
|
||||
```
|
||||
|
||||
**Go Services:**
|
||||
```go
|
||||
// go.mod
|
||||
go.mod: require (
|
||||
github.com/DataDog/dd-trace-go/v2 v2.1.0
|
||||
)
|
||||
|
||||
// main.go
|
||||
import (
|
||||
"github.com/DataDog/dd-trace-go/v2/ddtrace/opentelemetry"
|
||||
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
|
||||
)
|
||||
|
||||
func initTracer() {
|
||||
otel.OTelTraceProvider(&otelo.TraceProviderConfig{
|
||||
ServiceName: "freno-corpservice",
|
||||
})
|
||||
}
|
||||
```
|
||||
|
||||
#### 1.2 Configure Tracing Endpoints
|
||||
|
||||
**datadog.yaml configuration:**
|
||||
```yaml
|
||||
# Datadog configuration
|
||||
dd_trace_enabled: true
|
||||
dd_apm_enabled: true
|
||||
dd_api_key: "${DD_API_KEY}"
|
||||
dd_app_key: "${DD_APP_KEY}"
|
||||
dd_site: "datadoghq.com"
|
||||
|
||||
# Tracing configuration
|
||||
dd_tracing_enabled: true
|
||||
dd_trace_sample_rate: 1.0
|
||||
dd_tracing_sampling_rules:
|
||||
- service: "api"
  rate: 1.0
- service: "worker"
  rate: 0.5
- service: "scheduler"
  rate: 0.1
|
||||
|
||||
# Performance monitoring
|
||||
dd_profiling_enabled: true
|
||||
dd_live_metrics: true
|
||||
```
|
||||
|
||||
#### 1.3 Implement Distributed Tracing
|
||||
|
||||
**Request Context Propagation:**
|
||||
```typescript
|
||||
// middleware/tracing.ts
|
||||
import { trace, Span } from '@datadog/pprof';
|
||||
import { createContext } from 'express';
|
||||
|
||||
/**
 * Express middleware that wraps each incoming request in an `http.request` span.
 *
 * The span is stored on `req.span` and is finished exactly once when the
 * response ends, whether it completed normally or the connection was dropped.
 *
 * @param req - Incoming request; method, path, URL and `req.user?.id` are tagged.
 * @param res - Outgoing response whose lifecycle events close the span.
 * @param next - Passes control to the next middleware.
 */
export const tracingMiddleware = (req: Request, res: Response, next: NextFunction) => {
  const span = trace.startSpan('http.request', {
    service: 'api',
    resource: `${req.method} ${req.path}`,
    tags: {
      'http.url': req.url,
      'http.method': req.method,
      'user.id': req.user?.id,
    },
  });

  // Attach span to request context
  req.span = span;

  // 'finish' is not emitted when the client disconnects before the response is
  // fully written, but 'close' is. Listen to both and finish only once so the
  // span can neither leak nor be finished twice.
  let finished = false;
  const finishSpan = () => {
    if (finished) {
      return;
    }
    finished = true;
    span.setTags({ 'http.response.status': res.statusCode });
    span.finish();
  };

  res.on('finish', finishSpan);
  res.on('close', finishSpan);

  next();
};
|
||||
```
|
||||
|
||||
#### 1.4 Database Query Tracing
|
||||
|
||||
**PostgreSQL:**
|
||||
```typescript
|
||||
// middleware/db-tracing.ts
|
||||
import { trace } from '@datadog/pprof';
|
||||
|
||||
/**
 * Runs a PostgreSQL statement through `query` inside a `db.query` span.
 *
 * On success the span is tagged with the elapsed milliseconds and the row
 * count; on failure the error is recorded on the span and rethrown. The span
 * is finished in both cases.
 *
 * @param sql - Statement text; its first 100 characters become the span resource.
 * @param params - Bind parameters forwarded to `query` unchanged.
 * @returns Whatever `query` resolves to.
 */
export const dbTracingMiddleware = async (sql: string, params: unknown[]) => {
  const spanOptions = {
    service: 'database',
    resource: sql.substring(0, 100),
    tags: {
      'db.system': 'postgresql',
      'db.statement': sql,
    },
  };
  const querySpan = trace.startSpan('db.query', spanOptions);

  try {
    const startedAt = Date.now();
    const outcome = await query(sql, params);

    querySpan.setTags({
      'db.query.duration': Date.now() - startedAt,
      'db.query.rows': outcome.rowCount,
    });

    return outcome;
  } catch (failure) {
    querySpan.setError(failure);
    throw failure;
  } finally {
    querySpan.finish();
  }
};
|
||||
```
|
||||
|
||||
**Redis:**
|
||||
```typescript
|
||||
// middleware/redis-tracing.ts
|
||||
import { trace } from '@datadog/pprof';
|
||||
|
||||
/**
 * Executes a single-key Redis command inside a `redis.command` span.
 *
 * @param redis - Client on which `redis[command](key)` is invoked.
 * @param key - Key passed as the only argument; its first 50 characters
 *   appear in the span resource.
 * @param command - Name of the client method to call.
 * @returns Whatever the Redis command resolves to.
 */
export const redisTracingMiddleware = async (redis: Redis, key: string, command: string) => {
  const span = trace.startSpan('redis.command', {
    service: 'cache',
    resource: `${command}:${key.substring(0, 50)}`,
    tags: {
      'redis.key': key,
      'redis.command': command,
    },
  });

  const start = Date.now();
  try {
    const result = await redis[command](key);

    // Only timing is tagged. The cached value itself is deliberately NOT
    // written to the span: it may hold session data or secrets and has no
    // size bound, and span tags are shipped to the APM backend.
    span.setTags({
      'redis.duration': Date.now() - start,
    });

    return result;
  } catch (error) {
    // Record the failure on the span, as the PostgreSQL tracer does.
    span.setError(error);
    throw error;
  } finally {
    span.finish();
  }
};
|
||||
```
|
||||
|
||||
#### 1.5 External Service Tracing
|
||||
|
||||
**HTTP Client Instrumentation:**
|
||||
```typescript
|
||||
// middleware/http-client-tracing.ts
|
||||
import * as http from 'node:http';
import * as https from 'node:https';
import type { RequestOptions } from 'node:https';

import { trace } from '@datadog/pprof';
import { createProxyAgent } from 'http-proxy-agent';
|
||||
|
||||
// Connection-pool settings shared by the exported HTTP agent and the HTTPS
// agent used for traced requests.
const KEEP_ALIVE_OPTIONS = {
  keepAlive: true,
  keepAliveMsecs: 1000,
  maxSockets: 256,
  maxFreeSockets: 256,
};

/** Keep-alive agent for plain-HTTP callers. */
export const httpTracingAgent = new http.Agent(KEEP_ALIVE_OPTIONS);

// `https.request` rejects an `http.Agent`, so traced requests use an HTTPS
// agent with the same pool settings.
const httpsTracingAgent = new https.Agent(KEEP_ALIVE_OPTIONS);

// Idle time after which an outbound request is aborted.
const OUTBOUND_TIMEOUT_MS = 30000;

/**
 * Issues an HTTPS request inside an `http.outbound` span.
 *
 * The span is tagged with the response status and the time until response
 * headers arrive, or with the error when the request fails or times out. It
 * is finished exactly once in every case.
 *
 * The per-socket `tcp.socket` span from the earlier version is gone: its
 * custom agent was never passed to `https.request`, and its `createConnection`
 * opened a plain TCP socket, which cannot carry HTTPS traffic.
 *
 * @param url - Target URL; requests are always sent with `https.request`.
 * @param options - Request options; a caller-supplied `agent` takes precedence.
 * @returns Promise resolving with the response once headers are received.
 */
export const httpTracingMiddleware = (url: URL, options: RequestOptions) => {
  const span = trace.startSpan('http.outbound', {
    service: 'external-api',
    resource: `${url.hostname}:${url.port || 443} ${options.method || 'GET'}`,
    tags: {
      'url': url.href,
      'method': options.method,
    },
  });

  // Captured before the request is created so the duration tag is defined.
  const start = Date.now();

  return new Promise<http.IncomingMessage>((resolve, reject) => {
    // Guards against settling twice (e.g. a timeout racing a late response).
    let settled = false;

    const fail = (err: Error) => {
      if (settled) {
        return;
      }
      settled = true;
      span.setError(err);
      span.finish();
      reject(err);
    };

    const req = https.request(url, { agent: httpsTracingAgent, ...options }, (res) => {
      if (settled) {
        return;
      }
      settled = true;

      span.setTags({
        'http.response.status': res.statusCode,
        'http.response.duration': Date.now() - start,
      });

      span.finish();
      resolve(res);
    });

    req.on('error', fail);

    // A timeout only signals that the socket went idle; the request has to be
    // destroyed explicitly. Destroying with an error routes it through `fail`.
    req.setTimeout(OUTBOUND_TIMEOUT_MS, () => {
      req.destroy(new Error(`Request to ${url.hostname} timed out after ${OUTBOUND_TIMEOUT_MS} ms`));
    });

    req.end();
  });
};
|
||||
```
|
||||
|
||||
#### 1.6 Trace Sampling and Performance
|
||||
|
||||
**Smart Sampling Strategy:**
|
||||
```typescript
|
||||
// config/tracing.config.ts
|
||||
/**
 * Sampling rules for tracing. Each entry is either a fixed rate in [0, 1] or a
 * function returning one. How the rules are combined is decided by the caller
 * and is not shown here.
 */
export const tracingConfig = {
  // Fully trace users whose djb2 hash is divisible by 100; every other user
  // gets 0.0 from this rule. This is a cohort of users, NOT "100% of requests
  // that carry a user_id". Presumably ~1% of users, stable per user, if
  // djb2Hash is deterministic and evenly distributed — confirm.
  sampleRateByUser: (userId: string) => {
    const hash = djb2Hash(userId);
    return hash % 100 === 0 ? 1.0 : 0.0;
  },

  // Sample 10% of error requests for analysis
  sampleRateOnError: 0.1,

  // Sample 5% of slow requests (duration strictly greater than 100ms);
  // requests at or below 100ms get 0.0 from this rule
  sampleRateByDuration: (duration: number) => {
    return duration > 100 ? 0.05 : 0.0;
  },

  // Sample 1% of all requests for load testing
  defaultSampleRate: 0.01,
};
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Phase 2: Sentry Integration
|
||||
|
||||
#### 2.1 Install and Configure Sentry SDK
|
||||
|
||||
**Node.js Configuration:**
|
||||
```typescript
|
||||
// sentry.ts
|
||||
import * as Sentry from '@sentry/node';
|
||||
import { Express } from '@sentry/express';
|
||||
import { NodeProfilingIntegration } from '@sentry/node/integrations';
|
||||
|
||||
// Messages treated as expected operational noise and never sent to Sentry.
// Hoisted so the patterns are not rebuilt for every event.
const KNOWN_ISSUE_PATTERNS = [
  /ECONNREFUSED/,
  /ETIMEDOUT/,
  /Rate limit exceeded/,
];

/**
 * Options for the Sentry Node SDK.
 *
 * NOTE(review): `app` and `pkg` are not defined in this snippet; they must be
 * in scope where this file is used.
 * NOTE(review): `ignoreUrls` / `includeUrls` on the Http integration are kept
 * as written — confirm they are supported by the installed SDK version.
 */
const sentryConfig: Sentry.NodeOptions = {
  dsn: process.env.SENTRY_DSN,
  environment: process.env.NODE_ENV,
  release: `freno-corp@${pkg.version}-${process.env.GIT_SHA || 'local'}`,
  tracesSampleRate: process.env.NODE_ENV === 'production' ? 0.1 : 1.0,
  profilesSampleRate: 1.0,

  // Integrations
  integrations: [
    // The Express integration takes the application under the `app` key.
    new Sentry.Integrations.Express({ app }),
    new NodeProfilingIntegration(),
    new Sentry.Integrations.Http({
      tracing: true,
      // Exclude internal calls
      ignoreUrls: [
        /\/api\/internal\//,
        /\/health\//,
        /\/metrics\//,
      ],
      // Include external API calls
      includeUrls: [
        /\/api\/external\//,
        /\/api\/partner\//,
      ],
    }),
  ],

  // Performance monitoring
  beforeSendTransaction(event: Sentry.TransactionEvent) {
    // Filter out internal transactions. `transaction` is optional, so guard
    // the access instead of throwing on unnamed transactions.
    if (event.transaction?.startsWith('/internal')) {
      return null;
    }
    return event;
  },

  // Error filtering
  beforeSend(event: Sentry.Event, hint: Sentry.EventHint) {
    // A captured exception carries its text in `exception.values[].value` and
    // on the original error, not in `event.message`, so inspect all three.
    const candidates: string[] = [];
    if (event.message) {
      candidates.push(String(event.message));
    }
    for (const exception of event.exception?.values ?? []) {
      if (exception.value) {
        candidates.push(exception.value);
      }
    }
    const original = hint?.originalException;
    if (original instanceof Error) {
      candidates.push(original.message);
    }

    // Filter out known issues
    const isKnownIssue = candidates.some(text =>
      KNOWN_ISSUE_PATTERNS.some(pattern => pattern.test(text))
    );
    if (isKnownIssue) {
      return null;
    }

    return event;
  },
};
|
||||
|
||||
/** Initialises the Sentry SDK with the module-level `sentryConfig`. */
export function initSentry(): void {
  Sentry.init(sentryConfig);
}
|
||||
```
|
||||
|
||||
#### 2.2 React/Next.js Integration
|
||||
|
||||
**Error Boundaries:**
|
||||
```typescript
|
||||
// components/SentryErrorBoundary.tsx
|
||||
import * as Sentry from '@sentry/react';
|
||||
import React, { Component, ErrorInfo, ReactNode } from 'react';
|
||||
|
||||
/** Props accepted by {@link SentryErrorBoundary}. */
interface Props {
  /** Subtree protected by the boundary. */
  children: ReactNode;
  /** Rendered instead of `children` after an error; a default is used when falsy. */
  fallback?: ReactNode;
}

/** Internal state: whether an error was caught, and which one. */
interface State {
  hasError: boolean;
  error: Error | null;
}

/**
 * React error boundary that reports render-time errors from its subtree to
 * Sentry (with the React component stack attached) and then shows a fallback.
 */
export class SentryErrorBoundary extends Component<Props, State> {
  state: State = { hasError: false, error: null };

  /** Switches the boundary into its error state when a descendant throws. */
  static getDerivedStateFromError(error: Error): State {
    return { hasError: true, error };
  }

  /** Forwards the caught error and its component stack to Sentry. */
  componentDidCatch(error: Error, errorInfo: ErrorInfo) {
    const { componentStack } = errorInfo;
    Sentry.captureException(error, {
      contexts: {
        react: { componentStack },
      },
    });
  }

  render() {
    if (!this.state.hasError) {
      return this.props.children;
    }
    return this.props.fallback || <SentryErrorFallback />;
  }
}
|
||||
```
|
||||
|
||||
**Global Error Handler:**
|
||||
```typescript
|
||||
// middleware/global-error-handler.ts
|
||||
/**
 * Express error-handling middleware: reports the error to Sentry, marks the
 * request's Datadog span (when one is attached as `req.span`) and replies with
 * a JSON error body.
 *
 * @param err - Error raised by an earlier handler; a numeric `statusCode` in
 *   the 400-599 range is used as the HTTP status, anything else becomes 500.
 * @param req - Request that failed.
 * @param res - Response used to send the JSON body.
 * @param next - Used to delegate to Express when the response already started.
 */
export const errorHandler = (err: Error, req: Request, res: Response, next: NextFunction) => {
  // Capture error in Sentry
  Sentry.captureException(err, {
    extra: {
      url: req.url,
      method: req.method,
      userAgent: req.headers['user-agent'],
    },
  });

  // Log to Datadog
  const span = req.span;
  if (span) {
    span.setError(err);
    span.setTag('error', 'unhandled');
  }

  // Once headers are sent a new status/body cannot be written; hand the error
  // to Express's default handler, which closes the connection.
  if (res.headersSent) {
    return next(err);
  }

  // Standard error handling. `statusCode` is not part of `Error`, so read it
  // defensively and fall back to 500 for anything that is not an error status.
  const candidate = (err as Error & { statusCode?: unknown }).statusCode;
  const statusCode =
    typeof candidate === 'number' && Number.isInteger(candidate) && candidate >= 400 && candidate <= 599
      ? candidate
      : 500;

  res.status(statusCode).json({
    error: err.message,
    ...(process.env.NODE_ENV === 'development' && { stack: err.stack }),
  });
};
|
||||
```
|
||||
|
||||
#### 2.3 Browser SDK Configuration
|
||||
|
||||
**Next.js Configuration:**
|
||||
```typescript
|
||||
// next.config.js
|
||||
/** @type {import('next').NextConfig} */
|
||||
const nextConfig = {
  // Keep the Sentry SDK out of the server-components bundle.
  experimental: {
    serverComponentsExternalPackages: ['@sentry/nextjs'],
  },
  // Values copied from the build environment into the application.
  env: {
    SENTRY_DSN: process.env.SENTRY_DSN,
  },
};

export { nextConfig as default };
|
||||
```
|
||||
|
||||
**Sentry Browser SDK:**
|
||||
```typescript
|
||||
// components/Sentry.tsx
|
||||
'use client';
|
||||
|
||||
import * as Sentry from '@sentry/browser';
|
||||
import { ReactRouter6BrowserTracingIntegration } from '@sentry/react';
|
||||
|
||||
// Browser-side Sentry initialisation. Runs once when this module is loaded.
//
// NOTE(review): `pkg` is not defined in this snippet.
// NOTE(review): `process.env.GIT_SHA` has no NEXT_PUBLIC_ prefix, so it is
// presumably undefined in the browser bundle and the release always ends in
// "-local" — confirm.
Sentry.init({
  dsn: process.env.NEXT_PUBLIC_SENTRY_DSN,
  environment: process.env.NEXT_PUBLIC_ENV,
  release: `freno-corp@${pkg.version}-${process.env.GIT_SHA || 'local'}`,

  tracesSampleRate: 1.0,

  integrations: [
    // The previous version passed `useRouter()` here. That is a React hook
    // and cannot be called at module scope (nor was it imported), so use the
    // SDK's standard browser tracing integration instead.
    Sentry.browserTracingIntegration(),
  ],

  // Performance monitoring
  beforeSendTransaction(event) {
    // Filter sensitive endpoints. The transaction name lives on
    // `event.transaction` (optional), not `event.name`.
    if (/(token|secret|password)/i.test(event.transaction ?? '')) {
      return null;
    }
    return event;
  },
});
|
||||
```
|
||||
|
||||
#### 2.4 React Query Integration
|
||||
|
||||
**Automatic Tracking:**
|
||||
```typescript
|
||||
// hooks/useSentryQuery.ts
|
||||
import { useQuery, UseQueryOptions } from '@tanstack/react-query';
|
||||
import * as Sentry from '@sentry/react';
|
||||
|
||||
/**
 * React Query hook with automatic Sentry integration.
 *
 * Query failures are reported to Sentry, tagged with the serialised query key,
 * unless the error carries a 4xx `statusCode` (client errors are expected and
 * not reported). A caller-supplied `options.onError` still runs afterwards.
 *
 * @param queryKey - Query key; also serialised into the Sentry `query` tag.
 * @param queryFn - Function that fetches the data.
 * @param options - Extra `useQuery` options.
 * @returns The `useQuery` result.
 */
export function useSentryQuery<TData, TError = Error>(
  queryKey: unknown[],
  queryFn: () => Promise<TData>,
  options?: UseQueryOptions<TData, TError>
) {
  return useQuery<TData, TError>(
    queryKey,
    queryFn,
    {
      ...options,
      onError: (error) => {
        // Only capture non-4xx errors: errors without a status code and
        // server-side (5xx) failures are reported, client errors are not.
        const statusCode = (error as { statusCode?: unknown } | null)?.statusCode;
        const isClientError =
          typeof statusCode === 'number' && statusCode >= 400 && statusCode < 500;

        if (error instanceof Error && !isClientError) {
          Sentry.captureException(error, {
            tags: {
              query: JSON.stringify(queryKey),
            },
          });
        }

        // The spread above would otherwise be shadowed by this handler, so
        // invoke the caller's handler explicitly.
        options?.onError?.(error);
      },
    }
  );
}
|
||||
```
|
||||
|
||||
#### 2.5 Component Performance Monitoring
|
||||
|
||||
**Component Profiling:**
|
||||
```typescript
|
||||
// components/ProfiledComponent.tsx
|
||||
import * as Sentry from '@sentry/react';
|
||||
import { createProfiler } from '@sentry/profiling';
|
||||
|
||||
/**
 * Wrap a component for Sentry profiling.
 *
 * Delegates to `Sentry.withProfiler`, the SDK's higher-order component for
 * React component tracking, so no profiler state is managed by hand.
 *
 * @param Component - Component to wrap.
 * @param name - Name under which the component is reported to Sentry.
 * @returns A component that accepts the same props as `Component`.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- mirrors the props constraint of Sentry.withProfiler
export function ProfiledComponent<TProps extends Record<string, any>>(
  Component: React.ComponentType<TProps>,
  name: string
) {
  return Sentry.withProfiler(Component, { name });
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Phase 3: Unified Observability
|
||||
|
||||
#### 3.1 Correlate Datadog and Sentry Data
|
||||
|
||||
**Request Correlation:**
|
||||
```typescript
|
||||
// middleware/correlation.ts
|
||||
import { trace } from '@datadog/pprof';
|
||||
import * as Sentry from '@sentry/node';
|
||||
|
||||
// Accept only short, header-safe IDs from upstream callers.
const CORRELATION_ID_PATTERN = /^[\w-]{1,128}$/;

/**
 * Express middleware that gives every request a correlation ID and records it
 * on both a Datadog span and a Sentry span.
 *
 * A well-formed incoming `X-Correlation-ID` header is reused so one ID can
 * follow a request across services; otherwise a new UUID is generated. The ID
 * is stored on `req.correlationId` and echoed in the response header.
 *
 * @param req - Incoming request.
 * @param res - Outgoing response; its end closes both spans.
 * @param next - Passes control to the next middleware.
 */
export const correlationMiddleware = (req: Request, res: Response, next: NextFunction) => {
  // Reuse or generate correlation ID
  const incoming = req.headers['x-correlation-id'];
  const supplied = Array.isArray(incoming) ? incoming[0] : incoming;
  const correlationId =
    typeof supplied === 'string' && CORRELATION_ID_PATTERN.test(supplied) ? supplied : uuidv4();

  // Store correlation ID in request context
  req.correlationId = correlationId;

  // Set correlation headers
  res.setHeader('X-Correlation-ID', correlationId);

  // Start Datadog trace
  const ddSpan = trace.startSpan('http.request', {
    service: 'api',
    resource: `${req.method} ${req.path}`,
    tags: {
      'correlation.id': correlationId,
    },
  });

  // Create Sentry span. `Sentry.startSpan` requires a callback and ends the
  // span when that callback returns, so a manually-ended span is used to
  // cover the whole request.
  const sentrySpan = Sentry.startInactiveSpan({
    op: 'http.server',
    name: req.method + ' ' + req.url,
    attributes: {
      'http.request.method': req.method,
      'http.request.url': req.url,
      'correlation.id': correlationId,
    },
  });

  // 'finish' is skipped when the client disconnects early, while 'close'
  // still fires; handle both and close the spans only once.
  let finished = false;
  const finishSpans = () => {
    if (finished) {
      return;
    }
    finished = true;
    ddSpan.setTags({
      'http.response.status': res.statusCode,
    });
    ddSpan.finish();
    sentrySpan?.end();
  };

  res.on('finish', finishSpans);
  res.on('close', finishSpans);

  next();
};
|
||||
```
|
||||
|
||||
#### 3.2 Unified Metrics Dashboard
|
||||
|
||||
**Metrics Collection:**
|
||||
```typescript
|
||||
// lib/metrics.ts
|
||||
import { trace } from '@datadog/pprof';
|
||||
import * as Sentry from '@sentry/node';
|
||||
|
||||
/**
 * Unified metrics that send to both Datadog and Sentry.
 *
 * Each metric name gets one `Datadog.Meter`, created on first use and reused
 * for every later call with the same name.
 */
export class UnifiedMetrics {
  private ddMeters: Map<string, Datadog.Meter> = new Map();

  /**
   * Returns the meter for `name`, creating and caching it on first use.
   * (Previously a new meter was built on every call because it was never
   * stored in the map.)
   */
  private meterFor(name: string): Datadog.Meter {
    let meter = this.ddMeters.get(name);
    if (!meter) {
      meter = new Datadog.Meter(name);
      this.ddMeters.set(name, meter);
    }
    return meter;
  }

  /**
   * Adds `value` (default 1) to the counter `name` in both backends.
   */
  incrementCounter(name: string, value: number = 1, tags?: Record<string, string>) {
    // Datadog
    this.meterFor(name).increment(value, tags);

    // Sentry
    Sentry.metrics.increment(name, value, { tags });
  }

  /**
   * Records one sample of `value`, expressed in `unit`, for the distribution `name`.
   */
  distribution(name: string, value: number, unit: string, tags?: Record<string, string>) {
    // Datadog
    this.meterFor(name).distribution(value, unit, tags);

    // Sentry
    Sentry.metrics.distribution(name, value, { unit, tags });
  }

  /**
   * Sets the gauge `name` to `value` in both backends.
   */
  gauge(name: string, value: number, tags?: Record<string, string>) {
    // Datadog
    this.meterFor(name).gauge(value, tags);

    // Sentry
    Sentry.metrics.gauge(name, value, { tags });
  }
}
|
||||
|
||||
// Usage
|
||||
const metrics = new UnifiedMetrics();
|
||||
|
||||
// In middleware
|
||||
/**
 * Express middleware that records how long each request took, in
 * milliseconds, as the `http.request.duration` distribution.
 *
 * @param req - Incoming request; method and path become metric tags.
 * @param res - Outgoing response; the metric is emitted when it finishes.
 * @param next - Passes control to the next middleware.
 */
export const metricsMiddleware = (req: Request, res: Response, next: NextFunction) => {
  const startTime = Date.now();

  // Track request duration once the response has been sent. Measuring here,
  // before `next()` runs, would always report roughly 0 ms.
  res.on('finish', () => {
    metrics.distribution(
      'http.request.duration',
      Date.now() - startTime,
      'ms',
      {
        'http.method': req.method,
        'http.path': req.path,
        // The correlation ID is intentionally not a metric tag: it is unique
        // per request, so every request would create its own time series.
        // It remains available on the request's trace spans.
      }
    );
  });

  next();
};
|
||||
```
|
||||
|
||||
#### 3.3 Alerting Configuration
|
||||
|
||||
**Datadog Alerts:**
|
||||
```yaml
|
||||
# datadog-alerts.yaml
|
||||
alerts:
|
||||
- name: 'High Error Rate'
|
||||
type: 'threshold'
|
||||
query: 'last:1m'
|
||||
conditions:
|
||||
- metric: 'http.errors'
|
||||
operator: 'gt'
|
||||
value: 5
|
||||
notifications:
|
||||
- type: 'email'
|
||||
to: 'platform-team@freno.corp'
|
||||
- type: 'slack'
|
||||
channel: '#platform-alerts'
|
||||
|
||||
- name: 'Slow API Response'
|
||||
type: 'threshold'
|
||||
query: 'last:1m'
|
||||
conditions:
|
||||
- metric: 'http.response_time.p99'
|
||||
operator: 'gt'
|
||||
value: 1000
|
||||
notifications:
|
||||
- type: 'pagerduty'
|
||||
service: 'platform-oncall'
|
||||
|
||||
- name: 'Database Connection Pool Exhaustion'
|
||||
type: 'threshold'
|
||||
query: 'last:1m'
|
||||
conditions:
|
||||
- metric: 'db.connections.active'
|
||||
operator: 'gt'
|
||||
value: 95
|
||||
notifications:
|
||||
- type: 'slack'
|
||||
channel: '#database-alerts'
|
||||
```
|
||||
|
||||
**Sentry Alerts:**
|
||||
```typescript
|
||||
// config/sentry-alerts.ts
|
||||
import * as Sentry from '@sentry/node';
|
||||
|
||||
// Placeholder for transaction-level alert hooks. The handler currently passes
// every transaction through unchanged.
// NOTE(review): this is a second `Sentry.init` call — presumably these options
// are meant to be merged into the main SDK configuration; confirm.
Sentry.init({
  // ... other config

  // Error rate alerting
  beforeSendTransaction: (event) => {
    const isErrorsEndpoint = event.transaction === '/api/errors';
    if (isErrorsEndpoint) {
      // Custom Sentry alert logic
    }
    return event;
  },
});
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Timeline
|
||||
|
||||
| Phase | Tasks | Duration | Dependencies |
|
||||
|------|-------|----------|-------------|
|
||||
| **Phase 1** | Datadog APM setup | 2-3 days | None |
|
||||
| | Tracing middleware | 1-2 days | Phase 1.1 |
|
||||
| | Database/Cache tracing | 1-2 days | Phase 1.1 |
|
||||
| | External service tracing | 1-2 days | Phase 1.1 |
|
||||
| **Phase 2** | Sentry setup | 1-2 days | None |
|
||||
| | React/Next.js integration | 2-3 days | Phase 2.1 |
|
||||
| | Error boundaries | 1-2 days | Phase 2.1 |
|
||||
| | Browser SDK | 1 day | Phase 2.1 |
|
||||
| **Phase 3** | Correlation layer | 1-2 days | Phase 1, 2 |
|
||||
| | Unified metrics | 1-2 days | Phase 1, 2 |
|
||||
| | Alerting setup | 1 day | Phase 3.1, 3.2 |
|
||||
| **Phase 4** | Testing | 2-3 days | All phases |
|
||||
| | Documentation | 1-2 days | All phases |
|
||||
|
||||
**Total Estimated Time: 16-27 days** (sum of the task estimates above, assuming the tasks run sequentially)
|
||||
|
||||
---
|
||||
|
||||
## Verification Checklist
|
||||
|
||||
### Phase 1: Datadog
|
||||
- [ ] SDK installed and configured
|
||||
- [ ] Tracing enabled on all services
|
||||
- [ ] Distributed tracing working (trace ID propagates)
|
||||
- [ ] Database queries traced
|
||||
- [ ] External API calls traced
|
||||
- [ ] Sampling rules configured
|
||||
- [ ] Metrics visible in Datadog dashboard
|
||||
- [ ] Profiling enabled
|
||||
|
||||
### Phase 2: Sentry
|
||||
- [ ] SDK installed and configured
|
||||
- [ ] Error tracking working
|
||||
- [ ] Performance monitoring active
|
||||
- [ ] React/Next.js integration complete
|
||||
- [ ] Error boundaries functional
|
||||
- [ ] Browser SDK tracking user interactions
|
||||
- [ ] Release tracking enabled
|
||||
|
||||
### Phase 3: Unified
|
||||
- [ ] Correlation IDs working
|
||||
- [ ] Metrics synchronized
|
||||
- [ ] Alerts configured and tested
|
||||
- [ ] Dashboard accessible
|
||||
|
||||
---
|
||||
|
||||
## Rollback Plan
|
||||
|
||||
If issues arise during or after implementation:
|
||||
|
||||
1. **Disable tracing:**
|
||||
```bash
|
||||
# Set sampling rate to 0
|
||||
export DD_TRACE_SAMPLE_RATE=0
|
||||
export SENTRY_TRACES_SAMPLE_RATE=0
|
||||
```
|
||||
|
||||
2. **Remove SDKs:**
|
||||
```bash
|
||||
# Uninstall packages
|
||||
npm uninstall dd-trace @sentry/node
|
||||
# Remove initialization code
|
||||
```
|
||||
|
||||
3. **Restore from backup:**
|
||||
```bash
|
||||
git checkout HEAD~1 -- lib/tracing/ config/*.ts
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Cost Estimation
|
||||
|
||||
| Service | Monthly Cost (1M transactions) | Notes |
|
||||
|---------|-------------------------------|-------|
|
||||
| Datadog APM | ~$1,000 | Includes tracing, metrics, profiling |
|
||||
| Datadog Logs | ~$500 | Log ingestion and retention |
|
||||
| Sentry | ~$249 | Error tracking and release management |
|
||||
| **Total** | **~$1,749** | Scales with usage |
|
||||
|
||||
*Costs subject to change based on actual usage and feature requirements.*
|
||||
|
||||
---
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. ✅ **Create technical analysis document** (current task)
|
||||
2. ✅ **Create implementation plan** (this document)
|
||||
3. ⏳ **Implement Datadog APM integration**
|
||||
4. ⏳ **Implement Sentry integration**
|
||||
5. ⏳ **Configure unified observability**
|
||||
6. ⏳ **Test and validate**
|
||||
7. ⏳ **Deploy to staging**
|
||||
8. ⏳ **Production rollout**
|
||||
|
||||
---
|
||||
|
||||
**Document Author:** CTO (Agent)
|
||||
**Date:** 2026-05-11
|
||||
**Status:** Implementation Plan Complete
|
||||
Reference in New Issue
Block a user