autotel
Version:
Write Once, Observe Anywhere
738 lines (674 loc) • 21.9 kB
text/typescript
/**
* Sampling Strategies
*
* Provides intelligent sampling beyond simple random rates.
* Helps reduce telemetry costs while capturing critical data.
*
* Key strategies:
* - Always trace errors and slow requests (critical for debugging)
* - Sample by user ID for consistent request tracing
* - Adaptive sampling based on load
* - Sample by feature flags for A/B testing correlation
*
* @example
* ```typescript
* import { AdaptiveSampler } from './sampling'
*
* @Instrumented({
* serviceName: 'user',
* sampler: new AdaptiveSampler({ baselineSampleRate: 0.1 }) // 10% baseline, 100% on errors and slow requests
* })
* class UserService { }
* ```
*/
import type { Link, Attributes } from '@opentelemetry/api';
import { TraceFlags } from '@opentelemetry/api';
import { type Logger } from './logger';
/**
 * Attribute key for the tail-sampling "keep" marker.
 *
 * Autotel-internal name; it is not part of the OpenTelemetry semantic conventions.
 */
export const AUTOTEL_SAMPLING_TAIL_KEEP = 'autotel.sampling.tail.keep';

/**
 * Attribute key for the tail-sampling "evaluated" marker.
 *
 * Autotel-internal name; it is not part of the OpenTelemetry semantic conventions.
 */
export const AUTOTEL_SAMPLING_TAIL_EVALUATED = 'autotel.sampling.tail.evaluated';
/**
* Sampler interface - return true to trace, false to skip
*
* `shouldSample()` is the only required member. `needsTailSampling()` and
* `shouldKeepTrace()` are optional and work as a pair: together they let a
* sampler postpone its final decision until the operation's result is known.
*/
export interface Sampler {
/**
* Decide whether to trace this operation
*
* Receives only the sampling context (no operation result), so the decision
* cannot depend on the operation's outcome.
*
* @param context - Sampling context
* @returns true to trace, false to skip
*/
shouldSample(context: SamplingContext): boolean;
/**
* Whether this sampler needs tail sampling (post-execution decision)
* If true, spans are always created and shouldKeepTrace() is called after execution
*
* NOTE(review): how callers treat samplers that omit this method is not
* visible in this file; presumably the same as returning false (confirm).
*
* @returns true if this sampler needs to evaluate after operation completes
*/
needsTailSampling?(): boolean;
/**
* Re-evaluate sampling decision after operation completes (tail sampling)
* Only called if needsTailSampling() returns true
*
* @param context - Sampling context
* @param result - Operation result
* @returns true if this trace should be kept, false to drop it
*/
shouldKeepTrace?(context: SamplingContext, result: OperationResult): boolean;
}
/**
* Context information for sampling decisions
*
* Passed to `Sampler.shouldSample()` and, for tail sampling, to
* `Sampler.shouldKeepTrace()`.
*/
export interface SamplingContext {
/** Operation name (included in the samplers' debug log entries) */
operationName: string;
/**
* Method arguments (for extracting user IDs, etc.)
*
* AdaptiveSampler also uses this array object as a WeakMap key to find its
* baseline decision again in shouldKeepTrace(), so the same array instance
* must be supplied to both calls for that lookup to succeed.
*/
args: unknown[];
/** Optional metadata (e.g., feature flags, request headers) */
metadata?: Record<string, unknown>;
/** Optional span links for links-based sampling */
links?: Link[];
}
/**
* Result of a trace operation (for post-execution sampling)
*
* Supplied to `Sampler.shouldKeepTrace()` once the operation has finished.
*/
export interface OperationResult {
/** Whether the operation succeeded (false makes AdaptiveSampler treat it as an error) */
success: boolean;
/** Duration in milliseconds (AdaptiveSampler compares it against its slow threshold) */
duration: number;
/** Error if operation failed (its message is included in AdaptiveSampler's debug log) */
error?: Error;
}
/**
 * Simple random sampler.
 *
 * Every call to `shouldSample()` draws an independent `Math.random()` value and
 * samples when that value is below the configured rate. A rate of `0` therefore
 * never samples and a rate of `1` always samples.
 *
 * @example
 * ```typescript
 * new RandomSampler(0.1) // Sample 10% of requests
 * ```
 */
export class RandomSampler implements Sampler {
  /**
   * @param sampleRate - Probability of sampling an operation, from 0 to 1 inclusive
   * @throws Error if `sampleRate` is NaN or lies outside the range 0..1
   */
  constructor(private readonly sampleRate: number) {
    // Negated range check: every comparison with NaN is false, so the
    // `rate < 0 || rate > 1` form would let NaN through and silently yield a
    // sampler that never samples.
    if (!(sampleRate >= 0 && sampleRate <= 1)) {
      throw new Error('Sample rate must be between 0 and 1');
    }
  }

  /**
   * Make an independent random decision; the context is not consulted.
   *
   * @returns true with probability `sampleRate`
   */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  shouldSample(_context: SamplingContext): boolean {
    return Math.random() < this.sampleRate;
  }
}
/**
 * Sampler that traces every operation (100% sampling).
 */
export class AlwaysSampler implements Sampler {
  /**
   * @returns always `true`; the context is ignored
   */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  shouldSample(_context: SamplingContext): boolean {
    return true;
  }
}
/**
 * Sampler that traces nothing (0% sampling).
 */
export class NeverSampler implements Sampler {
  /**
   * @returns always `false`; the context is ignored
   */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  shouldSample(_context: SamplingContext): boolean {
    return false;
  }
}
/**
 * Adaptive sampler that always traces errors and slow requests
 *
 * This is the recommended sampler for production use.
 * It ensures you never miss critical issues while keeping costs down.
 *
 * Strategy:
 * - Always trace errors (critical for debugging)
 * - Always trace slow requests (performance issues)
 * - Use baseline sample rate for successful fast requests
 *
 * **IMPORTANT - Tail Sampling Requirement:**
 * This sampler uses tail sampling (makes decisions AFTER execution).
 * You MUST use TailSamplingSpanProcessor for it to work correctly:
 *
 * - If using initInstrumentation(): TailSamplingSpanProcessor is auto-configured
 * - If using custom TracerProvider: You MUST manually register TailSamplingSpanProcessor
 *
 * Without TailSamplingSpanProcessor, ALL spans are exported (defeating the cost savings).
 *
 * @see TailSamplingSpanProcessor
 * @see README.md "Tail Sampling with Custom Providers" section
 *
 * @example
 * ```typescript
 * new AdaptiveSampler({
 *   baselineSampleRate: 0.1,   // 10% of normal requests
 *   slowThresholdMs: 1000,     // Requests >= 1s are "slow"
 *   alwaysSampleErrors: true,  // Always trace errors
 *   alwaysSampleSlow: true     // Always trace slow requests
 * })
 * ```
 */
export class AdaptiveSampler implements Sampler {
  private readonly baselineSampleRate: number;
  private readonly slowThresholdMs: number;
  private readonly alwaysSampleErrors: boolean;
  private readonly alwaysSampleSlow: boolean;
  private readonly linksBased: boolean;
  private readonly linksRate: number;
  private readonly logger?: Logger;

  // Baseline (random) decision taken in shouldSample(), keyed by the context's
  // args array so shouldKeepTrace() can look it up again for the same call.
  private readonly samplingDecisions = new WeakMap<unknown[], boolean>();

  /**
   * @param options - Sampler configuration; every field is optional
   * @throws Error if `baselineSampleRate` or `linksRate` is NaN or outside 0..1
   */
  constructor(
    options: {
      /** Fraction of ordinary (successful, fast) operations to keep; default 0.1 */
      baselineSampleRate?: number;
      /** Duration in ms at or above which an operation counts as slow; default 1000 */
      slowThresholdMs?: number;
      /** Keep every failed operation; default true */
      alwaysSampleErrors?: boolean;
      /** Keep every slow operation; default true */
      alwaysSampleSlow?: boolean;
      /** Enable links-based sampling for event-driven architectures */
      linksBased?: boolean;
      /** Sampling rate for spans linked to sampled spans (0.0-1.0) */
      linksRate?: number;
      logger?: Logger;
    } = {},
  ) {
    this.baselineSampleRate = options.baselineSampleRate ?? 0.1;
    this.slowThresholdMs = options.slowThresholdMs ?? 1000;
    this.alwaysSampleErrors = options.alwaysSampleErrors ?? true;
    this.alwaysSampleSlow = options.alwaysSampleSlow ?? true;
    this.linksBased = options.linksBased ?? false;
    this.linksRate = options.linksRate ?? 1;
    this.logger = options.logger;

    // Negated range checks so that NaN (all comparisons false) is rejected too.
    if (!(this.baselineSampleRate >= 0 && this.baselineSampleRate <= 1)) {
      throw new Error('Baseline sample rate must be between 0 and 1');
    }
    if (!(this.linksRate >= 0 && this.linksRate <= 1)) {
      throw new Error('Links rate must be between 0 and 1');
    }
  }

  /**
   * AdaptiveSampler ALWAYS needs tail sampling to implement error/slow capture.
   *
   * @returns always `true`
   */
  needsTailSampling(): boolean {
    return true;
  }

  /**
   * Record the baseline decision for this call and request span creation.
   *
   * For tail sampling, spans are optimistically created for all requests; the
   * real decision happens in shouldKeepTrace() after execution.
   *
   * @param context - Sampling context; `context.args` is used as the lookup key
   * @returns always `true`
   */
  shouldSample(context: SamplingContext): boolean {
    const baselineDecision = Math.random() < this.baselineSampleRate;
    this.samplingDecisions.set(context.args, baselineDecision);
    // Always create the span (tail sampling will decide if we keep it)
    return true;
  }

  /**
   * Check if any links point to sampled spans.
   *
   * A span is considered linked to a sampled span if any of its links
   * have trace_flags with the sampled bit set (0x01).
   *
   * @param links - Array of span links to check
   * @returns true if any linked span is sampled, false otherwise
   */
  hasSampledLink(links: Link[]): boolean {
    if (!links || links.length === 0) {
      return false;
    }
    return links.some(
      (link) =>
        link.context && (link.context.traceFlags & TraceFlags.SAMPLED) !== 0,
    );
  }

  /**
   * Re-evaluate sampling decision after operation completes
   *
   * Rules are applied in this order, first match wins:
   * 1. failed operation and `alwaysSampleErrors` → keep
   * 2. duration >= `slowThresholdMs` and `alwaysSampleSlow` → keep
   * 3. `linksBased` and a link to a sampled span → keep with probability `linksRate`
   * 4. otherwise → the baseline decision recorded by shouldSample()
   *    (false when no decision was recorded for `context.args`)
   *
   * A debug entry is logged only when a rule keeps a trace that the baseline
   * decision alone would have dropped.
   *
   * @param context - Sampling context
   * @param result - Operation result
   * @returns true if this operation should be kept (not discarded)
   */
  shouldKeepTrace(context: SamplingContext, result: OperationResult): boolean {
    const baselineDecision = this.samplingDecisions.get(context.args) ?? false;

    // Always keep errors
    if (this.alwaysSampleErrors && !result.success) {
      if (!baselineDecision) {
        this.logger?.debug(
          {
            operation: context.operationName,
            error: result.error?.message,
          },
          'Adaptive sampling: Keeping error trace',
        );
      }
      return true;
    }

    // Always keep slow requests
    if (this.alwaysSampleSlow && result.duration >= this.slowThresholdMs) {
      if (!baselineDecision) {
        this.logger?.debug(
          {
            operation: context.operationName,
            duration: result.duration,
          },
          'Adaptive sampling: Keeping slow trace',
        );
      }
      return true;
    }

    // Check for sampled links (links-based sampling for event-driven systems)
    if (
      this.linksBased &&
      context.links &&
      this.hasSampledLink(context.links)
    ) {
      // Use linksRate to decide whether to keep the linked span
      const keepLinked = Math.random() < this.linksRate;
      if (keepLinked && !baselineDecision) {
        this.logger?.debug(
          {
            operation: context.operationName,
            linkCount: context.links.length,
          },
          'Adaptive sampling: Keeping trace due to sampled link',
        );
      }
      return keepLinked;
    }

    // Otherwise, use baseline decision
    return baselineDecision;
  }
}
/**
 * User-based sampler for consistent tracing
 *
 * Always samples requests from specific user IDs.
 * Useful for debugging specific user issues or monitoring VIP users.
 *
 * For every other user the decision is derived from a hash of the user ID, so
 * a given user is either always or never sampled at a given baseline rate.
 * Requests without a user ID fall back to plain random sampling.
 *
 * @example
 * ```typescript
 * new UserIdSampler({
 *   baselineSampleRate: 0.01,          // 1% of normal users
 *   alwaysSampleUsers: ['vip_123'],    // Always trace VIP users
 *   // Extract user ID from first arg
 *   extractUserId: (args) => (args[0] as { userId?: string } | undefined)?.userId
 * })
 * ```
 */
export class UserIdSampler implements Sampler {
  private baselineSampleRate: number;
  private alwaysSampleUsers: Set<string>;
  private extractUserId: (args: unknown[]) => string | undefined;
  private logger?: Logger;

  /**
   * @param options.baselineSampleRate - Fraction of ordinary users to sample; default 0.1
   * @param options.alwaysSampleUsers - User IDs that are always sampled
   * @param options.extractUserId - Pulls the user ID out of the operation's arguments
   * @param options.logger - Optional logger for debug output
   */
  constructor(options: {
    baselineSampleRate?: number;
    alwaysSampleUsers?: string[];
    extractUserId: (args: unknown[]) => string | undefined;
    logger?: Logger;
  }) {
    this.baselineSampleRate = options.baselineSampleRate ?? 0.1;
    this.alwaysSampleUsers = new Set(options.alwaysSampleUsers || []);
    this.extractUserId = options.extractUserId;
    this.logger = options.logger;
  }

  /**
   * Decide whether to trace the operation based on the user it belongs to.
   *
   * @param context - Sampling context; `context.args` is handed to `extractUserId`
   * @returns true to trace, false to skip
   */
  shouldSample(context: SamplingContext): boolean {
    const userId = this.extractUserId(context.args);

    // Always sample specific users
    if (userId && this.alwaysSampleUsers.has(userId)) {
      this.logger?.debug(
        {
          operation: context.operationName,
          userId,
        },
        'Sampling user request',
      );
      return true;
    }

    // For consistent per-user sampling, hash the user ID
    if (userId) {
      return this.hashString(userId) < this.baselineSampleRate;
    }

    // Fallback to random sampling if no user ID (undefined or empty string)
    return Math.random() < this.baselineSampleRate;
  }

  /**
   * Add user IDs to always-sample list
   */
  addAlwaysSampleUsers(...userIds: string[]): void {
    for (const userId of userIds) {
      this.alwaysSampleUsers.add(userId);
    }
  }

  /**
   * Remove user IDs from always-sample list
   */
  removeAlwaysSampleUsers(...userIds: string[]): void {
    for (const userId of userIds) {
      this.alwaysSampleUsers.delete(userId);
    }
  }

  /**
   * Simple hash function for consistent user sampling
   *
   * Computes a 32-bit `hash * 31 + char` rolling hash and maps it into the
   * half-open range [0, 1), mirroring the range of `Math.random()`.
   *
   * @returns a deterministic value in [0, 1) for the given string
   */
  private hashString(str: string): number {
    let hash = 0;
    for (let i = 0; i < str.length; i++) {
      const char = str.codePointAt(i) ?? 0;
      hash = (hash << 5) - hash + char;
      hash = hash & hash; // Convert to 32-bit integer
    }
    // |hash| can be as large as 2^31 (for -2^31), which previously normalized
    // to a value >= 1 and made such users unsampleable even at a rate of 1.
    // Clamp the magnitude so the result stays strictly below 1; every other
    // hash keeps exactly the value it had before.
    return Math.min(Math.abs(hash), 2_147_483_646) / 2_147_483_647;
  }
}
/**
 * Composite sampler that combines multiple samplers
 *
 * Samples if ANY of the child samplers returns true.
 *
 * Tail sampling is forwarded: when at least one child needs tail sampling, the
 * composite reports `needsTailSampling() === true` and `shouldKeepTrace()`
 * keeps the trace if any head-only child sampled it up front OR any
 * tail-sampling child decides to keep it afterwards. Without this, a child
 * such as AdaptiveSampler (whose `shouldSample()` always returns true) would
 * turn the composite into a 100% sampler that is never re-evaluated.
 *
 * @example
 * ```typescript
 * new CompositeSampler([
 *   new UserIdSampler({ extractUserId: (args) => (args[0] as { userId?: string } | undefined)?.userId }),
 *   new AdaptiveSampler({ baselineSampleRate: 0.1 })
 * ])
 * ```
 */
export class CompositeSampler implements Sampler {
  // Whether any head-only child voted to sample, keyed by the context's args
  // array (the same key AdaptiveSampler uses) for lookup in shouldKeepTrace().
  private readonly headDecisions = new WeakMap<unknown[], boolean>();

  /**
   * @param samplers - Child samplers, consulted in order
   * @throws Error if `samplers` is empty
   */
  constructor(private readonly samplers: Sampler[]) {
    if (samplers.length === 0) {
      throw new Error('CompositeSampler requires at least one child sampler');
    }
  }

  /**
   * @returns true if at least one child sampler performs tail sampling
   */
  needsTailSampling(): boolean {
    return this.samplers.some((sampler) => this.isTailSampler(sampler));
  }

  /**
   * Decide whether a span should be created for this operation.
   *
   * @param context - Sampling context, passed unchanged to the children
   * @returns true if any child sampler returns true
   */
  shouldSample(context: SamplingContext): boolean {
    if (!this.needsTailSampling()) {
      // Pure head sampling: stop at the first child that says yes.
      return this.samplers.some((sampler) => sampler.shouldSample(context));
    }

    // With tail sampling every child must be consulted (no short-circuit) so
    // that tail-sampling children get to record their own per-call state.
    let createSpan = false;
    let keptByHeadSampler = false;
    for (const sampler of this.samplers) {
      if (sampler.shouldSample(context)) {
        createSpan = true;
        if (!this.isTailSampler(sampler)) {
          keptByHeadSampler = true;
        }
      }
    }
    this.headDecisions.set(context.args, keptByHeadSampler);
    return createSpan;
  }

  /**
   * Re-evaluate the decision after the operation completes.
   *
   * @param context - Sampling context (must carry the same `args` array that
   *   was passed to shouldSample() for head decisions to be found)
   * @param result - Operation result
   * @returns true if a head-only child sampled the operation or any
   *   tail-sampling child wants to keep it
   */
  shouldKeepTrace(context: SamplingContext, result: OperationResult): boolean {
    if (this.headDecisions.get(context.args) === true) {
      return true;
    }
    return this.samplers.some(
      (sampler) =>
        this.isTailSampler(sampler) &&
        sampler.shouldKeepTrace?.(context, result) === true,
    );
  }

  /** A child takes part in tail sampling only if it asks for it and can answer. */
  private isTailSampler(sampler: Sampler): boolean {
    return (
      sampler.needsTailSampling?.() === true &&
      typeof sampler.shouldKeepTrace === 'function'
    );
  }
}
/**
 * Sampler driven by feature flags.
 *
 * Any request that has at least one monitored flag enabled is always sampled,
 * which makes it easy to correlate A/B experiments with traces. All other
 * requests are sampled randomly at the baseline rate.
 *
 * @example
 * ```typescript
 * new FeatureFlagSampler({
 *   baselineSampleRate: 0.01,
 *   alwaysSampleFlags: ['new_checkout', 'experimental_ui'],
 *   extractFlags: (args, metadata) => metadata?.featureFlags
 * })
 * ```
 */
export class FeatureFlagSampler implements Sampler {
  private baselineSampleRate: number;
  private alwaysSampleFlags: Set<string>;
  private extractFlags: (
    args: unknown[],
    metadata?: Record<string, unknown>,
  ) => string[] | undefined;
  private logger?: Logger;

  /**
   * @param options.baselineSampleRate - Rate for requests without a monitored flag; default 0.1
   * @param options.alwaysSampleFlags - Flags whose presence forces sampling
   * @param options.extractFlags - Returns the flags active for a request
   * @param options.logger - Optional logger for debug output
   */
  constructor(options: {
    baselineSampleRate?: number;
    alwaysSampleFlags?: string[];
    extractFlags: (
      args: unknown[],
      metadata?: Record<string, unknown>,
    ) => string[] | undefined;
    logger?: Logger;
  }) {
    const { baselineSampleRate, alwaysSampleFlags, extractFlags, logger } =
      options;
    this.baselineSampleRate = baselineSampleRate ?? 0.1;
    this.alwaysSampleFlags = new Set(alwaysSampleFlags || []);
    this.extractFlags = extractFlags;
    this.logger = logger;
  }

  /**
   * @param context - Sampling context; args and metadata are handed to `extractFlags`
   * @returns true to trace, false to skip
   */
  shouldSample(context: SamplingContext): boolean {
    const activeFlags = this.extractFlags(context.args, context.metadata);
    const hitsMonitoredFlag = activeFlags
      ? activeFlags.some((name) => this.alwaysSampleFlags.has(name))
      : false;

    if (!hitsMonitoredFlag) {
      // No monitored flag is active: plain random sampling
      return Math.random() < this.baselineSampleRate;
    }

    this.logger?.debug(
      { operation: context.operationName, flags: activeFlags },
      'Sampling feature flag request',
    );
    return true;
  }

  /**
   * Start always-sampling the given feature flags.
   */
  addAlwaysSampleFlags(...flags: string[]): void {
    flags.forEach((name) => {
      this.alwaysSampleFlags.add(name);
    });
  }

  /**
   * Stop always-sampling the given feature flags.
   */
  removeAlwaysSampleFlags(...flags: string[]): void {
    flags.forEach((name) => {
      this.alwaysSampleFlags.delete(name);
    });
  }
}
// ============================================================================
// Sampling Presets
// ============================================================================
/**
* Named sampling presets for common environments.
* Use with `init({ sampling: 'production' })` or directly via factories.
*
* `resolveSamplingPreset()` maps each name to a factory on `samplingPresets`:
* - `'development'` → `samplingPresets.development()`
* - `'errors-only'` → `samplingPresets.errorsOnly()`
* - `'production'` → `samplingPresets.production()`
* - `'off'` → `samplingPresets.off()`
*/
export type SamplingPreset =
| 'development'
| 'errors-only'
| 'production'
| 'off';
/**
 * Factories for the built-in sampling presets.
 *
 * Most users only need the string shorthand on `init()`:
 * ```typescript
 * init({ service: 'my-app', sampling: 'production' })
 * ```
 *
 * Reach for a factory when a preset needs tuning:
 * ```typescript
 * init({ service: 'my-app', sampler: samplingPresets.production({ baselineSampleRate: 0.05 }) })
 * ```
 */
export const samplingPresets = {
  /** Trace everything — intended for local development and debugging */
  development: () => {
    return new AlwaysSampler();
  },

  /**
   * Only bad outcomes — baseline rate of zero with errors always kept.
   * Slow-request handling is not set here, so AdaptiveSampler's own default applies.
   */
  errorsOnly: () => {
    return new AdaptiveSampler({
      alwaysSampleErrors: true,
      baselineSampleRate: 0,
    });
  },

  /**
   * Balanced production defaults — 10% baseline plus all errors and slow traces
   * (slow meaning 1000 ms or more).
   *
   * @param overrides - Optional replacements for individual defaults; uses the
   *   same option names as AdaptiveSampler
   */
  production: (overrides?: {
    baselineSampleRate?: number;
    slowThresholdMs?: number;
    alwaysSampleErrors?: boolean;
    alwaysSampleSlow?: boolean;
  }) => {
    const defaults = {
      baselineSampleRate: 0.1,
      alwaysSampleErrors: true,
      alwaysSampleSlow: true,
      slowThresholdMs: 1000,
    };
    // Later spread wins, so caller-supplied values replace the defaults.
    return new AdaptiveSampler({ ...defaults, ...overrides });
  },

  /** Turn tracing off entirely */
  off: () => {
    return new NeverSampler();
  },
};
/**
 * Turn a preset name into a ready-to-use Sampler.
 * `init()` calls this internally when its `sampling` option is a string.
 *
 * @param preset - One of the SamplingPreset names
 * @returns a new sampler built by the matching `samplingPresets` factory
 * @throws Error if preset is not recognized
 */
export function resolveSamplingPreset(preset: SamplingPreset): Sampler {
  if (preset === 'development') {
    return samplingPresets.development();
  }
  if (preset === 'errors-only') {
    return samplingPresets.errorsOnly();
  }
  if (preset === 'production') {
    return samplingPresets.production();
  }
  if (preset === 'off') {
    return samplingPresets.off();
  }
  // Reachable only when an untyped caller passes a name outside the union.
  throw new Error(
    `Unknown sampling preset: "${preset}". Valid presets: development, errors-only, production, off`,
  );
}
// ============================================================================
// Link Helper Functions
// ============================================================================
/**
 * Create a Link from W3C trace context headers (e.g., from a message queue).
 *
 * This is useful for message consumers that need to link to the producer span.
 * The headers should contain at least a `traceparent` header in W3C format.
 * The header name is matched exactly first and then case-insensitively, so
 * `Traceparent` or `TRACEPARENT` are found as well.
 *
 * @param headers - Dictionary containing traceparent/tracestate headers
 * @param attributes - Optional attributes for the link
 * @returns Link object if context could be extracted; null when the header is
 *   missing, empty, not a string, malformed, or carries all-zero ids
 *
 * @example
 * ```typescript
 * // In a Kafka consumer
 * const headers = { traceparent: '00-abc123...-def456...-01' };
 * const link = createLinkFromHeaders(headers);
 * if (link) {
 *   // Use with tracer.startActiveSpan options or ctx.addLink()
 *   tracer.startActiveSpan('process.message', { links: [link] }, span => { ... });
 * }
 * ```
 */
export function createLinkFromHeaders(
  headers: Record<string, string>,
  attributes?: Attributes,
): Link | null {
  // Parse W3C traceparent header directly for reliability
  // Format: version-traceId-spanId-traceFlags (e.g., 00-abc123...-def456...-01)
  let traceparent: unknown = headers.traceparent;
  if (traceparent === undefined || traceparent === null) {
    // Header names are not always lower-cased by transports; fall back to a
    // case-insensitive search.
    const matchingKey = Object.keys(headers).find(
      (name) => name.toLowerCase() === 'traceparent',
    );
    traceparent = matchingKey === undefined ? undefined : headers[matchingKey];
  }

  // Callers may hand over loosely typed header bags (e.g. binary header
  // values); anything that is not a non-empty string cannot be parsed.
  if (typeof traceparent !== 'string' || traceparent.length === 0) {
    return null;
  }

  const spanContext = parseTraceparent(traceparent);
  if (!spanContext || !isValidSpanContext(spanContext)) {
    return null;
  }

  return {
    context: spanContext,
    attributes: attributes ?? {},
  };
}
/**
 * Extract Links from a batch of messages for fan-in scenarios.
 *
 * Useful for batch processing where multiple producer spans should be linked.
 * This enables tracing causality in event-driven architectures where a single
 * consumer processes messages from multiple producers.
 *
 * Each link carries a `messaging.batch.message_index` attribute holding the
 * position of its source message within `messages`, so a link can be traced
 * back to its message even when other messages in the batch were skipped.
 *
 * @param messages - List of message objects
 * @param headersKey - Key in each message containing trace headers (default: 'headers')
 * @returns List of Link objects for all valid trace contexts; messages whose
 *   headers are missing, not an object, or hold no valid trace context are skipped
 *
 * @example
 * ```typescript
 * // Processing a batch of SQS/Kafka messages
 * const messages = [
 *   { body: '...', headers: { traceparent: '...' } },
 *   { body: '...', headers: { traceparent: '...' } },
 * ];
 * const links = extractLinksFromBatch(messages);
 *
 * tracer.startActiveSpan('process.batch', { links }, span => {
 *   for (const msg of messages) {
 *     processMessage(msg);
 *   }
 * });
 * ```
 */
export function extractLinksFromBatch(
  messages: Array<{ [key: string]: unknown }>,
  headersKey: string = 'headers',
): Link[] {
  const links: Link[] = [];

  messages.forEach((message, messageIndex) => {
    const candidateHeaders = message[headersKey];
    if (typeof candidateHeaders !== 'object' || candidateHeaders === null) {
      return;
    }

    const link = createLinkFromHeaders(
      candidateHeaders as Record<string, string>,
      {
        // Index of the message in the batch, not of the link in the result:
        // the two diverge as soon as one message is skipped.
        'messaging.batch.message_index': messageIndex,
      },
    );
    if (link) {
      links.push(link);
    }
  });

  return links;
}
/**
 * Parse W3C traceparent header into SpanContext
 * Format: version-traceId-spanId-traceFlags (e.g., 00-abc123...-def456...-01)
 *
 * Parsing rules:
 * - surrounding whitespace is ignored and hex digits are normalized to
 *   lowercase, so the returned ids are always lowercase
 * - version `ff` is rejected in any letter case
 * - version `00` must consist of exactly the four fields
 * - any other version may carry additional `-`-separated fields after the
 *   flags, which are ignored (forward compatibility)
 *
 * @param traceparent - Raw header value
 * @returns the remote SpanContext, or null if the header is malformed
 * @see https://www.w3.org/TR/trace-context/#traceparent-header
 */
function parseTraceparent(
  traceparent: string,
): import('@opentelemetry/api').SpanContext | null {
  // W3C traceparent format: version-traceId-parentId-traceFlags[-future fields]
  // Example: 00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01
  const TRACEPARENT_REGEX =
    /^([0-9a-f]{2})-([0-9a-f]{32})-([0-9a-f]{16})-([0-9a-f]{2})(-.*)?$/;

  // Lower-casing up front makes the version checks below case-proof and keeps
  // the returned ids in canonical lowercase form.
  const match = TRACEPARENT_REGEX.exec(traceparent.trim().toLowerCase());
  if (!match) {
    return null;
  }

  const [, version, traceId, spanId, flags, futureFields] = match;

  // Validate all parts are present (TypeScript narrowing)
  if (!version || !traceId || !spanId || !flags) {
    return null;
  }

  // Version ff is invalid according to spec
  if (version === 'ff') {
    return null;
  }

  // Version 00 defines exactly four fields; only later versions may append more.
  if (version === '00' && futureFields !== undefined) {
    return null;
  }

  return {
    traceId,
    spanId,
    traceFlags: Number.parseInt(flags, 16),
    isRemote: true,
  };
}
/**
 * Type guard telling whether a SpanContext is usable.
 *
 * A context is rejected when it is null/absent, when its trace id is the
 * 32-character all-zero id, or when its span id is the 16-character all-zero id.
 *
 * @param spanContext - Context to check, possibly null
 * @returns true if the context is present and both ids are non-zero
 */
function isValidSpanContext(
  spanContext: import('@opentelemetry/api').SpanContext | null,
): spanContext is import('@opentelemetry/api').SpanContext {
  if (!spanContext) {
    return false;
  }

  const allZeroTraceId = '0'.repeat(32);
  if (spanContext.traceId === allZeroTraceId) {
    return false;
  }

  const allZeroSpanId = '0'.repeat(16);
  return spanContext.spanId !== allZeroSpanId;
}