UNPKG

dd-trace

Version:

Datadog APM tracing client for JavaScript

367 lines (299 loc) 12.3 kB
'use strict' // TODO: capture every second and flush every 10 seconds const v8 = require('v8') const os = require('os') const process = require('process') const { performance, PerformanceObserver, monitorEventLoopDelay } = require('perf_hooks') const { DogStatsDClient, MetricsAggregationClient } = require('../dogstatsd') const log = require('../log') const { getEnvironmentVariable } = require('../config-helper') const { NODE_MAJOR } = require('../../../../version') // TODO: This environment variable may not be changed, since the agent expects a flush every ten seconds. // It is only a variable for testing. Think about alternatives. const DD_RUNTIME_METRICS_FLUSH_INTERVAL = getEnvironmentVariable('DD_RUNTIME_METRICS_FLUSH_INTERVAL') ?? '10000' const INTERVAL = Number.parseInt(DD_RUNTIME_METRICS_FLUSH_INTERVAL, 10) const eventLoopDelayResolution = 4 let nativeMetrics = null let gcObserver = null let interval = null let client = null let lastTime = 0 let lastCpuUsage = null let eventLoopDelayObserver = null // !!!!!!!!!!! // IMPORTANT // !!!!!!!!!!! // // ALL metrics that relate to time are handled in nanoseconds in the backend. // https://github.com/DataDog/dogweb/blob/prod/integration/node/node_metadata.csv module.exports = { start (config) { this.stop() const clientConfig = DogStatsDClient.generateClientConfig(config) const trackEventLoop = config.runtimeMetrics.eventLoop !== false const trackGc = config.runtimeMetrics.gc !== false if (trackGc) { startGCObserver() } // Using no-gc prevents the native gc metrics from being tracked. Not // passing any options means all metrics are tracked. // TODO: This is a workaround. We should find a better solution. const watchers = trackEventLoop ? ['loop'] : ['no-gc'] try { nativeMetrics = require('@datadog/native-metrics') nativeMetrics.start(...watchers) } catch (error) { log.error('Error starting native metrics', error) nativeMetrics = null } client = new MetricsAggregationClient(new DogStatsDClient(clientConfig)) lastTime = performance.now() if (nativeMetrics) { interval = setInterval(() => { captureNativeMetrics(trackEventLoop, trackGc) captureCommonMetrics(trackEventLoop) client.flush() }, INTERVAL) } else { lastCpuUsage = process.cpuUsage() if (trackEventLoop) { eventLoopDelayObserver = monitorEventLoopDelay({ resolution: eventLoopDelayResolution }) eventLoopDelayObserver.enable() } interval = setInterval(() => { captureCpuUsage() captureCommonMetrics(trackEventLoop) captureHeapSpace() if (trackEventLoop) { // Experimental: The Node.js implementation deviates from the native metrics. // We normalize the metrics to the same format but the Node.js values // are that way lower than they should be, while they are still nearer // to the native ones that way. // We use these only as fallback values. captureEventLoopDelay() } client.flush() }, INTERVAL) } interval.unref() }, stop () { nativeMetrics?.stop() nativeMetrics = null clearInterval(interval) interval = null client = null lastCpuUsage = null gcObserver?.disconnect() gcObserver = null eventLoopDelayObserver?.disable() eventLoopDelayObserver = null }, track (span) { if (nativeMetrics) { const handle = nativeMetrics.track(span) return { finish: () => nativeMetrics.finish(handle) } } return { finish: () => {} } }, boolean (name, value, tag) { client?.boolean(name, value, tag) }, histogram (name, value, tag) { client?.histogram(name, value, tag) }, count (name, count, tag, monotonic = false) { client?.count(name, count, tag, monotonic) }, gauge (name, value, tag) { client?.gauge(name, value, tag) }, increment (name, tag, monotonic) { this.count(name, 1, tag, monotonic) }, decrement (name, tag) { this.count(name, -1, tag) } } function captureCpuUsage () { const currentCpuUsage = process.cpuUsage() const elapsedUsageUser = currentCpuUsage.user - lastCpuUsage.user const elapsedUsageSystem = currentCpuUsage.system - lastCpuUsage.system const currentTime = performance.now() // Milliseconds with decimal places const elapsedUsDividedBy100 = (currentTime - lastTime) * 10 const userPercent = elapsedUsageUser / elapsedUsDividedBy100 const systemPercent = elapsedUsageSystem / elapsedUsDividedBy100 const totalPercent = userPercent + systemPercent lastTime = currentTime lastCpuUsage = currentCpuUsage client.gauge('runtime.node.cpu.system', systemPercent.toFixed(2)) client.gauge('runtime.node.cpu.user', userPercent.toFixed(2)) client.gauge('runtime.node.cpu.total', totalPercent.toFixed(2)) } function captureMemoryUsage () { const stats = process.memoryUsage() client.gauge('runtime.node.mem.heap_total', stats.heapTotal) client.gauge('runtime.node.mem.heap_used', stats.heapUsed) client.gauge('runtime.node.mem.rss', stats.rss) client.gauge('runtime.node.mem.total', os.totalmem()) client.gauge('runtime.node.mem.free', os.freemem()) client.gauge('runtime.node.mem.external', stats.external) // TODO: Add arrayBuffers to the metrics. That also requires the // node_metadata.csv to be updated for the website. // // client.gauge('runtime.node.mem.arrayBuffers', stats.arrayBuffers) } function captureUptime () { // WARNING: lastTime must be updated in the same interval before this function is called! // This is a faster `process.uptime()`. client.gauge('runtime.node.process.uptime', Math.round((lastTime + 499) / 1000)) } function captureEventLoopDelay () { eventLoopDelayObserver.disable() if (eventLoopDelayObserver.count !== 0) { const minimum = eventLoopDelayResolution * 1e6 const avg = Math.max(eventLoopDelayObserver.mean - minimum, 0) const sum = Math.round(avg * eventLoopDelayObserver.count) if (sum !== 0) { // Normalize the metrics to the same format as the native metrics. const stats = { min: Math.max(eventLoopDelayObserver.min - minimum, 0), max: Math.max(eventLoopDelayObserver.max - minimum, 0), sum, total: sum, avg, count: eventLoopDelayObserver.count, p95: Math.max(eventLoopDelayObserver.percentile(95) - minimum, 0) } histogram('runtime.node.event_loop.delay', stats) } } eventLoopDelayObserver = monitorEventLoopDelay({ resolution: eventLoopDelayResolution }) eventLoopDelayObserver.enable() } function captureHeapStats () { const stats = v8.getHeapStatistics() client.gauge('runtime.node.heap.total_heap_size', stats.total_heap_size) client.gauge('runtime.node.heap.total_heap_size_executable', stats.total_heap_size_executable) client.gauge('runtime.node.heap.total_physical_size', stats.total_physical_size) client.gauge('runtime.node.heap.total_available_size', stats.total_available_size) client.gauge('runtime.node.heap.heap_size_limit', stats.heap_size_limit) client.gauge('runtime.node.heap.malloced_memory', stats.malloced_memory) client.gauge('runtime.node.heap.peak_malloced_memory', stats.peak_malloced_memory) // TODO: Add number_of_native_contexts and number_of_detached_contexts to the // metrics. Those metrics allow to identify memory leaks. Adding them also // requires the node_metadata.csv to be updated for the website. // // client.gauge('runtime.node.heap.number_of_native_contexts', stats.number_of_native_contexts) // client.gauge('runtime.node.heap.number_of_detached_contexts', stats.number_of_detached_contexts) } function captureHeapSpace () { const stats = v8.getHeapSpaceStatistics() for (let i = 0, l = stats.length; i < l; i++) { const tags = [`space:${stats[i].space_name}`] client.gauge('runtime.node.heap.size.by.space', stats[i].space_size, tags) client.gauge('runtime.node.heap.used_size.by.space', stats[i].space_used_size, tags) client.gauge('runtime.node.heap.available_size.by.space', stats[i].space_available_size, tags) client.gauge('runtime.node.heap.physical_size.by.space', stats[i].physical_space_size, tags) } } /** * Gathers and reports Event Loop Utilization (ELU) since last run, or from the * start of the process on first run. * * ELU is a measure of how busy the event loop is, like running JavaScript or * waiting on *Sync functions. The value is between 0 (idle) and 1 (exhausted). */ let lastElu = { idle: 0, active: 0 } function captureELU () { const elu = performance.eventLoopUtilization() const idle = elu.idle - lastElu.idle const active = elu.active - lastElu.active const utilization = active / (idle + active) lastElu = elu client.gauge('runtime.node.event_loop.utilization', utilization) } function captureCommonMetrics (trackEventLoop) { captureMemoryUsage() captureUptime() captureHeapStats() if (trackEventLoop) { captureELU() } } function captureNativeMetrics (trackEventLoop, trackGc) { const stats = nativeMetrics.stats() const spaces = stats.heap.spaces const currentTime = performance.now() // Milliseconds with decimal places const elapsedUsDividedBy100 = (currentTime - lastTime) * 10 lastTime = currentTime const userPercent = stats.cpu.user / elapsedUsDividedBy100 const systemPercent = stats.cpu.system / elapsedUsDividedBy100 const totalPercent = userPercent + systemPercent client.gauge('runtime.node.cpu.system', systemPercent.toFixed(2)) client.gauge('runtime.node.cpu.user', userPercent.toFixed(2)) client.gauge('runtime.node.cpu.total', totalPercent.toFixed(2)) if (trackEventLoop) { histogram('runtime.node.event_loop.delay', stats.eventLoop) } if (trackGc) { for (const [type, value] of Object.entries(stats.gc)) { if (type === 'all') { histogram('runtime.node.gc.pause', value) } else { histogram('runtime.node.gc.pause.by.type', value, `gc_type:${type}`) } } } for (let i = 0, l = spaces.length; i < l; i++) { const tag = `heap_space:${spaces[i].space_name}` client.gauge('runtime.node.heap.size.by.space', spaces[i].space_size, tag) client.gauge('runtime.node.heap.used_size.by.space', spaces[i].space_used_size, tag) client.gauge('runtime.node.heap.available_size.by.space', spaces[i].space_available_size, tag) client.gauge('runtime.node.heap.physical_size.by.space', spaces[i].physical_space_size, tag) } } function histogram (name, stats, tag) { if (stats.count === 0) { return } client.gauge(`${name}.min`, stats.min, tag) client.gauge(`${name}.max`, stats.max, tag) client.increment(`${name}.sum`, stats.sum, tag) client.increment(`${name}.total`, stats.sum, tag) client.gauge(`${name}.avg`, stats.avg, tag) client.increment(`${name}.count`, stats.count, tag) if (stats.median !== undefined) { // TODO: Consider adding the median to the Node.js histogram/adding stddev to native metrics. client.gauge(`${name}.median`, stats.median, tag) } client.gauge(`${name}.95percentile`, stats.p95, tag) } function startGCObserver () { if (gcObserver) return gcObserver = new PerformanceObserver(list => { for (const entry of list.getEntries()) { // @ts-expect-error - entry.detail?.kind and entry.kind are not typed const type = gcType(entry.detail?.kind || entry.kind) const duration = entry.duration * 1_000_000 // These are individual metrics for each type of GC. client.histogram('runtime.node.gc.pause.by.type', duration, `gc_type:${type}`) client.histogram('runtime.node.gc.pause', duration) } }) gcObserver.observe({ type: 'gc' }) } const minorGCType = NODE_MAJOR >= 22 ? 'minor_mark_sweep' : 'minor_mark_compact' function gcType (kind) { switch (kind) { case 1: return 'scavenge' case 2: return minorGCType case 4: return 'mark_sweep_compact' // Deprecated, might be removed soon. case 8: return 'incremental_marking' case 16: return 'process_weak_callbacks' case 31: return 'all' default: return 'unknown' } }