UNPKG

@graphql-yoga/plugin-apollo-usage-report

Version:

Apollo's GraphOS usage report plugin for GraphQL Yoga.

540 lines (539 loc) • 23.8 kB
// Copied from https://github.com/apollographql/apollo-server/blob/8c6579e5b61276b62dc7e30e6fac9a4242e24daa/packages/server/src/plugin/usageReporting/stats.ts
/* eslint-disable */
import { Trace } from '@apollo/usage-reporting-protobuf';

// protobuf.js exports both a class and an interface (starting with I) for each
// message type. The class is what it produces when it decodes the message; the
// interface is what is accepted as input. We build up our messages using custom
// types implementing the interfaces, so that we can take advantage of the
// js_use_toArray option we added to our protobuf.js fork which allows us to use
// classes like DurationHistogram to generate repeated fields. We end up
// re-creating most of the report structure as custom classes (starting with
// "Our"). Using our own classes has other advantages, like being able to
// specify that nested messages are instances of the same class rather than the
// interface type and thus that they have non-null fields (because the
// interface type allows all fields to be optional, even though the protobuf
// format doesn't differentiate between missing and falsey).

/** Mutable byte counter passed by reference so nested helpers can grow it. */
export class SizeEstimator {
  bytes = 0;
}

/** In-memory accumulator for one Apollo usage report. */
export class OurReport {
  header;
  // Apollo Server includes each operation either as aggregated stats or as a
  // trace, but not both. Other reporting agents such as Apollo Router include
  // all operations in stats (even those that are sent as traces), and they set
  // this flag to true.
  tracesPreAggregated = false;
  constructor(header) {
    this.header = header;
  }
  tracesPerQuery = Object.create(null);
  endTime = null;
  operationCount = 0;
  // A rough estimate of the number of bytes currently in the report. We start
  // at zero and don't count `header` and `endTime`, which have the same size
  // for every report. This really is a rough estimate, so we don't stress too
  // much about counting bytes for the tags and string/message lengths, etc:
  // we mostly just count the lengths of strings plus some estimates for the
  // messages with a bunch of numbers in them.
  //
  // We store this in a class so we can pass it down as a reference to other
  // methods which increment it.
  sizeEstimator = new SizeEstimator();

  ensureCountsAreIntegers() {
    for (const tracesAndStats of Object.values(this.tracesPerQuery)) {
      tracesAndStats.ensureCountsAreIntegers();
    }
  }

  /**
   * Record one operation, either as a full encoded trace or folded into
   * aggregated stats, updating the report size estimate as we go.
   */
  addTrace({
    statsReportKey,
    trace,
    asTrace,
    referencedFieldsByType,
    // The max size a trace can be before it is sent as stats. Note that the
    // Apollo reporting ingress server will never store any traces over 10mb
    // anyway. They will still be converted to stats as we would do here.
    maxTraceBytes = 10 * 1024 * 1024,
    nonFtv1ErrorPaths,
  }) {
    const tracesAndStats = this.getTracesAndStats({
      statsReportKey,
      referencedFieldsByType,
    });
    if (asTrace) {
      const encodedTrace = Trace.encode(trace).finish();
      // Oversized traces are demoted to stats rather than dropped.
      if (!Number.isNaN(maxTraceBytes) && encodedTrace.length > maxTraceBytes) {
        tracesAndStats.statsWithContext.addTrace(trace, this.sizeEstimator, nonFtv1ErrorPaths);
      } else {
        tracesAndStats.trace.push(encodedTrace);
        this.sizeEstimator.bytes += 2 + encodedTrace.length;
      }
    } else {
      tracesAndStats.statsWithContext.addTrace(trace, this.sizeEstimator, nonFtv1ErrorPaths);
    }
  }

  /** Find or create the per-operation bucket keyed by `statsReportKey`. */
  getTracesAndStats({ statsReportKey, referencedFieldsByType }) {
    const existing = this.tracesPerQuery[statsReportKey];
    if (existing) {
      return existing;
    }
    this.sizeEstimator.bytes += estimatedBytesForString(statsReportKey);
    // Update the size estimator for the referenced field structure.
    for (const [typeName, referencedFieldsForType] of Object.entries(referencedFieldsByType)) {
      // Two bytes each for the map entry and for the ReferencedFieldsForType,
      // and for the isInterface bool if it's set.
      this.sizeEstimator.bytes += 2 + 2;
      if (referencedFieldsForType.isInterface) {
        this.sizeEstimator.bytes += 2;
      }
      this.sizeEstimator.bytes += estimatedBytesForString(typeName);
      for (const fieldName of referencedFieldsForType.fieldNames) {
        this.sizeEstimator.bytes += estimatedBytesForString(fieldName);
      }
    }
    // Include the referenced fields map in the report. (In an ideal world we
    // could have a slightly more sophisticated protocol and ingestion pipeline
    // that allowed us to only have to send this data once for each
    // schema/operation pair.)
    return (this.tracesPerQuery[statsReportKey] = new OurTracesAndStats(referencedFieldsByType));
  }
}

/** Per-operation container: raw encoded traces plus aggregated stats. */
class OurTracesAndStats {
  referencedFieldsByType;
  constructor(referencedFieldsByType) {
    this.referencedFieldsByType = referencedFieldsByType;
  }
  trace = [];
  statsWithContext = new StatsByContext();
  internalTracesContributingToStats = [];
  ensureCountsAreIntegers() {
    this.statsWithContext.ensureCountsAreIntegers();
  }
}

/** Aggregated stats bucketed by (clientName, clientVersion) context. */
class StatsByContext {
  map = Object.create(null);

  /**
   * This function is used by the protobuf generator to convert this map into
   * an array of contextualized stats to serialize.
   */
  toArray() {
    return Object.values(this.map);
  }

  ensureCountsAreIntegers() {
    for (const contextualizedStats of Object.values(this.map)) {
      contextualizedStats.ensureCountsAreIntegers();
    }
  }

  addTrace(trace, sizeEstimator, nonFtv1ErrorPaths) {
    this.getContextualizedStats(trace, sizeEstimator).addTrace(trace, sizeEstimator, nonFtv1ErrorPaths);
  }

  getContextualizedStats(trace, sizeEstimator) {
    const statsContext = {
      clientName: trace.clientName,
      clientVersion: trace.clientVersion,
    };
    const statsContextKey = JSON.stringify(statsContext);
    const existing = this.map[statsContextKey];
    if (existing) {
      return existing;
    }
    // Adding a ContextualizedStats means adding a StatsContext plus a
    // QueryLatencyStats. Let's guess about 20 bytes for a QueryLatencyStats;
    // it'll be more if more features are used (like cache, APQ, etc).
    sizeEstimator.bytes +=
      20 + estimatedBytesForString(trace.clientName) + estimatedBytesForString(trace.clientVersion);
    const contextualizedStats = new OurContextualizedStats(statsContext);
    this.map[statsContextKey] = contextualizedStats;
    return contextualizedStats;
  }
}

/** Stats for one client context: latency, cache, error, and field stats. */
export class OurContextualizedStats {
  context;
  queryLatencyStats = new OurQueryLatencyStats();
  perTypeStat = Object.create(null);
  constructor(context) {
    this.context = context;
  }

  ensureCountsAreIntegers() {
    for (const typeStat of Object.values(this.perTypeStat)) {
      typeStat.ensureCountsAreIntegers();
    }
  }

  // Extract statistics from the trace, and increment the estimated report size.
  // We only add to the estimate when adding whole sub-messages. If it really
  // mattered, we could do a lot more careful things like incrementing it
  // whenever a numeric field on queryLatencyStats gets incremented over 0.
  addTrace(trace, sizeEstimator, nonFtv1ErrorPaths = []) {
    const { fieldExecutionWeight } = trace;
    if (!fieldExecutionWeight) {
      this.queryLatencyStats.requestsWithoutFieldInstrumentation++;
    }
    this.queryLatencyStats.requestCount++;
    if (trace.fullQueryCacheHit) {
      this.queryLatencyStats.cacheLatencyCount.incrementDuration(trace.durationNs);
      this.queryLatencyStats.cacheHits++;
    } else {
      this.queryLatencyStats.latencyCount.incrementDuration(trace.durationNs);
    }
    // We only provide stats about cache TTLs on cache misses (ie, TTLs directly
    // calculated by the backend), not for cache hits. This matches the
    // behavior we've had for a while when converting traces into statistics
    // in Studio's servers.
    if (!trace.fullQueryCacheHit && trace.cachePolicy?.maxAgeNs != null) {
      switch (trace.cachePolicy.scope) {
        case Trace.CachePolicy.Scope.PRIVATE:
          this.queryLatencyStats.privateCacheTtlCount.incrementDuration(trace.cachePolicy.maxAgeNs);
          break;
        case Trace.CachePolicy.Scope.PUBLIC:
          this.queryLatencyStats.publicCacheTtlCount.incrementDuration(trace.cachePolicy.maxAgeNs);
          break;
      }
    }
    if (trace.persistedQueryHit) {
      this.queryLatencyStats.persistedQueryHits++;
    }
    if (trace.persistedQueryRegister) {
      this.queryLatencyStats.persistedQueryMisses++;
    }
    if (trace.forbiddenOperation) {
      this.queryLatencyStats.forbiddenOperationCount++;
    }
    if (trace.registeredOperation) {
      this.queryLatencyStats.registeredOperationCount++;
    }

    let hasError = false;
    const errorPathStats = new Set();
    const traceNodeStats = (node, path) => {
      // Generate error stats and error path information
      if (node.error?.length) {
        hasError = true;
        let currPathErrorStats = this.queryLatencyStats.rootErrorStats;
        path.toArray().forEach(subPath => {
          currPathErrorStats = currPathErrorStats.getChild(subPath, sizeEstimator);
        });
        errorPathStats.add(currPathErrorStats);
        currPathErrorStats.errorsCount += node.error.length;
      }
      if (fieldExecutionWeight) {
        // The actual field name behind the node; originalFieldName is set
        // if an alias was used, otherwise responseName. (This is falsey for
        // nodes that are not fields (root, array index, etc).)
        const fieldName = node.originalFieldName || node.responseName;
        // Protobuf doesn't really differentiate between "unset" and "falsey" so
        // we're mostly actually checking that these things are non-empty string /
        // non-zero numbers. The time fields represent the number of nanoseconds
        // since the beginning of the entire trace, so let's pretend for the
        // moment that it's plausible for a node to start or even end exactly when
        // the trace started (ie, for the time values to be 0). This is unlikely
        // in practice (everything should take at least 1ns). In practice we only
        // write `type` and `parentType` on a Node when we write `startTime`, so
        // the main thing we're looking out for by checking the time values is
        // whether we somehow failed to write `endTime` at the end of the field;
        // in this case, the `endTime >= startTime` check won't match.
        if (
          node.parentType &&
          fieldName &&
          node.type &&
          node.endTime != null &&
          node.startTime != null &&
          node.endTime >= node.startTime
        ) {
          const typeStat = this.getTypeStat(node.parentType, sizeEstimator);
          const fieldStat = typeStat.getFieldStat(fieldName, node.type, sizeEstimator);
          fieldStat.errorsCount += node.error?.length ?? 0;
          fieldStat.observedExecutionCount++;
          fieldStat.estimatedExecutionCount += fieldExecutionWeight;
          // Note: this is actually counting the number of resolver calls for this
          // field that had at least one error, not the number of overall GraphQL
          // queries that had at least one error for this field. That doesn't seem
          // to match the name, but it does match the other implementations of this
          // logic.
          fieldStat.requestsWithErrorsCount += (node.error?.length ?? 0) > 0 ? 1 : 0;
          fieldStat.latencyCount.incrementDuration(
            node.endTime - node.startTime,
            // The latency histogram is always "estimated"; we don't track
            // "observed" and "estimated" separately.
            fieldExecutionWeight,
          );
        }
      }
      return false;
    };

    iterateOverTrace(trace, traceNodeStats, true);

    // iterate over nonFtv1ErrorPaths, using some bits from traceNodeStats function
    for (const { subgraph, path } of nonFtv1ErrorPaths) {
      hasError = true;
      if (path) {
        let currPathErrorStats = this.queryLatencyStats.rootErrorStats.getChild(
          `service:${subgraph}`,
          sizeEstimator,
        );
        path.forEach(subPath => {
          if (typeof subPath === 'string') {
            currPathErrorStats = currPathErrorStats.getChild(subPath, sizeEstimator);
          }
        });
        errorPathStats.add(currPathErrorStats);
        currPathErrorStats.errorsCount += 1;
      }
    }

    for (const errorPath of errorPathStats) {
      errorPath.requestsWithErrorsCount += 1;
    }
    if (hasError) {
      this.queryLatencyStats.requestsWithErrorsCount++;
    }
  }

  getTypeStat(parentType, sizeEstimator) {
    const existing = this.perTypeStat[parentType];
    if (existing) {
      return existing;
    }
    sizeEstimator.bytes += estimatedBytesForString(parentType);
    const typeStat = new OurTypeStat();
    this.perTypeStat[parentType] = typeStat;
    return typeStat;
  }
}

class OurQueryLatencyStats {
  latencyCount = new DurationHistogram();
  requestCount = 0;
  requestsWithoutFieldInstrumentation = 0;
  cacheHits = 0;
  persistedQueryHits = 0;
  persistedQueryMisses = 0;
  cacheLatencyCount = new DurationHistogram();
  rootErrorStats = new OurPathErrorStats();
  requestsWithErrorsCount = 0;
  publicCacheTtlCount = new DurationHistogram();
  privateCacheTtlCount = new DurationHistogram();
  registeredOperationCount = 0;
  forbiddenOperationCount = 0;
}

/** Trie of error counts keyed by response-path segment. */
class OurPathErrorStats {
  children = Object.create(null);
  errorsCount = 0;
  requestsWithErrorsCount = 0;

  getChild(subPath, sizeEstimator) {
    const existing = this.children[subPath];
    if (existing) {
      return existing;
    }
    const child = new OurPathErrorStats();
    this.children[subPath] = child;
    // Include a few bytes in the estimate for the numbers etc.
    sizeEstimator.bytes += estimatedBytesForString(subPath) + 4;
    return child;
  }
}

class OurTypeStat {
  perFieldStat = Object.create(null);

  getFieldStat(fieldName, returnType, sizeEstimator) {
    const existing = this.perFieldStat[fieldName];
    if (existing) {
      return existing;
    }
    // Rough estimate of 10 bytes for the numbers in the FieldStat.
    sizeEstimator.bytes +=
      estimatedBytesForString(fieldName) + estimatedBytesForString(returnType) + 10;
    const fieldStat = new OurFieldStat(returnType);
    this.perFieldStat[fieldName] = fieldStat;
    return fieldStat;
  }

  ensureCountsAreIntegers() {
    for (const fieldStat of Object.values(this.perFieldStat)) {
      fieldStat.ensureCountsAreIntegers();
    }
  }
}

class OurFieldStat {
  returnType;
  errorsCount = 0;
  observedExecutionCount = 0;
  // Note that this number isn't necessarily an integer while it is being
  // aggregated. Before encoding as a protobuf we call ensureCountsAreIntegers
  // which floors it.
  estimatedExecutionCount = 0;
  requestsWithErrorsCount = 0;
  latencyCount = new DurationHistogram();
  constructor(returnType) {
    this.returnType = returnType;
  }
  ensureCountsAreIntegers() {
    // This is the only one that ever can receive non-integers.
    this.estimatedExecutionCount = Math.floor(this.estimatedExecutionCount);
  }
}

function estimatedBytesForString(s) {
  // 2 is for the tag (field ID + wire type) plus the encoded length. (The
  // encoded length takes up more than 1 byte for strings that are longer than
  // 127 bytes, but this is an estimate.)
  return 2 + Buffer.byteLength(s);
}

/** Exponentially-bucketed duration histogram matching Apollo's wire format. */
export class DurationHistogram {
  // Note that it's legal for the values in "buckets" to be non-integers; they
  // will be floored by toArray (which is called by the protobuf encoder).
  // (We take advantage of this for field latencies specifically, because
  // the ability to return a non-1 weight from fieldLevelInstrumentation
  // means we want to build up our histograms as floating-point rather than
  // rounding after every operation.)
  buckets;
  static BUCKET_COUNT = 384;
  static EXPONENT_LOG = Math.log(1.1);

  // Serialize to the run-length-encoded wire form: runs of >1 zeros become a
  // single negative count; a lone zero is emitted as 0.
  toArray() {
    let bufferedZeroes = 0;
    const outputArray = [];
    for (const value of this.buckets) {
      if (value === 0) {
        bufferedZeroes++;
      } else {
        if (bufferedZeroes === 1) {
          outputArray.push(0);
        } else if (bufferedZeroes !== 0) {
          outputArray.push(-bufferedZeroes);
        }
        outputArray.push(Math.floor(value));
        bufferedZeroes = 0;
      }
    }
    return outputArray;
  }

  static durationToBucket(durationNs) {
    const log = Math.log(durationNs / 1000.0);
    const unboundedBucket = Math.ceil(log / DurationHistogram.EXPONENT_LOG);
    // Compare <= 0 to catch -0 and -infinity
    return unboundedBucket <= 0 || Number.isNaN(unboundedBucket)
      ? 0
      : unboundedBucket >= DurationHistogram.BUCKET_COUNT
        ? DurationHistogram.BUCKET_COUNT - 1
        : unboundedBucket;
  }

  incrementDuration(durationNs, value = 1) {
    this.incrementBucket(DurationHistogram.durationToBucket(durationNs), value);
    return this;
  }

  incrementBucket(bucket, value = 1) {
    if (bucket >= DurationHistogram.BUCKET_COUNT) {
      // Since we don't have fixed size arrays I'd rather throw the error manually
      throw Error('Bucket is out of bounds of the buckets array');
    }
    // Extend the array if we haven't gotten it long enough to handle the new bucket
    if (bucket >= this.buckets.length) {
      const oldLength = this.buckets.length;
      this.buckets.length = bucket + 1;
      this.buckets.fill(0, oldLength);
    }
    this.buckets[bucket] += value;
  }

  combine(otherHistogram) {
    for (let i = 0; i < otherHistogram.buckets.length; i++) {
      this.incrementBucket(i, otherHistogram.buckets[i]);
    }
  }

  constructor(options) {
    const initSize = options?.initSize || 74;
    const buckets = options?.buckets;
    const arrayInitSize = Math.max(buckets?.length || 0, initSize);
    this.buckets = Array(arrayInitSize).fill(0);
    if (buckets) {
      buckets.forEach((val, index) => (this.buckets[index] = val));
    }
  }
}

/**
 * Iterates over the entire trace, calling `f` on each Trace.Node found. It
 * looks under the "root" node as well as any inside the query plan. If any `f`
 * returns true, it stops walking the tree.
 *
 * Each call to `f` will receive an object that implements ResponseNamePath. If
 * `includePath` is true, `f` can call `toArray()` on it to convert the
 * linked-list representation to an array of the response name (field name)
 * nodes that you navigate to get to the node (including a "service:subgraph"
 * top-level node if this is a federated trace). Note that we don't add anything
 * to the path for index (list element) nodes. This is because the only use case
 * we have (error path statistics) does not care about list indexes (it's not
 * that interesting to know that sometimes an error was at foo.3.bar and
 * sometimes foo.5.bar, vs just generally foo.bar).
 *
 * If `includePath` is false, we don't bother to build up the linked lists, and
 * calling `toArray()` will throw.
 */
export function iterateOverTrace(trace, f, includePath) {
  const rootPath = includePath
    ? new RootCollectingPathsResponseNamePath()
    : notCollectingPathsResponseNamePath;
  if (trace.root) {
    if (iterateOverTraceNode(trace.root, rootPath, f)) return;
  }
  if (trace.queryPlan) {
    if (iterateOverQueryPlan(trace.queryPlan, rootPath, f)) return;
  }
}

// Helper for iterateOverTrace; returns true to stop the overall walk.
function iterateOverQueryPlan(node, rootPath, f) {
  if (!node) return false;
  if (node.fetch?.trace?.root && node.fetch.serviceName) {
    return iterateOverTraceNode(
      node.fetch.trace.root,
      rootPath.child(`service:${node.fetch.serviceName}`),
      f,
    );
  }
  if (node.flatten?.node) {
    return iterateOverQueryPlan(node.flatten.node, rootPath, f);
  }
  if (node.parallel?.nodes) {
    // We want to stop as soon as some call returns true, which happens to be
    // exactly what 'some' does.
    return node.parallel.nodes.some(node => iterateOverQueryPlan(node, rootPath, f));
  }
  if (node.sequence?.nodes) {
    // We want to stop as soon as some call returns true, which happens to be
    // exactly what 'some' does.
    return node.sequence.nodes.some(node => iterateOverQueryPlan(node, rootPath, f));
  }
  return false;
}

// Helper for iterateOverTrace; returns true to stop the overall walk.
function iterateOverTraceNode(node, path, f) {
  // Invoke the function; if it returns true, don't descend and tell callers to
  // stop walking.
  if (f(node, path)) {
    return true;
  }
  return (
    // We want to stop as soon as some call returns true, which happens to be
    // exactly what 'some' does.
    node.child?.some(child => {
      const childPath = child.responseName ? path.child(child.responseName) : path;
      return iterateOverTraceNode(child, childPath, f);
    }) ?? false
  );
}

// Shared no-op path used when callers don't need path information.
const notCollectingPathsResponseNamePath = {
  toArray() {
    throw Error('not collecting paths!');
  },
  child() {
    return this;
  },
};

class RootCollectingPathsResponseNamePath {
  toArray() {
    return [];
  }
  child(responseName) {
    return new ChildCollectingPathsResponseNamePath(responseName, this);
  }
}

class ChildCollectingPathsResponseNamePath {
  responseName;
  prev;
  constructor(responseName, prev) {
    this.responseName = responseName;
    this.prev = prev;
  }
  toArray() {
    // Walk the linked list back to the root, then reverse for root-first order.
    const out = [];
    let curr = this;
    while (curr instanceof ChildCollectingPathsResponseNamePath) {
      out.push(curr.responseName);
      curr = curr.prev;
    }
    return out.reverse();
  }
  child(responseName) {
    return new ChildCollectingPathsResponseNamePath(responseName, this);
  }
}