// @apollo/server

import type { NonFtv1ErrorPath } from '@apollo/server-gateway-interface';
import {
  type google,
  type IContextualizedStats,
  type IFieldStat,
  type IPathErrorStats,
  type IQueryLatencyStats,
  type IReport,
  type IStatsContext,
  type ITracesAndStats,
  type ITypeStat,
  type ReportHeader,
  Trace,
} from '@apollo/usage-reporting-protobuf';
import type { ReferencedFieldsByType } from '@apollo/utils.usagereporting';
import { DurationHistogram } from './durationHistogram.js';
import { iterateOverTrace, type ResponseNamePath } from './iterateOverTrace.js';

// protobuf.js exports both a class and an interface (starting with I) for each
// message type. The class is what it produces when it decodes the message; the
// interface is what is accepted as input. We build up our messages using custom
// types implementing the interfaces, so that we can take advantage of the
// js_use_toArray option we added to our protobuf.js fork which allows us to use
// classes like DurationHistogram to generate repeated fields. We end up
// re-creating most of the report structure as custom classes (starting with
// "Our"). TypeScript validates that we've properly listed all of the message
// fields with the appropriate types (we use `Required` to ensure we implement
// all message fields). Using our own classes has other advantages, like being
// able to specify that nested messages are instances of the same class rather
// than the interface type and thus that they have non-null fields (because the
// interface type allows all fields to be optional, even though the protobuf
// format doesn't differentiate between missing and falsey).

/**
 * Mutable byte counter. It is a class (rather than a bare number) so that a
 * single instance can be handed down by reference to the nested stats classes,
 * each of which increments `bytes` as it adds sub-messages to the report.
 */
export class SizeEstimator {
  bytes = 0;
}

/**
 * In-memory representation of a usage report, keyed by stats report key.
 * Each operation added via `addTrace` ends up either as an encoded trace or
 * folded into aggregated stats.
 */
export class OurReport implements Required<IReport> {
  // Apollo Server includes each operation either as aggregated stats or as a
  // trace, but not both. Other reporting agents such as Apollo Router include
  // all operations in stats (even those that are sent as traces), and they set
  // this flag to true.
  tracesPreAggregated = false;

  constructor(readonly header: ReportHeader) {}

  readonly tracesPerQuery: Record<string, OurTracesAndStats> =
    Object.create(null);
  endTime: google.protobuf.ITimestamp | null = null;
  operationCount = 0;

  // A rough estimate of the number of bytes currently in the report. We start
  // at zero and don't count `header` and `endTime`, which have the same size
  // for every report. This really is a rough estimate, so we don't stress too
  // much about counting bytes for the tags and string/message lengths, etc:
  // we mostly just count the lengths of strings plus some estimates for the
  // messages with a bunch of numbers in them.
  //
  // We store this in a class so we can pass it down as a reference to other
  // methods which increment it.
  readonly sizeEstimator = new SizeEstimator();

  /**
   * Walks every per-query entry and asks it to coerce its counts to integers
   * (see `OurFieldStat.ensureCountsAreIntegers` for the only field affected).
   */
  ensureCountsAreIntegers(): void {
    for (const tracesAndStats of Object.values(this.tracesPerQuery)) {
      tracesAndStats.ensureCountsAreIntegers();
    }
  }

  /**
   * Adds one operation to the report.
   *
   * When `asTrace` is true the trace is protobuf-encoded and stored as-is,
   * unless its encoded size exceeds `maxTraceBytes`, in which case it is
   * aggregated into stats instead. When `asTrace` is false it is always
   * aggregated into stats. `sizeEstimator` is updated in either case.
   */
  addTrace({
    statsReportKey,
    trace,
    asTrace,
    referencedFieldsByType,
    // The max size a trace can be before it is sent as stats. Note that the
    // Apollo reporting ingress server will never store any traces over 10mb
    // anyway. They will still be converted to stats as we would do here.
    maxTraceBytes = 10 * 1024 * 1024,
    nonFtv1ErrorPaths,
  }: {
    statsReportKey: string;
    trace: Trace;
    asTrace: boolean;
    referencedFieldsByType: ReferencedFieldsByType;
    maxTraceBytes?: number;
    nonFtv1ErrorPaths: NonFtv1ErrorPath[];
  }): void {
    const tracesAndStats = this.getTracesAndStats({
      statsReportKey,
      referencedFieldsByType,
    });
    if (asTrace) {
      const encodedTrace = Trace.encode(trace).finish();

      // A NaN limit disables the size check entirely (the trace is kept).
      if (!isNaN(maxTraceBytes) && encodedTrace.length > maxTraceBytes) {
        tracesAndStats.statsWithContext.addTrace(
          trace,
          this.sizeEstimator,
          nonFtv1ErrorPaths,
        );
      } else {
        tracesAndStats.trace.push(encodedTrace);
        // Encoded trace bytes plus a 2-byte allowance for tag and length.
        this.sizeEstimator.bytes += 2 + encodedTrace.length;
      }
    } else {
      tracesAndStats.statsWithContext.addTrace(
        trace,
        this.sizeEstimator,
        nonFtv1ErrorPaths,
      );
    }
  }

  /**
   * Returns the entry for `statsReportKey`, creating it (and charging the
   * size estimate for the key and the referenced-fields map) on first use.
   */
  private getTracesAndStats({
    statsReportKey,
    referencedFieldsByType,
  }: {
    statsReportKey: string;
    referencedFieldsByType: ReferencedFieldsByType;
  }): OurTracesAndStats {
    const existing = this.tracesPerQuery[statsReportKey];
    if (existing) {
      return existing;
    }
    this.sizeEstimator.bytes += estimatedBytesForString(statsReportKey);

    // Update the size estimator for the referenced field structure.
    for (const [typeName, referencedFieldsForType] of Object.entries(
      referencedFieldsByType,
    )) {
      // Two bytes each for the map entry and for the ReferencedFieldsForType,
      // and for the isInterface bool if it's set.
      this.sizeEstimator.bytes += 2 + 2;
      if (referencedFieldsForType.isInterface) {
        this.sizeEstimator.bytes += 2;
      }
      this.sizeEstimator.bytes += estimatedBytesForString(typeName);
      for (const fieldName of referencedFieldsForType.fieldNames) {
        this.sizeEstimator.bytes += estimatedBytesForString(fieldName);
      }
    }

    // Include the referenced fields map in the report. (In an ideal world we
    // could have a slightly more sophisticated protocol and ingestion pipeline
    // that allowed us to only have to send this data once for each
    // schema/operation pair.)
    return (this.tracesPerQuery[statsReportKey] = new OurTracesAndStats(
      referencedFieldsByType,
    ));
  }
}

/**
 * Per-query bucket: the encoded traces kept verbatim plus the aggregated
 * stats for operations that were not kept as traces.
 */
class OurTracesAndStats implements Required<ITracesAndStats> {
  constructor(readonly referencedFieldsByType: ReferencedFieldsByType) {}

  readonly trace: Uint8Array[] = [];
  readonly statsWithContext = new StatsByContext();
  readonly internalTracesContributingToStats: Uint8Array[] = [];

  ensureCountsAreIntegers(): void {
    this.statsWithContext.ensureCountsAreIntegers();
  }
}

/**
 * Aggregated stats grouped by stats context (client name + client version).
 */
class StatsByContext {
  readonly map: { [k: string]: OurContextualizedStats } = Object.create(null);

  /**
   * This function is used by the protobuf generator to convert this map into
   * an array of contextualized stats to serialize
   */
  toArray(): IContextualizedStats[] {
    return Object.values(this.map);
  }

  ensureCountsAreIntegers(): void {
    for (const contextualizedStats of Object.values(this.map)) {
      contextualizedStats.ensureCountsAreIntegers();
    }
  }

  /**
   * Folds `trace` into the contextualized stats matching its client name and
   * version, creating that entry if needed.
   */
  addTrace(
    trace: Trace,
    sizeEstimator: SizeEstimator,
    nonFtv1ErrorPaths: NonFtv1ErrorPath[],
  ): void {
    this.getContextualizedStats(trace, sizeEstimator).addTrace(
      trace,
      sizeEstimator,
      nonFtv1ErrorPaths,
    );
  }

  /**
   * Looks up (or creates) the stats entry for the trace's client name and
   * version. The map key is the JSON serialization of the stats context.
   */
  private getContextualizedStats(
    trace: Trace,
    sizeEstimator: SizeEstimator,
  ): OurContextualizedStats {
    const statsContext: IStatsContext = {
      clientName: trace.clientName,
      clientVersion: trace.clientVersion,
    };
    const statsContextKey = JSON.stringify(statsContext);

    const existing = this.map[statsContextKey];
    if (existing) {
      return existing;
    }
    // Adding a ContextualizedStats means adding a StatsContext plus a
    // QueryLatencyStats. Let's guess about 20 bytes for a QueryLatencyStats;
    // it'll be more if more features are used (like cache, APQ, etc).
    sizeEstimator.bytes +=
      20 +
      estimatedBytesForString(trace.clientName) +
      estimatedBytesForString(trace.clientVersion);
    const contextualizedStats = new OurContextualizedStats(statsContext);
    this.map[statsContextKey] = contextualizedStats;
    return contextualizedStats;
  }
}

/**
 * Aggregated stats for a single stats context: query-level latency/error
 * stats plus per-type, per-field stats.
 */
export class OurContextualizedStats implements Required<IContextualizedStats> {
  queryLatencyStats = new OurQueryLatencyStats();
  perTypeStat: { [k: string]: OurTypeStat } = Object.create(null);

  constructor(readonly context: IStatsContext) {}

  ensureCountsAreIntegers(): void {
    for (const typeStat of Object.values(this.perTypeStat)) {
      typeStat.ensureCountsAreIntegers();
    }
  }

  // Extract statistics from the trace, and increment the estimated report size.
  // We only add to the estimate when adding whole sub-messages. If it really
  // mattered, we could do a lot more careful things like incrementing it
  // whenever a numeric field on queryLatencyStats gets incremented over 0.
  addTrace(
    trace: Trace,
    sizeEstimator: SizeEstimator,
    nonFtv1ErrorPaths: NonFtv1ErrorPath[] = [],
  ): void {
    const { fieldExecutionWeight } = trace;
    if (!fieldExecutionWeight) {
      this.queryLatencyStats.requestsWithoutFieldInstrumentation++;
    }

    this.queryLatencyStats.requestCount++;
    if (trace.fullQueryCacheHit) {
      this.queryLatencyStats.cacheLatencyCount.incrementDuration(
        trace.durationNs,
      );
      this.queryLatencyStats.cacheHits++;
    } else {
      this.queryLatencyStats.latencyCount.incrementDuration(trace.durationNs);
    }

    // We only provide stats about cache TTLs on cache misses (ie, TTLs directly
    // calculated by the backend), not for cache hits. This matches the
    // behavior we've had for a while when converting traces into statistics
    // in Studio's servers.
    if (!trace.fullQueryCacheHit && trace.cachePolicy?.maxAgeNs != null) {
      // Scopes other than PRIVATE and PUBLIC are deliberately not counted.
      switch (trace.cachePolicy.scope) {
        case Trace.CachePolicy.Scope.PRIVATE:
          this.queryLatencyStats.privateCacheTtlCount.incrementDuration(
            trace.cachePolicy.maxAgeNs,
          );
          break;
        case Trace.CachePolicy.Scope.PUBLIC:
          this.queryLatencyStats.publicCacheTtlCount.incrementDuration(
            trace.cachePolicy.maxAgeNs,
          );
          break;
      }
    }

    if (trace.persistedQueryHit) {
      this.queryLatencyStats.persistedQueryHits++;
    }
    if (trace.persistedQueryRegister) {
      this.queryLatencyStats.persistedQueryMisses++;
    }

    if (trace.forbiddenOperation) {
      this.queryLatencyStats.forbiddenOperationCount++;
    }
    if (trace.registeredOperation) {
      this.queryLatencyStats.registeredOperationCount++;
    }

    let hasError = false;

    // Distinct error-path nodes touched by this trace; a Set so that each node
    // has requestsWithErrorsCount bumped at most once per trace.
    const errorPathStats = new Set<OurPathErrorStats>();

    const traceNodeStats = (node: Trace.INode, path: ResponseNamePath) => {
      // Generate error stats and error path information
      if (node.error?.length) {
        hasError = true;

        let currPathErrorStats = this.queryLatencyStats.rootErrorStats;
        path.toArray().forEach((subPath) => {
          currPathErrorStats = currPathErrorStats.getChild(
            subPath,
            sizeEstimator,
          );
        });

        errorPathStats.add(currPathErrorStats);
        currPathErrorStats.errorsCount += node.error.length;
      }

      if (fieldExecutionWeight) {
        // The actual field name behind the node; originalFieldName is set
        // if an alias was used, otherwise responseName. (This is falsey for
        // nodes that are not fields (root, array index, etc).)
        const fieldName = node.originalFieldName || node.responseName;

        // Protobuf doesn't really differentiate between "unset" and "falsey" so
        // we're mostly actually checking that these things are non-empty string /
        // non-zero numbers. The time fields represent the number of nanoseconds
        // since the beginning of the entire trace, so let's pretend for the
        // moment that it's plausible for a node to start or even end exactly when
        // the trace started (ie, for the time values to be 0). This is unlikely
        // in practice (everything should take at least 1ns). In practice we only
        // write `type` and `parentType` on a Node when we write `startTime`, so
        // the main thing we're looking out for by checking the time values is
        // whether we somehow failed to write `endTime` at the end of the field;
        // in this case, the `endTime >= startTime` check won't match.
        if (
          node.parentType &&
          fieldName &&
          node.type &&
          node.endTime != null &&
          node.startTime != null &&
          node.endTime >= node.startTime
        ) {
          const typeStat = this.getTypeStat(node.parentType, sizeEstimator);

          const fieldStat = typeStat.getFieldStat(
            fieldName,
            node.type,
            sizeEstimator,
          );

          fieldStat.errorsCount += node.error?.length ?? 0;
          fieldStat.observedExecutionCount++;
          fieldStat.estimatedExecutionCount += fieldExecutionWeight;
          // Note: this is actually counting the number of resolver calls for this
          // field that had at least one error, not the number of overall GraphQL
          // queries that had at least one error for this field. That doesn't seem
          // to match the name, but it does match the other implementations of this
          // logic.
          fieldStat.requestsWithErrorsCount +=
            (node.error?.length ?? 0) > 0 ? 1 : 0;
          fieldStat.latencyCount.incrementDuration(
            node.endTime - node.startTime,
            // The latency histogram is always "estimated"; we don't track
            // "observed" and "estimated" separately.
            fieldExecutionWeight,
          );
        }
      }

      // NOTE(review): presumably `false` tells iterateOverTrace to keep
      // walking — confirm against iterateOverTrace's callback contract.
      return false;
    };

    iterateOverTrace(trace, traceNodeStats, true);

    // iterate over nonFtv1ErrorPaths, using some bits from traceNodeStats function
    for (const { subgraph, path } of nonFtv1ErrorPaths) {
      // The request counts as errored even when the error carries no path.
      hasError = true;
      if (path) {
        let currPathErrorStats = this.queryLatencyStats.rootErrorStats.getChild(
          `service:${subgraph}`,
          sizeEstimator,
        );
        path.forEach((subPath) => {
          // Only string segments extend the error path; other segment types
          // are skipped.
          if (typeof subPath === 'string') {
            currPathErrorStats = currPathErrorStats.getChild(
              subPath,
              sizeEstimator,
            );
          }
        });

        errorPathStats.add(currPathErrorStats);
        currPathErrorStats.errorsCount += 1;
      }
    }

    for (const errorPath of errorPathStats) {
      errorPath.requestsWithErrorsCount += 1;
    }

    if (hasError) {
      this.queryLatencyStats.requestsWithErrorsCount++;
    }
  }

  /**
   * Returns the stats entry for `parentType`, creating it (and charging the
   * size estimate for the type name) on first use.
   */
  getTypeStat(parentType: string, sizeEstimator: SizeEstimator): OurTypeStat {
    const existing = this.perTypeStat[parentType];
    if (existing) {
      return existing;
    }
    sizeEstimator.bytes += estimatedBytesForString(parentType);
    const typeStat = new OurTypeStat();
    this.perTypeStat[parentType] = typeStat;
    return typeStat;
  }
}

/** Query-level counters and histograms for one stats context. */
class OurQueryLatencyStats implements Required<IQueryLatencyStats> {
  latencyCount: DurationHistogram = new DurationHistogram();
  requestCount = 0;
  requestsWithoutFieldInstrumentation = 0;
  cacheHits = 0;
  persistedQueryHits = 0;
  persistedQueryMisses = 0;
  cacheLatencyCount: DurationHistogram = new DurationHistogram();
  rootErrorStats: OurPathErrorStats = new OurPathErrorStats();
  requestsWithErrorsCount = 0;
  publicCacheTtlCount: DurationHistogram = new DurationHistogram();
  privateCacheTtlCount: DurationHistogram = new DurationHistogram();
  registeredOperationCount = 0;
  forbiddenOperationCount = 0;
}

/** One node in the tree of error counts, keyed by response path segment. */
class OurPathErrorStats implements Required<IPathErrorStats> {
  children: { [k: string]: OurPathErrorStats } = Object.create(null);
  errorsCount = 0;
  requestsWithErrorsCount = 0;

  /**
   * Returns the child node for `subPath`, creating it (and charging the size
   * estimate) on first use.
   */
  getChild(subPath: string, sizeEstimator: SizeEstimator): OurPathErrorStats {
    const existing = this.children[subPath];
    if (existing) {
      return existing;
    }
    const child = new OurPathErrorStats();
    this.children[subPath] = child;
    // Include a few bytes in the estimate for the numbers etc.
    sizeEstimator.bytes += estimatedBytesForString(subPath) + 4;
    return child;
  }
}

/** Per-field stats for a single parent type. */
class OurTypeStat implements Required<ITypeStat> {
  perFieldStat: { [k: string]: OurFieldStat } = Object.create(null);

  /**
   * Returns the stats entry for `fieldName`, creating it on first use. Note
   * that an existing entry is returned as-is: `returnType` is only consulted
   * when the entry is created.
   */
  getFieldStat(
    fieldName: string,
    returnType: string,
    sizeEstimator: SizeEstimator,
  ): OurFieldStat {
    const existing = this.perFieldStat[fieldName];
    if (existing) {
      return existing;
    }
    // Rough estimate of 10 bytes for the numbers in the FieldStat.
    sizeEstimator.bytes +=
      estimatedBytesForString(fieldName) +
      estimatedBytesForString(returnType) +
      10;
    const fieldStat = new OurFieldStat(returnType);
    this.perFieldStat[fieldName] = fieldStat;
    return fieldStat;
  }

  ensureCountsAreIntegers(): void {
    for (const fieldStat of Object.values(this.perFieldStat)) {
      fieldStat.ensureCountsAreIntegers();
    }
  }
}

/** Counters and latency histogram for a single field. */
class OurFieldStat implements Required<IFieldStat> {
  errorsCount = 0;
  observedExecutionCount = 0;
  // Note that this number isn't necessarily an integer while it is being
  // aggregated. Before encoding as a protobuf we call ensureCountsAreIntegers
  // which floors it.
  estimatedExecutionCount = 0;
  requestsWithErrorsCount = 0;
  latencyCount: DurationHistogram = new DurationHistogram();

  constructor(readonly returnType: string) {}

  ensureCountsAreIntegers(): void {
    // This is the only one that ever can receive non-integers.
    this.estimatedExecutionCount = Math.floor(this.estimatedExecutionCount);
  }
}

/**
 * Rough serialized size of a protobuf string field holding `s`: its UTF-8
 * byte length plus a fixed 2-byte overhead.
 */
function estimatedBytesForString(s: string): number {
  // 2 is for the tag (field ID + wire type) plus the encoded length. (The
  // encoded length takes up more than 1 byte for strings that are longer than
  // 127 bytes, but this is an estimate.)
  return 2 + Buffer.byteLength(s);
}