UNPKG

@tanstack/db

Version:

A reactive client store for building super fast apps on sync

617 lines (552 loc) 21.5 kB
import { filter, join as joinOperator, map, tap } from '@tanstack/db-ivm'
import {
  CollectionInputNotFoundError,
  InvalidJoinCondition,
  InvalidJoinConditionLeftSourceError,
  InvalidJoinConditionRightSourceError,
  InvalidJoinConditionSameSourceError,
  InvalidJoinConditionSourceMismatchError,
  JoinCollectionNotFoundError,
  SubscriptionNotFoundError,
  UnsupportedJoinSourceTypeError,
  UnsupportedJoinTypeError,
} from '../../errors.js'
import { ensureIndexForField } from '../../indexes/auto-index.js'
import { PropRef, followRef } from '../ir.js'
import { inArray } from '../builder/functions.js'
import { compileExpression } from './evaluators.js'
import type { CompileQueryFn } from './index.js'
import type { OrderByOptimizationInfo } from './order-by.js'
import type {
  BasicExpression,
  CollectionRef,
  JoinClause,
  QueryIR,
  QueryRef,
} from '../ir.js'
import type { IStreamBuilder, JoinType } from '@tanstack/db-ivm'
import type { Collection } from '../../collection/index.js'
import type {
  KeyedStream,
  NamespacedAndKeyedStream,
  NamespacedRow,
} from '../../types.js'
import type { QueryCache, QueryMapping, WindowOptions } from './types.js'
import type { CollectionSubscription } from '../../collection/subscription.js'

/** Signature of a callback that loads a specific set of keys into a lazy collection. */
export type LoadKeysFn = (key: Set<string | number>) => void

/** Callbacks used to drive a lazily loaded collection in an optimized join. */
export type LazyCollectionCallbacks = {
  loadKeys: LoadKeysFn
  loadInitialState: () => void
}

/**
 * Applies every join clause, in order, on top of the given pipeline.
 *
 * Each clause is delegated to `processJoin` together with the shared
 * compilation state; the stream produced for one clause becomes the input of
 * the next. While clauses are processed, `sources`, `lazySources`,
 * `aliasToCollectionId` and `aliasRemapping` are mutated in place.
 *
 * @returns The pipeline after the last join, or `pipeline` itself when
 *   `joinClauses` is empty.
 */
export function processJoins(
  pipeline: NamespacedAndKeyedStream,
  joinClauses: Array<JoinClause>,
  sources: Record<string, KeyedStream>,
  mainCollectionId: string,
  mainSource: string,
  allInputs: Record<string, KeyedStream>,
  cache: QueryCache,
  queryMapping: QueryMapping,
  collections: Record<string, Collection>,
  subscriptions: Record<string, CollectionSubscription>,
  callbacks: Record<string, LazyCollectionCallbacks>,
  lazySources: Set<string>,
  optimizableOrderByCollections: Record<string, OrderByOptimizationInfo>,
  setWindowFn: (windowFn: (options: WindowOptions) => void) => void,
  rawQuery: QueryIR,
  onCompileSubquery: CompileQueryFn,
  aliasToCollectionId: Record<string, string>,
  aliasRemapping: Record<string, string>,
): NamespacedAndKeyedStream {
  // Fold the clauses left-to-right so each join sees the output of the previous one.
  return joinClauses.reduce<NamespacedAndKeyedStream>(
    (upstream, joinClause) =>
      processJoin(
        upstream,
        joinClause,
        sources,
        mainCollectionId,
        mainSource,
        allInputs,
        cache,
        queryMapping,
        collections,
        subscriptions,
        callbacks,
        lazySources,
        optimizableOrderByCollections,
        setWindowFn,
        rawQuery,
        onCompileSubquery,
        aliasToCollectionId,
        aliasRemapping,
      ),
    pipeline,
  )
}

/**
 * Compiles a single join clause onto `pipeline`.
 *
 * Steps, in order:
 *  1. Resolve the joined source (collection or subquery) and register it in
 *     `sources` (and in `aliasToCollectionId` for direct collection refs).
 *  2. Decide which side, if any, is "active" (fully iterated) and which is
 *     "lazy" (loaded on demand) via `getActiveAndLazySources`.
 *  3. Work out which join expression belongs to which side and re-key both
 *     streams by their join key.
 *  4. When the join is optimizable, mark the lazy alias in `lazySources`,
 *     ensure an index on the lazy join field, and tap the active stream so
 *     that matching rows are requested from the lazy subscription.
 *  5. Join both streams and flatten the result with `processJoinResults`.
 *
 * @throws JoinCollectionNotFoundError when the main or joined collection id is
 *   missing from `collections`.
 * @throws UnsupportedJoinTypeError when the clause type is not one of
 *   `inner`, `left`, `right`, `full`.
 * @throws SubscriptionNotFoundError (at stream time) when no subscription is
 *   registered for the resolved lazy alias.
 */
function processJoin(
  pipeline: NamespacedAndKeyedStream,
  joinClause: JoinClause,
  sources: Record<string, KeyedStream>,
  mainCollectionId: string,
  mainSource: string,
  allInputs: Record<string, KeyedStream>,
  cache: QueryCache,
  queryMapping: QueryMapping,
  collections: Record<string, Collection>,
  subscriptions: Record<string, CollectionSubscription>,
  callbacks: Record<string, LazyCollectionCallbacks>,
  lazySources: Set<string>,
  optimizableOrderByCollections: Record<string, OrderByOptimizationInfo>,
  setWindowFn: (windowFn: (options: WindowOptions) => void) => void,
  rawQuery: QueryIR,
  onCompileSubquery: CompileQueryFn,
  aliasToCollectionId: Record<string, string>,
  aliasRemapping: Record<string, string>,
): NamespacedAndKeyedStream {
  const isCollectionRef = joinClause.from.type === `collectionRef`

  // Resolve the alias, input stream and backing collection of the joined side.
  const {
    alias: joinedSource,
    input: joinedInput,
    collectionId: joinedCollectionId,
  } = processJoinSource(
    joinClause.from,
    allInputs,
    collections,
    subscriptions,
    callbacks,
    lazySources,
    optimizableOrderByCollections,
    setWindowFn,
    cache,
    queryMapping,
    onCompileSubquery,
    aliasToCollectionId,
    aliasRemapping,
  )

  // Make the joined alias visible to this and to later join clauses.
  sources[joinedSource] = joinedInput
  if (isCollectionRef) {
    // Only a direct collection reference introduces a new alias binding here;
    // subquery aliases keep whatever mapping the recursive compilation returned.
    aliasToCollectionId[joinedSource] = joinedCollectionId
  }

  const mainCollection = collections[mainCollectionId]
  const joinedCollection = collections[joinedCollectionId]

  if (!mainCollection) {
    throw new JoinCollectionNotFoundError(mainCollectionId)
  }

  if (!joinedCollection) {
    throw new JoinCollectionNotFoundError(joinedCollectionId)
  }

  const { activeSource, lazySource } = getActiveAndLazySources(
    joinClause.type,
    mainCollection,
    joinedCollection,
  )

  // Figure out which expression targets the existing sources and which one
  // targets the newly joined source (the user may have written them swapped).
  const { mainExpr, joinedExpr } = analyzeJoinExpressions(
    joinClause.left,
    joinClause.right,
    Object.keys(sources),
    joinedSource,
  )

  // Compile both join-key extractors once, outside the per-row callbacks.
  const compiledMainExpr = compileExpression(mainExpr)
  const compiledJoinedExpr = compileExpression(joinedExpr)

  // Re-key the main stream: [joinKey, [originalKey, namespacedRow]].
  let mainPipeline = pipeline.pipe(
    map(([rowKey, namespacedRow]) => {
      return [compiledMainExpr(namespacedRow), [rowKey, namespacedRow]] as [
        unknown,
        [string, typeof namespacedRow],
      ]
    }),
  )

  // Re-key the joined stream the same way, after namespacing each raw row
  // under the joined alias so the compiled expression can resolve it.
  let joinedPipeline = joinedInput.pipe(
    map(([rowKey, row]) => {
      const namespacedRow: NamespacedRow = { [joinedSource]: row }
      return [compiledJoinedExpr(namespacedRow), [rowKey, namespacedRow]] as [
        unknown,
        [string, typeof namespacedRow],
      ]
    }),
  )

  // Reject join types the join operator below does not understand.
  if (![`inner`, `left`, `right`, `full`].includes(joinClause.type)) {
    throw new UnsupportedJoinTypeError(joinClause.type)
  }

  if (activeSource) {
    const mainIsActive = activeSource === `main`

    // A lazy side that is a subquery with LIMIT and/or OFFSET cannot be
    // optimized: the index lookup sees every row of the collection, not just
    // the rows that survive the limit/offset window.
    const lazyFrom = mainIsActive ? joinClause.from : rawQuery.from
    const lazySideIsWindowed =
      lazyFrom.type === `queryRef` &&
      (lazyFrom.query.limit || lazyFrom.query.offset)

    // Computed join keys (function expressions such as concat) cannot be
    // optimized either, because there is no index over the computed value.
    const joinsOnComputedValue =
      mainExpr.type === `func` || joinedExpr.type === `func`

    if (!(lazySideIsWindowed || joinsOnComputedValue)) {
      // Optimized path: the active side streams normally and, for every batch,
      // asks the lazy side to load only the rows matching the batch's join keys.

      // Record the lazy alias in the caller-provided set so the caller can tell
      // after compilation which aliases must not load their data up front.
      const lazyAlias = mainIsActive ? joinedSource : mainSource
      lazySources.add(lazyAlias)

      const activePipeline = mainIsActive ? mainPipeline : joinedPipeline

      const lazySourceJoinExpr = mainIsActive
        ? (joinedExpr as PropRef)
        : (mainExpr as PropRef)

      // Resolve the join expression to a concrete collection + field path.
      // NOTE(review): the non-null assertion assumes followRef always resolves
      // for these expressions — confirm against followRef's contract.
      const followRefResult = followRef(
        rawQuery,
        lazySourceJoinExpr,
        lazySource,
      )!
      const followRefCollection = followRefResult.collection

      // Make sure the lazy join field is indexed so key lookups can be served.
      const fieldName = followRefResult.path[0]
      if (fieldName) {
        ensureIndexForField(
          fieldName,
          followRefResult.path,
          followRefCollection,
        )
      }

      // Observe the active stream and trigger loading on the lazy side.
      const activePipelineWithLoading: IStreamBuilder<
        [key: unknown, [originalKey: string, namespacedRow: NamespacedRow]]
      > = activePipeline.pipe(
        tap((batch) => {
          // Subscriptions are keyed by the innermost alias (where the
          // collection subscription was actually created). When the lazy side
          // is a subquery its outer alias may differ, so aliasRemapping is
          // consulted first; it maps an outer alias straight to the innermost
          // one (single hop), e.g. `activeUser` -> `user`.
          const subscriptionAlias = aliasRemapping[lazyAlias] || lazyAlias
          const lazySubscription = subscriptions[subscriptionAlias]

          if (!lazySubscription) {
            throw new SubscriptionNotFoundError(
              subscriptionAlias,
              lazyAlias,
              lazySource.id,
              Object.keys(subscriptions),
            )
          }

          if (lazySubscription.hasLoadedInitialState()) {
            // Everything is already loaded (the join was deoptimized elsewhere).
            return
          }

          // Ask for a snapshot restricted to the join keys present in this batch.
          const joinKeys = batch.getInner().map(([[joinKey]]) => joinKey)
          const lazyJoinRef = new PropRef(followRefResult.path)
          const servedFromIndex = lazySubscription.requestSnapshot({
            where: inArray(lazyJoinRef, joinKeys),
            optimizedOnly: true,
          })

          if (!servedFromIndex) {
            // The filtered snapshot could not be served from the indexes;
            // fall back to an unfiltered snapshot request.
            lazySubscription.requestSnapshot()
          }
        }),
      )

      // Swap the tapped stream in for whichever side was active.
      if (mainIsActive) {
        mainPipeline = activePipelineWithLoading
      } else {
        joinedPipeline = activePipelineWithLoading
      }
    }
  }

  return mainPipeline.pipe(
    joinOperator(joinedPipeline, joinClause.type as JoinType),
    processJoinResults(joinClause.type),
  )
}

/**
 * Decides which of the two join expressions refers to an already available
 * source and which refers to the newly joined source.
 *
 * @param left - Left-hand expression of the join condition.
 * @param right - Right-hand expression of the join condition.
 * @param allAvailableSourceAliases - Every alias known so far (may include `joinedSource`).
 * @param joinedSource - Alias of the source introduced by this join.
 * @returns The expressions ordered as `{ mainExpr, joinedExpr }`, swapped
 *   relative to `left`/`right` when the condition was written the other way round.
 * @throws InvalidJoinConditionSourceMismatchError when an expression references no single source.
 * @throws InvalidJoinConditionSameSourceError when both expressions reference the same alias.
 * @throws InvalidJoinConditionLeftSourceError when the left alias is not an available source.
 * @throws InvalidJoinConditionRightSourceError when the right alias is not the joined source.
 * @throws InvalidJoinCondition as a final fallback.
 */
function analyzeJoinExpressions(
  left: BasicExpression,
  right: BasicExpression,
  allAvailableSourceAliases: Array<string>,
  joinedSource: string,
): { mainExpr: BasicExpression; joinedExpr: BasicExpression } {
  const leftAlias = getSourceAliasFromExpression(left)
  const rightAlias = getSourceAliasFromExpression(right)

  // "Available" means: known before this join, i.e. anything but the joined alias.
  const isPreviouslyAvailable = (alias: string | null): boolean =>
    !!alias &&
    alias !== joinedSource &&
    allAvailableSourceAliases.includes(alias)

  // available = joined  ->  already in the expected order
  if (isPreviouslyAvailable(leftAlias) && rightAlias === joinedSource) {
    return { mainExpr: left, joinedExpr: right }
  }

  // joined = available  ->  swap
  if (leftAlias === joinedSource && isPreviouslyAvailable(rightAlias)) {
    return { mainExpr: right, joinedExpr: left }
  }

  // Everything below is an invalid condition; pick the most specific error.
  if (!leftAlias || !rightAlias) {
    throw new InvalidJoinConditionSourceMismatchError()
  }

  if (leftAlias === rightAlias) {
    throw new InvalidJoinConditionSameSourceError(leftAlias)
  }

  // The query builder cannot produce a ref to an unavailable source, but a
  // hand-written IR can.
  if (!isPreviouslyAvailable(leftAlias)) {
    throw new InvalidJoinConditionLeftSourceError(leftAlias)
  }

  if (rightAlias !== joinedSource) {
    throw new InvalidJoinConditionRightSourceError(joinedSource)
  }

  // Not expected to be reachable given the checks above; kept as a safety net.
  throw new InvalidJoinCondition()
}

/**
 * Determines the single source alias a join expression refers to.
 *
 * - `ref`: the first path segment is the alias (`null` when missing or empty).
 * - `func`: the alias shared by all arguments that reference a source, or
 *   `null` when the arguments reference zero or more than one distinct alias.
 * - anything else (e.g. plain values): `null`.
 */
function getSourceAliasFromExpression(expr: BasicExpression): string | null {
  if (expr.type === `ref`) {
    return expr.path[0] || null
  }

  if (expr.type !== `func`) {
    // Values do not reference any source.
    return null
  }

  // Scan the arguments for exactly one distinct alias.
  let soleAlias: string | null = null
  for (const arg of expr.args) {
    const argAlias = getSourceAliasFromExpression(arg)
    if (!argAlias || argAlias === soleAlias) {
      continue
    }
    if (soleAlias !== null) {
      // A second, different alias: the function spans multiple sources.
      return null
    }
    soleAlias = argAlias
  }
  return soleAlias
}

/**
 * Resolves the source of a join (a direct collection or a subquery) into an
 * alias, an input stream and the id of the backing collection.
 *
 * Side effects: records alias bindings in `aliasToCollectionId` and, for
 * subqueries, merges the subquery's alias maps into `aliasToCollectionId` and
 * `aliasRemapping` and may add an outer-to-innermost entry to `aliasRemapping`.
 *
 * @throws CollectionInputNotFoundError when a collection alias has no entry in `allInputs`.
 * @throws UnsupportedJoinSourceTypeError when `from.type` is neither
 *   `collectionRef` nor `queryRef`.
 */
function processJoinSource(
  from: CollectionRef | QueryRef,
  allInputs: Record<string, KeyedStream>,
  collections: Record<string, Collection>,
  subscriptions: Record<string, CollectionSubscription>,
  callbacks: Record<string, LazyCollectionCallbacks>,
  lazySources: Set<string>,
  optimizableOrderByCollections: Record<string, OrderByOptimizationInfo>,
  setWindowFn: (windowFn: (options: WindowOptions) => void) => void,
  cache: QueryCache,
  queryMapping: QueryMapping,
  onCompileSubquery: CompileQueryFn,
  aliasToCollectionId: Record<string, string>,
  aliasRemapping: Record<string, string>,
): { alias: string; input: KeyedStream; collectionId: string } {
  if (from.type === `collectionRef`) {
    const input = allInputs[from.alias]
    if (!input) {
      throw new CollectionInputNotFoundError(
        from.alias,
        from.collection.id,
        Object.keys(allInputs),
      )
    }
    aliasToCollectionId[from.alias] = from.collection.id
    return { alias: from.alias, input, collectionId: from.collection.id }
  }

  if (from.type === `queryRef`) {
    // Prefer the original (pre-mapping) query so the compilation cache is hit.
    const originalQuery = queryMapping.get(from.query) || from.query

    // Compile the subquery recursively, sharing all compilation state.
    const subQueryResult = onCompileSubquery(
      originalQuery,
      allInputs,
      collections,
      subscriptions,
      callbacks,
      lazySources,
      optimizableOrderByCollections,
      setWindowFn,
      cache,
      queryMapping,
    )

    // Lift the subquery's alias bookkeeping into the parent scope: both its
    // alias -> collection bindings and any remappings it already accumulated
    // from deeper nesting levels.
    Object.assign(aliasToCollectionId, subQueryResult.aliasToCollectionId)
    Object.assign(aliasRemapping, subQueryResult.aliasRemapping)

    // Map this join's outer alias directly to the innermost alias bound to the
    // subquery's collection, so that lookups stay single-hop. With three
    // levels of nesting:
    //   inner : .from({ user: usersCollection })
    //   middle: .from({ activeUser: innerSubquery })   -> activeUser -> user
    //   outer : .join({ author: middleSubquery }, ...)  -> author -> user
    // (never author -> activeUser).
    const innermostAlias = Object.entries(
      subQueryResult.aliasToCollectionId,
    ).find(([, boundId]) => boundId === subQueryResult.collectionId)?.[0]
    if (innermostAlias && innermostAlias !== from.alias) {
      aliasRemapping[from.alias] = innermostAlias
    }

    // Strip the order-by index: entries are unpacked as
    // [key, [value, orderByIndex]] and only [key, value] is forwarded.
    // NOTE(review): the destructuring assumes this nested tuple shape for every
    // subquery result — confirm results without ORDER BY are wrapped the same way.
    const extractedInput = subQueryResult.pipeline.pipe(
      map((entry: any) => {
        const [key, [value, _orderByIndex]] = entry
        return [key, value] as [unknown, any]
      }),
    )

    return {
      alias: from.alias,
      input: extractedInput as KeyedStream,
      collectionId: subQueryResult.collectionId,
    }
  }

  throw new UnsupportedJoinSourceTypeError((from as any).type)
}

/**
 * Builds the post-join stage: drops pairs that the join type does not allow
 * and flattens each surviving pair into a single namespaced row.
 *
 * @param joinType - `inner` keeps pairs with both sides, `left` requires the
 *   main side, `right` requires the joined side; any other value keeps all pairs.
 * @returns A pipeline operator mapping `[joinKey, [main?, joined?]]` entries to
 *   `[compositeKey, mergedNamespacedRow]`.
 */
function processJoinResults(joinType: string) {
  return function (
    pipeline: IStreamBuilder<
      [
        key: string,
        [
          [string, NamespacedRow] | undefined,
          [string, NamespacedRow] | undefined,
        ],
      ]
    >,
  ): NamespacedAndKeyedStream {
    return pipeline.pipe(
      // Keep only the pairs permitted by the join type.
      filter(([_joinKey, [main, joined]]) => {
        const mainRow = main?.[1]
        const joinedRow = joined?.[1]

        switch (joinType) {
          case `inner`:
            return !!(mainRow && joinedRow)
          case `left`:
            return !!mainRow
          case `right`:
            return !!joinedRow
          default:
            // Full joins keep every pair.
            return true
        }
      }),
      map(([_joinKey, [main, joined]]) => {
        // Merge both sides into one row; a missing side contributes nothing,
        // and on an alias clash the joined side wins (it is assigned last).
        const mergedRow: NamespacedRow = {}
        Object.assign(mergedRow, main?.[1], joined?.[1])

        // Composite key built from both original keys; a missing side is
        // rendered as "undefined" by the template literal.
        const compositeKey = `[${main?.[0]},${joined?.[0]}]`

        return [compositeKey, mergedRow] as [string, NamespacedRow]
      }),
    )
  }
}

/**
 * Chooses the active and the lazy side of a join.
 *
 * The active side is the one that is fully iterated; it is either the main
 * source (left collection) or the joined source (right collection). The lazy
 * side is loaded on demand, based on the keys seen on the active side.
 *
 * - `left`: main is active, the right collection is lazy.
 * - `right`: joined is active, the left collection is lazy.
 * - `inner`: the smaller collection (by `size`) is active; on a tie the joined
 *   side is active.
 * - otherwise: both fields are `undefined`, meaning both sides are iterated.
 *
 * Self-joins are fine here because lazy state is tracked per source alias, not
 * per collection id.
 *
 * @param joinType - The type of the join clause.
 * @param leftCollection - Collection behind the main (left) source.
 * @param rightCollection - Collection behind the joined (right) source.
 * @returns The active side marker and the lazy collection, or both `undefined`.
 */
function getActiveAndLazySources(
  joinType: JoinClause[`type`],
  leftCollection: Collection,
  rightCollection: Collection,
):
  | { activeSource: `main` | `joined`; lazySource: Collection }
  | { activeSource: undefined; lazySource: undefined } {
  // Sizes are only compared (and therefore only read) for inner joins.
  if (
    joinType === `left` ||
    (joinType === `inner` && leftCollection.size < rightCollection.size)
  ) {
    return { activeSource: `main`, lazySource: rightCollection }
  }

  if (joinType === `right` || joinType === `inner`) {
    return { activeSource: `joined`, lazySource: leftCollection }
  }

  return { activeSource: undefined, lazySource: undefined }
}