@tanstack/db
Version:
A reactive client store for building super fast apps on sync
813 lines (729 loc) • 26.8 kB
text/typescript
/**
* # Index-Based Query Optimization
*
* This module provides utilities for optimizing query expressions by leveraging
* available indexes to quickly find matching keys instead of scanning all data.
*
* This is different from the query structure optimizer in `query/optimizer.ts`
* which rewrites query IR structure. This module focuses on using indexes during
* query execution to speed up data filtering.
*
* ## Key Features:
* - Uses indexes to find matching keys for WHERE conditions
* - Supports AND/OR logic with set operations
* - Handles range queries (eq, gt, gte, lt, lte)
* - Optimizes IN array expressions
*/
import { DEFAULT_COMPARE_OPTIONS } from '../utils.js'
import { ReverseIndex } from '../indexes/reverse-index.js'
import { hasVirtualPropPath } from '../virtual-props.js'
import { makeComparator } from './comparison.js'
import type { CompareOptions } from '../query/builder/types.js'
import type { IndexInterface, IndexOperation } from '../indexes/base-index.js'
import type { BasicExpression } from '../query/ir.js'
import type { CollectionLike } from '../types.js'
/**
* Result of index-based query optimization
*/
export interface OptimizationResult<TKey> {
canOptimize: boolean
matchingKeys: Set<TKey>
/**
* Whether `matchingKeys` is exactly the set of keys matching the expression.
* When `false`, the keys are a superset of the true result (some conditions
* could not be served by an index) and each row must be re-checked against
* the full expression before being included in the result.
*/
isExact: boolean
}
/**
* Finds an index that matches a given field path
*/
export function findIndexForField<TKey extends string | number>(
collection: CollectionLike<any, TKey>,
fieldPath: Array<string>,
compareOptions?: CompareOptions,
): IndexInterface<TKey> | undefined {
if (hasVirtualPropPath(fieldPath)) {
return undefined
}
const compareOpts = compareOptions ?? {
...DEFAULT_COMPARE_OPTIONS,
...collection.compareOptions,
}
for (const index of collection.indexes.values()) {
if (
index.matchesField(fieldPath) &&
index.matchesCompareOptions(compareOpts)
) {
if (!index.matchesDirection(compareOpts.direction)) {
return new ReverseIndex(index)
}
return index
}
}
return undefined
}
/**
* Intersects multiple sets (AND logic)
*/
export function intersectSets<T>(sets: Array<Set<T>>): Set<T> {
if (sets.length === 0) return new Set()
if (sets.length === 1) return new Set(sets[0])
let result = new Set(sets[0])
for (let i = 1; i < sets.length; i++) {
const newResult = new Set<T>()
for (const item of result) {
if (sets[i]!.has(item)) {
newResult.add(item)
}
}
result = newResult
}
return result
}
/**
* Unions multiple sets (OR logic)
*/
export function unionSets<T>(sets: Array<Set<T>>): Set<T> {
const result = new Set<T>()
for (const set of sets) {
for (const item of set) {
result.add(item)
}
}
return result
}
/**
* Whether a value can be matched exactly by an index lookup, i.e. the index
* result for it is not a superset that the caller must re-filter.
*
* Only `null`/`undefined` are inexact: the WHERE evaluator's three-valued logic
* makes any comparison against them UNKNOWN, yet a BTree index stores and
* returns rows with nullish keys (they sort to the nulls end), so a result that
* could include such rows must be re-filtered.
*
* `NaN` and invalid Dates are exact: under the engine's PostgreSQL float
* semantics they are equal to themselves and ordered (greatest non-null value),
* so the evaluator and the index agree on them and no re-filtering is needed.
*/
function isExactComparisonValue(value: unknown): boolean {
return value != null
}
/**
* Whether the collection orders strings using locale collation.
*
* Under `stringSort: 'locale'` a BTree string index orders values with
* `localeCompare`, but the WHERE evaluator compares strings with JS relational
* operators (code-point order). For range predicates these orders disagree
* (e.g. `'ö' > 'z'` is true in JS but `'ö'` sorts before `'z'` under locale
* `en`), so an index range lookup can omit matching rows. Such omissions cannot
* be recovered by re-filtering, so locale-backed string range predicates must
* not be index-optimized.
*/
function usesLocaleStringSort(collection: CollectionLike<any, any>): boolean {
const opts = { ...DEFAULT_COMPARE_OPTIONS, ...collection.compareOptions }
return opts.stringSort === `locale`
}
/**
* Whether a range predicate on this operand would use an index ordering that
* differs from the WHERE evaluator's relational operators, so an index range
* lookup could omit genuine matches that re-filtering cannot recover.
*
* The evaluator compares with JS relational operators (extended with
* PostgreSQL float semantics for `NaN`/invalid Dates). That order matches the
* index comparator for numbers, booleans, bigints, lexically-sorted strings,
* Dates (valid, ordered by time; invalid, ordered as the greatest value) and
* `NaN`. It diverges for locale-sorted strings (localeCompare vs code-point
* order) and for arrays, plain objects, Temporal values and typed arrays
* (recursive/identity ordering vs string coercion).
*
* Note: `null`/`undefined` operands are not handled here — those are superset
* cases handled by re-filtering ({@link isExactComparisonValue}).
*/
function isRangeOrderingDivergent(
value: unknown,
collection: CollectionLike<any, any>,
): boolean {
switch (typeof value) {
case `number`:
case `bigint`:
case `boolean`:
return false
case `string`:
return usesLocaleStringSort(collection)
case `object`: {
if (value === null) return false
// Dates order consistently with the evaluator: valid Dates by time, and
// invalid Dates as the greatest value under PostgreSQL float semantics.
return !(value instanceof Date)
}
default:
return false
}
}
/**
* Whether a range predicate (gt/gte/lt/lte) on this operand can be safely
* served by the given index: the operand's domain must order the same way the
* index does, and the index itself must support trustworthy range traversal
* (no custom comparator).
*/
function canRangeOptimize(
value: unknown,
index: IndexInterface<any>,
collection: CollectionLike<any, any>,
): boolean {
return (
!isRangeOrderingDivergent(value, collection) &&
index.supportsRangeOptimization
)
}
/**
* Optimizes a query expression using available indexes to find matching keys
*/
export function optimizeExpressionWithIndexes<
T extends object,
TKey extends string | number,
>(
expression: BasicExpression,
collection: CollectionLike<T, TKey>,
): OptimizationResult<TKey> {
return optimizeQueryRecursive(expression, collection)
}
/**
* Recursively optimizes query expressions
*/
function optimizeQueryRecursive<T extends object, TKey extends string | number>(
expression: BasicExpression,
collection: CollectionLike<T, TKey>,
): OptimizationResult<TKey> {
if (expression.type === `func`) {
switch (expression.name) {
case `eq`:
case `gt`:
case `gte`:
case `lt`:
case `lte`:
return optimizeSimpleComparison(expression, collection)
case `and`:
return optimizeAndExpression(expression, collection)
case `or`:
return optimizeOrExpression(expression, collection)
case `in`:
return optimizeInArrayExpression(expression, collection)
}
}
return { canOptimize: false, matchingKeys: new Set(), isExact: false }
}
/**
* Checks if an expression can be optimized
*/
export function canOptimizeExpression<
T extends object,
TKey extends string | number,
>(expression: BasicExpression, collection: CollectionLike<T, TKey>): boolean {
if (expression.type === `func`) {
switch (expression.name) {
case `eq`:
case `gt`:
case `gte`:
case `lt`:
case `lte`:
return canOptimizeSimpleComparison(expression, collection)
case `and`:
return canOptimizeAndExpression(expression, collection)
case `or`:
return canOptimizeOrExpression(expression, collection)
case `in`:
return canOptimizeInArrayExpression(expression, collection)
}
}
return false
}
/**
* Result of compound range optimization, including which AND arguments
* were covered by the range query so the caller can process the rest.
*/
interface CompoundRangeResult<TKey> extends OptimizationResult<TKey> {
coveredArgIndices: Set<number>
}
/**
* Optimizes compound range queries on the same field
* Example: WHERE age > 5 AND age < 10
*/
function optimizeCompoundRangeQuery<
T extends object,
TKey extends string | number,
>(
expression: BasicExpression,
collection: CollectionLike<T, TKey>,
): CompoundRangeResult<TKey> {
if (expression.type !== `func` || expression.args.length < 2) {
return {
canOptimize: false,
matchingKeys: new Set(),
isExact: false,
coveredArgIndices: new Set(),
}
}
// Group range operations by field
const fieldOperations = new Map<
string,
Array<{
operation: `gt` | `gte` | `lt` | `lte`
value: any
argIndex: number
}>
>()
// Collect all range operations from AND arguments
for (const [argIndex, arg] of expression.args.entries()) {
if (arg.type === `func` && [`gt`, `gte`, `lt`, `lte`].includes(arg.name)) {
const rangeOp = arg as any
if (rangeOp.args.length === 2) {
const leftArg = rangeOp.args[0]!
const rightArg = rangeOp.args[1]!
// Check both directions: field op value AND value op field
let fieldArg: BasicExpression | null = null
let valueArg: BasicExpression | null = null
let operation = rangeOp.name as `gt` | `gte` | `lt` | `lte`
if (leftArg.type === `ref` && rightArg.type === `val`) {
// field op value
fieldArg = leftArg
valueArg = rightArg
} else if (leftArg.type === `val` && rightArg.type === `ref`) {
// value op field - need to flip the operation
fieldArg = rightArg
valueArg = leftArg
// Flip the operation for reverse comparison
switch (operation) {
case `gt`:
operation = `lt`
break
case `gte`:
operation = `lte`
break
case `lt`:
operation = `gt`
break
case `lte`:
operation = `gte`
break
}
}
if (fieldArg && valueArg) {
const fieldPath = (fieldArg as any).path
const fieldKey = fieldPath.join(`.`)
const value = (valueArg as any).value
if (!fieldOperations.has(fieldKey)) {
fieldOperations.set(fieldKey, [])
}
fieldOperations.get(fieldKey)!.push({ operation, value, argIndex })
}
}
}
}
// Check if we have multiple operations on the same field
for (const [fieldKey, operations] of fieldOperations) {
if (operations.length >= 2) {
const fieldPath = fieldKey.split(`.`)
const index = findIndexForField(collection, fieldPath)
// Only collapse this field into a range query when every bound's domain
// orders the same way the index does and the index supports trustworthy
// range traversal. Otherwise the index may omit matching rows that
// re-filtering cannot recover, so leave the field for a full scan.
if (
index &&
operations.some((op) => !canRangeOptimize(op.value, index, collection))
) {
continue
}
if (index && index.supports(`gt`) && index.supports(`lt`)) {
// Compare values with the same semantics the index uses (dates,
// locale strings, ...), in ascending order since bounds are about
// value order regardless of the index direction
const compare = makeComparator({
...DEFAULT_COMPARE_OPTIONS,
...collection.compareOptions,
direction: `asc`,
})
// Build range query options, keeping the strictest bound on each
// side: a larger lower bound (or smaller upper bound) wins, and at
// equal values the exclusive operation wins over the inclusive one.
// `hasFromBound`/`hasToBound` track whether a bound was selected,
// separately from the bound value (which may legitimately be falsy).
let from: any = undefined
let to: any = undefined
let hasFromBound = false
let hasToBound = false
let fromInclusive = true
let toInclusive = true
// A comparison against null/undefined is never true, but in an index
// nullish values sort to the nulls end, so a range query cannot
// represent such a bound. Track it and force a re-filter instead of
// claiming the result is exact. (NaN/invalid Dates are ordered and
// comparable under PostgreSQL semantics, so they are real bounds.)
let hasNonComparableBound = false
for (const { operation, value } of operations) {
if (!isExactComparisonValue(value)) {
hasNonComparableBound = true
continue
}
switch (operation) {
case `gt`:
case `gte`: {
const cmp = hasFromBound ? compare(value, from) : 1
if (cmp > 0) {
from = value
hasFromBound = true
fromInclusive = operation === `gte`
} else if (cmp === 0 && operation === `gt`) {
fromInclusive = false
}
break
}
case `lt`:
case `lte`: {
const cmp = hasToBound ? compare(value, to) : -1
if (cmp < 0) {
to = value
hasToBound = true
toInclusive = operation === `lte`
} else if (cmp === 0 && operation === `lt`) {
toInclusive = false
}
break
}
}
}
// Only pass the bounds that were selected: rangeQuery distinguishes
// an absent bound (open-ended) from an explicitly provided one
const rangeOptions: Record<string, any> = {}
if (hasFromBound) {
rangeOptions.from = from
rangeOptions.fromInclusive = fromInclusive
}
if (hasToBound) {
rangeOptions.to = to
rangeOptions.toInclusive = toInclusive
}
const matchingKeys = (index as any).rangeQuery(rangeOptions)
return {
canOptimize: true,
matchingKeys,
// The range result is exact only when it cannot include rows with a
// nullish indexed value (which a comparison would reject but the
// index returns, as they sort as the smallest key). That requires a
// non-nullish lower bound to exclude them: without `hasFromBound`
// the range is open at the bottom and captures those rows, and a
// non-comparable bound value (`hasNonComparableBound`) can never
// bound them out.
isExact: hasFromBound && !hasNonComparableBound,
coveredArgIndices: new Set(operations.map((op) => op.argIndex)),
}
}
}
}
return {
canOptimize: false,
matchingKeys: new Set(),
isExact: false,
coveredArgIndices: new Set(),
}
}
/**
* Optimizes simple comparison expressions (eq, gt, gte, lt, lte)
*/
function optimizeSimpleComparison<
T extends object,
TKey extends string | number,
>(
expression: BasicExpression,
collection: CollectionLike<T, TKey>,
): OptimizationResult<TKey> {
if (expression.type !== `func` || expression.args.length !== 2) {
return { canOptimize: false, matchingKeys: new Set(), isExact: false }
}
const leftArg = expression.args[0]!
const rightArg = expression.args[1]!
// Check both directions: field op value AND value op field
let fieldArg: BasicExpression | null = null
let valueArg: BasicExpression | null = null
let operation = expression.name as `eq` | `gt` | `gte` | `lt` | `lte`
if (leftArg.type === `ref` && rightArg.type === `val`) {
// field op value
fieldArg = leftArg
valueArg = rightArg
} else if (leftArg.type === `val` && rightArg.type === `ref`) {
// value op field - need to flip the operation
fieldArg = rightArg
valueArg = leftArg
// Flip the operation for reverse comparison
switch (operation) {
case `gt`:
operation = `lt`
break
case `gte`:
operation = `lte`
break
case `lt`:
operation = `gt`
break
case `lte`:
operation = `gte`
break
// eq stays the same
}
}
if (fieldArg && valueArg) {
const fieldPath = (fieldArg as any).path
const index = findIndexForField(collection, fieldPath)
if (index) {
const queryValue = (valueArg as any).value
// Map operation to IndexOperation enum
const indexOperation = operation as IndexOperation
// Check if the index supports this operation
if (!index.supports(indexOperation)) {
return { canOptimize: false, matchingKeys: new Set(), isExact: false }
}
// A range op can only use the index when the operand's domain orders the
// same way the index does and the index supports trustworthy traversal.
// Otherwise the index may omit matching rows, which re-filtering cannot
// recover, so fall back to a full scan.
if (
(operation === `gt` ||
operation === `gte` ||
operation === `lt` ||
operation === `lte`) &&
!canRangeOptimize(queryValue, index, collection)
) {
return { canOptimize: false, matchingKeys: new Set(), isExact: false }
}
const matchingKeys = index.lookup(indexOperation, queryValue)
// A comparison against a nullish value is never true, but BTree indexes
// store and return rows with nullish keys (they sort to the nulls end).
// Determine whether the index result is exact or a superset that the
// caller must re-filter:
// - eq/gt/gte: a nullish query value matches nothing while the index
// still returns nullish-keyed rows -> inexact. A non-nullish lower
// bound (gt/gte) excludes those bottom-sorted rows, so they stay exact.
// - lt/lte: the open lower bound always includes nullish-keyed rows,
// so the result is conservatively inexact.
// NaN/invalid Dates are exact here: under PostgreSQL float semantics the
// evaluator and the index agree on them (equal to self, greatest).
const isExact =
operation === `lt` || operation === `lte`
? false
: isExactComparisonValue(queryValue)
return { canOptimize: true, matchingKeys, isExact }
}
}
return { canOptimize: false, matchingKeys: new Set(), isExact: false }
}
/**
* Checks if a simple comparison can be optimized
*/
function canOptimizeSimpleComparison<
T extends object,
TKey extends string | number,
>(expression: BasicExpression, collection: CollectionLike<T, TKey>): boolean {
if (expression.type !== `func` || expression.args.length !== 2) {
return false
}
const leftArg = expression.args[0]!
const rightArg = expression.args[1]!
// Check both directions: field op value AND value op field
let fieldPath: Array<string> | null = null
if (leftArg.type === `ref` && rightArg.type === `val`) {
fieldPath = (leftArg as any).path
} else if (leftArg.type === `val` && rightArg.type === `ref`) {
fieldPath = (rightArg as any).path
}
if (fieldPath) {
const index = findIndexForField(collection, fieldPath)
return index !== undefined
}
return false
}
/**
* Optimizes AND expressions
*/
function optimizeAndExpression<T extends object, TKey extends string | number>(
expression: BasicExpression,
collection: CollectionLike<T, TKey>,
): OptimizationResult<TKey> {
if (expression.type !== `func` || expression.args.length < 2) {
return { canOptimize: false, matchingKeys: new Set(), isExact: false }
}
// First, try to optimize compound range queries on the same field
// (e.g. age > 5 AND age < 10 becomes a single range query)
const compoundRangeResult = optimizeCompoundRangeQuery(expression, collection)
const coveredArgIndices = compoundRangeResult.canOptimize
? compoundRangeResult.coveredArgIndices
: new Set<number>()
const results: Array<OptimizationResult<TKey>> = []
if (compoundRangeResult.canOptimize) {
results.push(compoundRangeResult)
}
// Try to optimize the remaining conjuncts, keep the optimizable ones.
// Conjuncts that cannot use an index make the result inexact: the
// intersection is then a superset of the true result and must be
// re-filtered against the full expression by the caller. The compound
// range result may itself be inexact (e.g. a null/undefined bound).
let allConjunctsExact = !compoundRangeResult.canOptimize
? true
: compoundRangeResult.isExact
for (const [argIndex, arg] of expression.args.entries()) {
if (coveredArgIndices.has(argIndex)) {
continue
}
const result = optimizeQueryRecursive(arg, collection)
if (result.canOptimize) {
results.push(result)
if (!result.isExact) {
allConjunctsExact = false
}
} else {
allConjunctsExact = false
}
}
if (results.length > 0) {
// Use intersectSets utility for AND logic
const allMatchingSets = results.map((r) => r.matchingKeys)
const intersectedKeys = intersectSets(allMatchingSets)
return {
canOptimize: true,
matchingKeys: intersectedKeys,
isExact: allConjunctsExact,
}
}
return { canOptimize: false, matchingKeys: new Set(), isExact: false }
}
/**
* Checks if an AND expression can be optimized
*/
function canOptimizeAndExpression<
T extends object,
TKey extends string | number,
>(expression: BasicExpression, collection: CollectionLike<T, TKey>): boolean {
if (expression.type !== `func` || expression.args.length < 2) {
return false
}
// If any argument can be optimized, we can gain some speedup
return expression.args.some((arg) => canOptimizeExpression(arg, collection))
}
/**
* Optimizes OR expressions
*/
function optimizeOrExpression<T extends object, TKey extends string | number>(
expression: BasicExpression,
collection: CollectionLike<T, TKey>,
): OptimizationResult<TKey> {
if (expression.type !== `func` || expression.args.length < 2) {
return { canOptimize: false, matchingKeys: new Set(), isExact: false }
}
const results: Array<OptimizationResult<TKey>> = []
// Every disjunct must be optimizable: rows matched only by a disjunct
// that cannot use an index would be missing from the union, and no
// post-filtering can recover them. In that case fall back to a full scan.
for (const arg of expression.args) {
const result = optimizeQueryRecursive(arg, collection)
if (!result.canOptimize) {
return { canOptimize: false, matchingKeys: new Set(), isExact: false }
}
results.push(result)
}
// Use unionSets utility for OR logic
const allMatchingSets = results.map((r) => r.matchingKeys)
const unionedKeys = unionSets(allMatchingSets)
return {
canOptimize: true,
matchingKeys: unionedKeys,
// An inexact (superset) disjunct makes the union a superset as well
isExact: results.every((r) => r.isExact),
}
}
/**
* Checks if an OR expression can be optimized
*/
function canOptimizeOrExpression<
T extends object,
TKey extends string | number,
>(expression: BasicExpression, collection: CollectionLike<T, TKey>): boolean {
if (expression.type !== `func` || expression.args.length < 2) {
return false
}
// Every disjunct must be optimizable, otherwise the union would miss
// rows matched only by the non-optimizable disjuncts
return expression.args.every((arg) => canOptimizeExpression(arg, collection))
}
/**
* Optimizes IN array expressions
*/
function optimizeInArrayExpression<
T extends object,
TKey extends string | number,
>(
expression: BasicExpression,
collection: CollectionLike<T, TKey>,
): OptimizationResult<TKey> {
if (expression.type !== `func` || expression.args.length !== 2) {
return { canOptimize: false, matchingKeys: new Set(), isExact: false }
}
const fieldArg = expression.args[0]!
const arrayArg = expression.args[1]!
if (
fieldArg.type === `ref` &&
arrayArg.type === `val` &&
Array.isArray((arrayArg as any).value)
) {
const fieldPath = (fieldArg as any).path
const values = (arrayArg as any).value
const index = findIndexForField(collection, fieldPath)
// A nullish or NaN member can never be matched by `IN` (a comparison
// against null/undefined/NaN is never true), but the index would still
// return rows with such an indexed value. When the list contains one of
// those the result is a superset that the caller must re-filter.
const isExact = values.every((value: any) => isExactComparisonValue(value))
if (index) {
// Check if the index supports IN operation
if (index.supports(`in`)) {
const matchingKeys = index.lookup(`in`, values)
return { canOptimize: true, matchingKeys, isExact }
} else if (index.supports(`eq`)) {
// Fallback to multiple equality lookups
const matchingKeys = new Set<TKey>()
for (const value of values) {
const keysForValue = index.lookup(`eq`, value)
for (const key of keysForValue) {
matchingKeys.add(key)
}
}
return { canOptimize: true, matchingKeys, isExact }
}
}
}
return { canOptimize: false, matchingKeys: new Set(), isExact: false }
}
/**
* Checks if an IN array expression can be optimized
*/
function canOptimizeInArrayExpression<
T extends object,
TKey extends string | number,
>(expression: BasicExpression, collection: CollectionLike<T, TKey>): boolean {
if (expression.type !== `func` || expression.args.length !== 2) {
return false
}
const fieldArg = expression.args[0]!
const arrayArg = expression.args[1]!
if (
fieldArg.type === `ref` &&
arrayArg.type === `val` &&
Array.isArray((arrayArg as any).value)
) {
const fieldPath = (fieldArg as any).path
const index = findIndexForField(collection, fieldPath)
return index !== undefined
}
return false
}