UNPKG

@mastra/core

Version:

Mastra is a framework for building AI-powered applications and agents with a modern TypeScript stack.

256 lines 8.13 kB
/**
 * SearchEngine - Unified search engine supporting BM25, vector, and hybrid search.
 *
 * Provides search capabilities for Workspace, enabling keyword-based (BM25),
 * semantic (vector), and combined hybrid search across indexed content.
 *
 * This is a declaration file: it describes the public types only. Runtime
 * behavior noted below is taken from the declared signatures and the original
 * doc comments; anything marked NOTE(review) should be confirmed against the
 * implementation.
 */
import type { MastraVector } from '../../vector/index.js';
import type { LineRange } from '../line-utils.js';
import { BM25Index } from './bm25.js';
import type { BM25Config, TokenizeOptions } from './bm25.js';

/**
 * Search mode options: semantic (`'vector'`), keyword-based (`'bm25'`), or a
 * combination of both (`'hybrid'`).
 */
export type SearchMode = 'vector' | 'bm25' | 'hybrid';

/**
 * Single-text embedder - takes one text and returns its embedding.
 *
 * This is the legacy embedder shape and remains the default. Each document is
 * embedded with a separate call.
 */
export interface SingleEmbedder {
  (text: string): Promise<number[]>;
}

/**
 * Batch-capable embedder - takes an array of texts and returns their embeddings
 * in the same order.
 *
 * Branded with `batch: true` so {@link SearchEngine} can detect batch support at
 * runtime and dispatch to a single batched embedder call instead of one call
 * per document. This dramatically speeds up large index rebuilds against
 * providers that support batch embedding (e.g. OpenAI's `embedMany`).
 *
 * @example
 * ```ts
 * import { embedMany } from 'ai';
 * import { openai } from '@ai-sdk/openai';
 *
 * const model = openai.embedding('text-embedding-3-small');
 * const embedder: BatchEmbedder = Object.assign(
 *   async (texts: string[]) => {
 *     const { embeddings } = await embedMany({ model, values: texts });
 *     return embeddings;
 *   },
 *   { batch: true as const, maxBatchSize: 2048 },
 * );
 * ```
 */
export interface BatchEmbedder {
  (texts: string[]): Promise<number[][]>;
  /** Brand that marks this embedder as batch-capable. */
  readonly batch: true;
  /**
   * Maximum number of texts the underlying provider accepts per call. When
   * unset, all pending texts are sent in a single request.
   */
  readonly maxBatchSize?: number;
}

/**
 * Embedder interface - either a legacy single-text embedder or a batch-capable
 * embedder branded with `batch: true`.
 */
export type Embedder = SingleEmbedder | BatchEmbedder;

/**
 * Type guard: returns true when the embedder is the batch-capable variant.
 *
 * On a `true` result the argument is narrowed to {@link BatchEmbedder}.
 * NOTE(review): presumably this tests the `batch` brand; the implementation is
 * not visible in this declaration file.
 *
 * @param embedder - Embedder of either shape.
 * @returns Whether `embedder` is a {@link BatchEmbedder}.
 */
export declare function isBatchEmbedder(embedder: Embedder): embedder is BatchEmbedder;

/**
 * Configuration for vector search
 */
export interface VectorConfig {
  /** Vector store for semantic search */
  vectorStore: MastraVector;
  /** Embedder function for generating vectors (single-text or batch-capable) */
  embedder: Embedder;
  /** Index name for the vector store */
  indexName: string;
}

/**
 * Configuration for BM25 search
 */
export interface BM25SearchConfig {
  /** BM25 algorithm parameters */
  bm25?: BM25Config;
  /** Tokenization options */
  tokenize?: TokenizeOptions;
}

/**
 * A document to be indexed
 */
export interface IndexDocument {
  /** Unique identifier for this document */
  id: string;
  /** Text content to index */
  content: string;
  /** Optional metadata to store with the document */
  metadata?: Record<string, unknown>;
  /**
   * For chunked documents: the starting line number of this chunk in the original document.
   * When provided, lineRange in search results will be adjusted to reflect original document lines.
   * (1-indexed)
   */
  startLineOffset?: number;
}

/**
 * Base search result with common fields
 */
export interface SearchResult {
  /** Document identifier */
  id: string;
  /** Document content */
  content: string;
  /** Search score (0-1 for normalized results) */
  score: number;
  /** Line range where query terms appear */
  lineRange?: LineRange;
  /** Optional metadata */
  metadata?: Record<string, unknown>;
  /** Score breakdown by search type; each component is optional */
  scoreDetails?: {
    vector?: number;
    bm25?: number;
  };
}

/**
 * Options for searching
 */
export interface SearchOptions {
  /** Maximum number of results to return */
  topK?: number;
  /** Minimum score threshold */
  minScore?: number;
  /** Search mode: 'bm25', 'vector', or 'hybrid' */
  mode?: SearchMode;
  /** Weight for vector scores in hybrid search (0-1, default 0.5) */
  vectorWeight?: number;
  /** Filter for vector search */
  filter?: Record<string, unknown>;
}

/** Options for batch indexing */
export interface IndexManyOptions {
  /**
   * Maximum number of documents to index concurrently (embedder + vector upsert).
   * Must be a safe integer ≥ 1 (same rule as `p-map`).
   * @default 8
   */
  concurrency?: number;
  /**
   * When `true` (default), the first rejected `index` rejects the whole `indexMany` call.
   * When `false`, all documents are processed; if any failed, the promise rejects with an `AggregateError`.
   */
  stopOnError?: boolean;
}

/**
 * Configuration for SearchEngine
 */
export interface SearchEngineConfig {
  /** BM25 configuration (enables BM25 search) */
  bm25?: BM25SearchConfig;
  /** Vector configuration (enables vector search) */
  vector?: VectorConfig;
  /** Whether to use lazy vector indexing (default: false = eager) */
  lazyVectorIndex?: boolean;
}

/**
 * Options for {@link splitIntoChunks}.
 *
 * NOTE(review): default values are not visible in this declaration file.
 */
export interface ChunkOptions {
  /** Character budget that each chunk should stay within. */
  maxChunkChars?: number;
  /** Number of lines shared between adjacent chunks. */
  overlapLines?: number;
}

/**
 * One chunk produced by {@link splitIntoChunks}.
 */
export interface TextChunk {
  /** Text of the chunk. */
  content: string;
  /**
   * Line at which this chunk starts.
   * NOTE(review): presumably a 1-indexed line number in the original text,
   * matching `IndexDocument.startLineOffset` - confirm against the implementation.
   */
  startLine: number;
}

/**
 * Split text into line-based chunks that stay within a character budget.
 *
 * Each chunk is formed by accumulating whole lines until adding the next line
 * would exceed `maxChunkChars`. Adjacent chunks share `overlapLines` lines so
 * that context around chunk boundaries is preserved for embedding quality.
 *
 * Returns the original text as a single chunk when it already fits.
 *
 * @param text - Text to split.
 * @param options - Optional chunk size and overlap settings.
 * @returns The chunks, each with its content and starting line.
 */
export declare function splitIntoChunks(text: string, options?: ChunkOptions): TextChunk[];

/**
 * Unified search engine supporting BM25, vector, and hybrid search.
 *
 * Used internally by Workspace to provide consistent search functionality.
 *
 * @example
 * ```typescript
 * const engine = new SearchEngine({
 *   bm25: { tokenize: { lowercase: true } },
 *   vector: { vectorStore, embedder, indexName: 'my-index' },
 * });
 *
 * // Index documents
 * await engine.index({ id: 'doc1', content: 'Hello world' });
 *
 * // Search
 * const results = await engine.search('hello', { mode: 'hybrid', topK: 5 });
 * ```
 */
export declare class SearchEngine {
  // Marker emitted for ECMAScript private fields; internal state is not part of the public type.
  #private;

  /**
   * Create a search engine.
   *
   * @param config - Optional configuration; the `bm25` and `vector` entries
   * enable the corresponding search types.
   */
  constructor(config?: SearchEngineConfig);

  /**
   * Index a document for search
   *
   * @param doc - Document to index.
   */
  index(doc: IndexDocument): Promise<void>;

  /**
   * Index multiple documents (up to `concurrency` at a time when async vector work runs).
   *
   * @param docs - Documents to index
   * @param options - `p-map` options; `concurrency` defaults to 8
   */
  indexMany(docs: IndexDocument[], options?: IndexManyOptions): Promise<void>;

  /**
   * Remove a document from the index
   *
   * @param id - Identifier of the document to remove.
   */
  remove(id: string): Promise<void>;

  /**
   * Remove all documents whose ID starts with the given prefix.
   * Used to remove all chunks belonging to a single source document.
   *
   * @param prefix - ID prefix to match.
   */
  removeByPrefix(prefix: string): Promise<void>;

  /**
   * Remove a source document and all of its chunked variants.
   *
   * This also attempts a metadata-based bulk delete for chunk vectors so stale
   * chunk IDs from previous process runs are cleaned up in persistent stores.
   *
   * @param sourceId - Identifier of the source document.
   */
  removeSource(sourceId: string): Promise<void>;

  /**
   * Clear all indexed documents
   *
   * Declared synchronous (returns `void`), unlike the other mutating methods.
   * NOTE(review): whether entries in the external vector store are also
   * removed is not visible from this declaration - confirm.
   */
  clear(): void;

  /**
   * Search for documents
   *
   * @param query - Query text.
   * @param options - Optional result limit, score threshold, mode, hybrid weight and vector filter.
   * @returns Matching documents with their scores.
   */
  search(query: string, options?: SearchOptions): Promise<SearchResult[]>;

  /**
   * Check if BM25 search is available
   */
  get canBM25(): boolean;

  /**
   * Check if vector search is available
   */
  get canVector(): boolean;

  /**
   * Check if hybrid search is available
   *
   * NOTE(review): presumably requires both BM25 and vector search to be
   * configured - confirm against the implementation.
   */
  get canHybrid(): boolean;

  /**
   * Get the BM25 index (for serialization/debugging)
   *
   * The declared type allows `undefined`.
   */
  get bm25Index(): BM25Index | undefined;
}
//# sourceMappingURL=search-engine.d.ts.map