@mastra/core
Version:
Mastra is a framework for building AI-powered applications and agents with a modern TypeScript stack.
164 lines • 4.7 kB
TypeScript
/**
* BM25 (Best Matching 25) implementation for keyword-based search.
*
* BM25 is a probabilistic ranking function used for information retrieval.
* It ranks documents based on the query terms appearing in each document,
* taking into account term frequency and document length normalization.
*/
import type { LineRange } from '../line-utils.js';
/**
* Optional tuning parameters for BM25 scoring.
*
* Both fields may be omitted; the documented defaults are `k1 = 1.5` and
* `b = 0.75`. Accepted as the first argument of the `BM25Index` constructor.
*/
export interface BM25Config {
/**
* Controls term frequency saturation.
* Higher values give more weight to term frequency.
* Typical range: 1.2 - 2.0
* @default 1.5
*/
k1?: number;
/**
* Controls document length normalization.
* 0 = no length normalization, 1 = full normalization
* @default 0.75
*/
b?: number;
}
/**
* A document as stored in the BM25 index.
*
* Carries the original content together with pre-computed token data.
* This is the shape returned by `BM25Index.get`.
*/
export interface BM25Document {
/** Document identifier */
id: string;
/** Document content (the original, untokenized text) */
content: string;
/** Pre-computed tokens for the document */
tokens: string[];
/** Token frequency map: term -> number of occurrences in this document */
termFrequencies: Map<string, number>;
/**
* Total number of tokens.
* NOTE(review): presumably equal to `tokens.length` — confirm against the implementation.
*/
length: number;
/** Optional caller-supplied metadata; values are untyped (`unknown`) */
metadata?: Record<string, unknown>;
}
/**
* A single hit from a BM25 search.
*
* `BM25Index.search` returns an array of these.
*/
export interface BM25SearchResult {
/** Document identifier */
id: string;
/** Document content */
content: string;
/** BM25 score (higher is more relevant) */
score: number;
/** Optional metadata associated with the document */
metadata?: Record<string, unknown>;
/**
* Line range where query terms were found (if computed).
* Absent when no range was computed for this result.
*/
lineRange?: LineRange;
}
/**
* Options controlling how text is split and filtered into tokens.
*
* All fields are optional. Only the default for `splitPattern` is documented
* in this declaration; defaults for the other fields are not visible here.
*/
export interface TokenizeOptions {
/** Convert to lowercase */
lowercase?: boolean;
/** Remove punctuation */
removePunctuation?: boolean;
/** Minimum token length */
minLength?: number;
/** Custom stopwords to remove */
stopwords?: Set<string>;
/** Custom split pattern (default: /\s+/) */
splitPattern?: RegExp;
}
/**
* Default English stopwords.
*
* Declared as a plain (mutable) `Set<string>`.
* NOTE(review): presumably used by `tokenize` when `options.stopwords` is not
* supplied — confirm against the implementation.
*/
export declare const DEFAULT_STOPWORDS: Set<string>;
/**
* Tokenize text into an array of terms.
*
* @param text - The text to tokenize
* @param options - Optional tokenization settings (see `TokenizeOptions`)
* @returns The resulting terms as an array of strings
*/
export declare function tokenize(text: string, options?: TokenizeOptions): string[];
/**
* Re-export of line utilities from '../line-utils.js', making them importable
* from this module as well.
*/
export { extractLines, extractLinesWithLimit, formatWithLineNumbers, replaceString, StringNotFoundError, StringNotUniqueError, } from '../line-utils.js';
/**
* Find the line range where query terms appear in content.
*
* The returned range spans from the first to the last line containing any
* query term, so lines in between that contain no term are included too.
*
* @param content - The document content
* @param queryTerms - Already-tokenized query terms to look for
* @param options - Tokenization options (should match the options used at indexing time)
* @returns A `LineRange` if any term is found, `undefined` otherwise
*/
export declare function findLineRange(content: string, queryTerms: string[], options?: TokenizeOptions): LineRange | undefined;
/**
* BM25 index for keyword-based document retrieval.
*
* Documents are added by ID and content, searched with a free-text query, and
* the whole index can be round-tripped through `serialize` / `deserialize`.
*/
export declare class BM25Index {
/** BM25 k1 parameter (term frequency saturation); fixed after construction */
readonly k1: number;
/** BM25 b parameter (document length normalization); fixed after construction */
readonly b: number;
/**
* Create an index.
*
* @param config - Optional BM25 parameters (`k1`, `b`)
* @param tokenizeOptions - Optional tokenization settings
*/
constructor(config?: BM25Config, tokenizeOptions?: TokenizeOptions);
/**
* Add a document to the index.
*
* NOTE(review): behavior when `id` is already present is not visible in this
* declaration — confirm against the implementation.
*
* @param id - Document identifier
* @param content - Document content
* @param metadata - Optional metadata to store with the document
*/
add(id: string, content: string, metadata?: Record<string, unknown>): void;
/**
* Remove a document from the index.
*
* @param id - Identifier of the document to remove
* @returns A boolean — presumably `true` when a document was removed and
* `false` when the ID was not found; confirm against the implementation
*/
remove(id: string): boolean;
/**
* Clear all documents from the index
*/
clear(): void;
/**
* Search for documents matching the query.
*
* NOTE(review): the defaults for `topK` and `minScore`, and the ordering of
* the returned array, are not visible in this declaration — confirm.
*
* @param query - The search query text
* @param topK - Optional limit on the number of results
* @param minScore - Optional score threshold for results
* @returns Matching documents with their BM25 scores
*/
search(query: string, topK?: number, minScore?: number): BM25SearchResult[];
/**
* Get a document by ID.
*
* @param id - Document identifier
* @returns The stored document, or `undefined` (presumably when no document
* has that ID)
*/
get(id: string): BM25Document | undefined;
/**
* Check if a document exists in the index.
*
* @param id - Document identifier
*/
has(id: string): boolean;
/**
* Get the number of documents in the index
*/
get size(): number;
/**
* Get all document IDs
*/
get documentIds(): string[];
/**
* Serialize the index to a JSON-compatible object.
*
* @returns Index data suitable for persistence (see `BM25IndexData`)
*/
serialize(): BM25IndexData;
/**
* Deserialize an index from a JSON object.
*
* `BM25IndexData` carries no tokenization options, so they are passed
* separately here.
* NOTE(review): they should presumably match the options used when the index
* was built — confirm.
*
* @param data - Previously serialized index data
* @param tokenizeOptions - Optional tokenization settings for the restored index
* @returns A new `BM25Index` instance
*/
static deserialize(data: BM25IndexData, tokenizeOptions?: TokenizeOptions): BM25Index;
}
/**
* Serialized document format for persistence.
*
* Has the same fields as `BM25Document`, except that `termFrequencies` is a
* plain `Record` instead of a `Map`. Declared without `export`; it is
* referenced by `BM25IndexData.documents`.
*/
interface SerializedBM25Document {
/** Document identifier */
id: string;
/** Document content */
content: string;
/** Tokens for the document */
tokens: string[];
/** Term -> frequency, as a plain object rather than a `Map` */
termFrequencies: Record<string, number>;
/** Document length (described as the total number of tokens on `BM25Document`) */
length: number;
/** Optional metadata */
metadata?: Record<string, unknown>;
}
/**
* Serialized index data for persistence.
*
* Produced by `BM25Index.serialize` and consumed by `BM25Index.deserialize`.
* Tokenization options are not part of this payload.
*/
export interface BM25IndexData {
/** BM25 k1 parameter of the serialized index */
k1: number;
/** BM25 b parameter of the serialized index */
b: number;
/** The serialized documents */
documents: SerializedBM25Document[];
/**
* Average document length.
* NOTE(review): presumably measured in tokens across `documents` — confirm.
*/
avgDocLength: number;
}
//# sourceMappingURL=bm25.d.ts.map