@asktext/core

Version:

Core embedding and vector store utilities for AskText voice Q&A.

112 lines (101 loc) • 3.55 kB

text/typescript

import { PrismaClient } from '@prisma/client';

// ---------------------------------------------------------------------------
// Vector storage
// ---------------------------------------------------------------------------

/**
 * Backend-agnostic contract for storing embeddings and running similarity
 * searches over them. Implementations may sit on top of different backends
 * (Prisma, external vector databases, etc.).
 */
interface VectorStore {
  /**
   * Insert a new embedding or update an existing one.
   *
   * @param opts Record to write: its `id`, the embedding `vector`, and
   *   optional free-form `metadata`.
   */
  upsertEmbedding(opts: {
    id: string;
    vector: number[];
    metadata?: Record<string, any>;
  }): Promise<void>;

  /**
   * Look up the `topK` stored vectors most similar to `vector`.
   *
   * @param opts Query `vector`, the number of results wanted (`topK`) and an
   *   optional `filter`. A backend that does not support filtering may
   *   ignore `filter`.
   * @returns Matches carrying their `id`, a `score`, and optionally the
   *   stored `metadata`.
   */
  query(opts: {
    vector: number[];
    topK: number;
    filter?: Record<string, any>;
  }): Promise<Array<{
    id: string;
    score: number;
    metadata?: Record<string, any>;
  }>>;
}

// ---------------------------------------------------------------------------
// Text chunking: splitting content into overlapping segments for embedding
// ---------------------------------------------------------------------------

/**
 * A single segment of text together with its position information.
 * This is the element type returned by `chunkText`.
 */
interface TextChunk {
  /** Text of this segment. */
  content: string;
  /** NOTE(review): presumably the start offset in the input text; confirm. */
  startChar: number;
  /** NOTE(review): presumably the end offset in the input text; confirm whether inclusive. */
  endChar: number;
  /** NOTE(review): presumably the ordinal position of this chunk; confirm base (0 or 1). */
  chunkIndex: number;
}

/**
 * Split plain text into overlapping chunks suitable for embedding.
 *
 * @param text Input plain text.
 * @param maxLen Maximum number of characters per chunk (default 1500).
 * @param overlap Number of characters shared between neighbouring chunks so
 *   that context is preserved (default 200).
 * @returns The resulting chunks.
 */
declare function chunkText(text: string, maxLen?: number, overlap?: number): TextChunk[];

// ---------------------------------------------------------------------------
// Embedding
// ---------------------------------------------------------------------------

/** Construction options for `OpenAIEmbedder`. */
interface EmbedderOptions {
  /** API key handed to the embedder. */
  apiKey: string;
  /** Optional model name; the fallback used when omitted is not visible here. */
  model?: string;
}

/** Produces embedding vectors for text via OpenAI. */
declare class OpenAIEmbedder {
  private openai;
  private model;

  /** @param opts API key and optional model selection. */
  constructor(opts: EmbedderOptions);

  /**
   * Embed multiple texts in a single API call.
   *
   * @param texts Strings to embed.
   * @returns Embedding vectors (presumably one per input, in input order;
   *   confirm against the implementation).
   */
  embed(texts: string[]): Promise<number[][]>;

  /**
   * Embed a single text string.
   *
   * @param text String to embed.
   * @returns The embedding vector for `text`.
   */
  embedOne(text: string): Promise<number[]>;
}

// ---------------------------------------------------------------------------
// Retrieval: semantic search over embedded content
// ---------------------------------------------------------------------------

/** Arguments accepted by `retrievePassages`. */
interface RetrieveOptions {
  /** Query text to search for. */
  query: string;
  /** Vector store to search in. */
  store: VectorStore;
  /** Optional number of results; the default when omitted is not visible here. */
  topK?: number;
  /** Embedder passed in alongside the query (presumably used to embed it; confirm). */
  embedder: OpenAIEmbedder;
  /** Optional filter (presumably forwarded to the store; stores may ignore it). */
  filter?: Record<string, any>;
}

/**
 * Retrieve the most semantically similar text passages for a query.
 *
 * @returns Matches with their `id` and `score`; `content` is optional and
 *   may be absent from a result.
 */
declare function retrievePassages({ query, store, topK, embedder, filter }: RetrieveOptions): Promise<Array<{
  id: string;
  score: number;
  content?: string;
}>>;

// ---------------------------------------------------------------------------
// Text processing: content normalization
// ---------------------------------------------------------------------------

/**
 * Remove HTML tags and normalize whitespace from HTML/markdown content.
 *
 * @param html Markup to clean.
 * @returns The cleaned text.
 */
declare function stripHtml(html: string): string;

// ---------------------------------------------------------------------------
// Prisma-based vector store using JSON serialization
// ---------------------------------------------------------------------------

/**
 * Create a `VectorStore` implementation backed by Prisma, with embeddings
 * serialized as JSON.
 *
 * Requires an `ArticleChunk` model with the fields `id`, `postId`,
 * `chunkIndex`, `content`, `startChar`, `endChar` and `embedding`, where
 * `embedding` is stored as a JSON string.
 *
 * @param prisma Prisma client used by the returned store.
 */
declare function createPrismaJsonStore(prisma: PrismaClient): VectorStore;

// ---------------------------------------------------------------------------
// Embed-and-store convenience helper
// ---------------------------------------------------------------------------

/** Arguments accepted by `embedAndStore`. */
interface EmbedAndStoreOptions {
  /** Identifier of the article the content belongs to. */
  articleId: string;
  /** Source content, as HTML or markdown. */
  htmlOrMarkdown: string;
  /** Embedder used for the chunks. */
  embedder: OpenAIEmbedder;
  /** Store that receives the upserted rows. */
  store: VectorStore;
  /** Max characters per chunk (default 1500) */
  maxLen?: number;
  /** Overlap between chunks (default 200) */
  overlap?: number;
}

/**
 * Convenience helper that strips markup, splits the text, embeds every chunk
 * with OpenAI and upserts rows to the given `VectorStore`.
 */
declare function embedAndStore({ articleId, htmlOrMarkdown, embedder, store, maxLen, overlap, }: EmbedAndStoreOptions): Promise<void>;

export { type EmbedderOptions, OpenAIEmbedder, type RetrieveOptions, type TextChunk, type VectorStore, chunkText, createPrismaJsonStore, embedAndStore, retrievePassages, stripHtml };