@asktext/core
Version:
Core embedding and vector store utilities for AskText voice Q&A.
112 lines (101 loc) • 3.55 kB
TypeScript
import { PrismaClient } from '@prisma/client';
/**
* Abstract interface for vector storage and similarity search operations.
* Implementations can use different backends (Prisma, external vector DBs, etc).
*
* Both methods are asynchronous and take a single options object.
*/
interface VectorStore {
/**
* Insert or update an embedding.
*
* `id` is the key of the entry, `vector` is the embedding itself, and
* `metadata` is an optional free-form record supplied alongside it.
* NOTE(review): whether an existing entry is replaced wholesale or merged is
* left to the implementation — this declaration does not specify it.
*/
upsertEmbedding(opts: {
id: string;
vector: number[];
metadata?: Record<string, any>;
}): Promise<void>;
/**
* Return the top-k most similar vectors to `vector`.
* Implementations may ignore the filter if unsupported.
*
* Each result carries the matched entry's `id`, a numeric `score` and,
* optionally, its `metadata`.
* NOTE(review): the similarity metric and the ordering of the returned array
* are not specified by this declaration — confirm per implementation.
*/
query(opts: {
vector: number[];
topK: number;
filter?: Record<string, any>;
}): Promise<Array<{
id: string;
score: number;
metadata?: Record<string, any>;
}>>;
}
/**
* Text chunking utilities for splitting content into overlapping segments suitable for embedding.
*/
/**
* One segment of text; this is the element type of the array returned by `chunkText`.
*/
interface TextChunk {
/** Text of this chunk. */
content: string;
/** Presumably the character offset in the input where this chunk starts — TODO confirm. */
startChar: number;
/** Presumably the character offset where this chunk ends; inclusive vs. exclusive is not visible here — TODO confirm. */
endChar: number;
/** Presumably the position of this chunk within the sequence of chunks; the base (0 or 1) is not visible here — TODO confirm. */
chunkIndex: number;
}
/**
* Splits plain text into overlapping chunks suitable for embedding.
*
* NOTE(review): only the signature is visible here. Behavior for empty `text`,
* or for `overlap` greater than or equal to `maxLen`, should be confirmed
* against the implementation.
*
* @param text Input plain text
* @param maxLen Max characters per chunk (default 1500)
* @param overlap Overlap between chunks to preserve context (default 200)
* @returns The chunks, as an array of `TextChunk`
*/
declare function chunkText(text: string, maxLen?: number, overlap?: number): TextChunk[];
/**
* Constructor options for `OpenAIEmbedder`.
*/
interface EmbedderOptions {
/** API key (required). Presumably an OpenAI API key, given the consuming class — TODO confirm. */
apiKey: string;
/** Optional model name. The default used when omitted is not visible in this declaration. */
model?: string;
}
/**
* Turns text into embedding vectors (`number[]`).
* Constructed from `EmbedderOptions`; the `openai` and `model` members are
* private and untyped in this declaration.
*/
declare class OpenAIEmbedder {
/** Private member; presumably the underlying API client — TODO confirm. */
private openai;
/** Private member; presumably the model selected via `EmbedderOptions.model` — TODO confirm. */
private model;
/** @param opts API key and optional model name */
constructor(opts: EmbedderOptions);
/**
* Embed multiple texts in a single API call.
* Resolves to an array of vectors; presumably one per input, in input order — TODO confirm.
*/
embed(texts: string[]): Promise<number[][]>;
/** Embed a single text string; resolves to a single vector. */
embedOne(text: string): Promise<number[]>;
}
/**
* High-level retrieval utilities for semantic search over embedded content.
*/
/**
* Options object accepted by `retrievePassages`.
*/
interface RetrieveOptions {
/** The query text to search for. */
query: string;
/** The vector store to search. */
store: VectorStore;
/** Optional number of results to return. The default used when omitted is not visible in this declaration. */
topK?: number;
/** Embedder; presumably used to embed `query` before searching — TODO confirm. */
embedder: OpenAIEmbedder;
/** Optional filter; has the same type as the `filter` option of `VectorStore.query`. */
filter?: Record<string, any>;
}
/**
* Retrieve the most semantically similar text passages for a query.
*
* Takes a single `RetrieveOptions` object.
*
* @returns A promise of result entries, each with an `id`, a numeric `score`
* and an optional `content` string. NOTE(review): when `content` is absent is
* not visible in this declaration.
*/
declare function retrievePassages({ query, store, topK, embedder, filter }: RetrieveOptions): Promise<Array<{
id: string;
score: number;
content?: string;
}>>;
/**
* Text processing utilities for content normalization.
*/
/**
* Remove HTML tags and normalize whitespace from HTML/markdown content.
*
* @param html Input markup
* @returns The processed text
*/
declare function stripHtml(html: string): string;
/**
* Prisma-based vector store implementation using JSON serialization.
* Expects an ArticleChunk model with embedding field stored as JSON string.
*/
/**
* Creates a VectorStore implementation using Prisma with JSON-serialized embeddings.
* Requires ArticleChunk model with fields: id, postId, chunkIndex, content, startChar, endChar, embedding.
*
* @param prisma Prisma client used by the returned store
* @returns A `VectorStore` backed by the given Prisma client
*/
declare function createPrismaJsonStore(prisma: PrismaClient): VectorStore;
/**
* Options object accepted by `embedAndStore`.
*/
interface EmbedAndStoreOptions {
/** Identifier of the article. How it maps onto stored row ids is not visible here — TODO confirm. */
articleId: string;
/** The article body as HTML or markdown. */
htmlOrMarkdown: string;
/** Embedder for the article's text. */
embedder: OpenAIEmbedder;
/** Destination vector store. */
store: VectorStore;
/** Max characters per chunk (default 1500) */
maxLen?: number;
/** Overlap between chunks (default 200) */
overlap?: number;
}
/**
* Convenience helper that strips markup, splits the text, embeds every chunk
* with OpenAI and upserts rows to the given VectorStore.
*
* Takes a single `EmbedAndStoreOptions` object.
* NOTE(review): whether chunks previously stored for the same `articleId` are
* removed or overwritten is not visible in this declaration.
*
* @returns A promise with no value
*/
declare function embedAndStore({ articleId, htmlOrMarkdown, embedder, store, maxLen, overlap, }: EmbedAndStoreOptions): Promise<void>;
// Public surface of the package: option/shape types are exported type-only,
// the class and functions as values.
// NOTE(review): `EmbedAndStoreOptions` is declared in this file but is not in
// this list, unlike `EmbedderOptions` and `RetrieveOptions` — confirm whether
// that omission is intentional.
export { type EmbedderOptions, OpenAIEmbedder, type RetrieveOptions, type TextChunk, type VectorStore, chunkText, createPrismaJsonStore, embedAndStore, retrievePassages, stripHtml };