UNPKG

@asktext/core

Version:

Core embedding and vector store utilities for AskText voice Q&A.

139 lines (133 loc) 3.7 kB
// src/TextSplitter.ts
/**
 * Split `text` into fixed-size character windows that overlap their
 * predecessor by `overlap` characters.
 *
 * Chunking stops as soon as a window reaches the end of the text, so the
 * result never ends with a chunk that is entirely contained in the one
 * before it.
 *
 * @param {string} text - Text to split; an empty string yields `[]`.
 * @param {number} [maxLen=1500] - Maximum characters per chunk.
 * @param {number} [overlap=200] - Characters shared between consecutive chunks.
 * @returns {{content: string, startChar: number, endChar: number, chunkIndex: number}[]}
 *   Chunks in document order; `endChar` is exclusive.
 * @throws {Error} If `overlap` is negative or not smaller than `maxLen`.
 */
function chunkText(text, maxLen = 1500, overlap = 200) {
  const chunks = [];
  if (overlap >= maxLen) throw new Error("overlap must be smaller than maxLen");
  // A negative overlap makes the step larger than the window, which would
  // silently skip the text between two windows.
  if (overlap < 0) throw new Error("overlap must not be negative");
  const step = maxLen - overlap;
  let index = 0;
  for (let start = 0; start < text.length; start += step) {
    const end = Math.min(start + maxLen, text.length);
    chunks.push({
      content: text.slice(start, end),
      startChar: start,
      endChar: end,
      chunkIndex: index
    });
    index += 1;
    // This window already covers the tail; another step would only emit a
    // slice of the overlap region that is already part of this chunk.
    if (end === text.length) break;
  }
  return chunks;
}

// src/Embedder.ts
import OpenAI from "openai";

/**
 * Embedder backed by the OpenAI embeddings endpoint.
 *
 * Constructed from `{ apiKey, model? }`; `model` defaults to
 * "text-embedding-3-small".
 */
var OpenAIEmbedder = class {
  constructor(opts) {
    this.openai = new OpenAI({ apiKey: opts.apiKey });
    this.model = opts.model ?? "text-embedding-3-small";
  }

  /**
   * Embed multiple texts in a single API call.
   *
   * @param {string[]} texts - Inputs to embed.
   * @returns {Promise<number[][]>} One embedding per input, in input order.
   */
  async embed(texts) {
    // Nothing to embed: skip the request instead of sending an empty input list.
    if (Array.isArray(texts) && texts.length === 0) return [];
    const resp = await this.openai.embeddings.create({
      model: this.model,
      input: texts
    });
    // Order by the `index` each item carries rather than relying on the
    // response array order matching the input order.
    return [...resp.data]
      .sort((a, b) => a.index - b.index)
      .map((d) => d.embedding);
  }

  /**
   * Embed a single text string.
   *
   * @param {string} text - Input to embed.
   * @returns {Promise<number[]>} The embedding of `text`.
   */
  async embedOne(text) {
    const [vec] = await this.embed([text]);
    return vec;
  }
};

// src/Retriever.ts
/**
 * Embed `query` and return the best-matching stored passages.
 *
 * @param {object} params
 * @param {string} params.query - Question text; embedded with `embedder.embedOne`.
 * @param {object} params.store - Object exposing `query({ vector, topK, filter })`.
 * @param {number} [params.topK=4] - Maximum number of passages requested from the store.
 * @param {object} params.embedder - Object exposing `embedOne(text)`.
 * @param {object} [params.filter] - Passed through to `store.query` unchanged.
 * @returns {Promise<{id: *, score: number, content: *}[]>} Store results reduced
 *   to id, score and `metadata.content` (undefined when a result has no metadata).
 */
async function retrievePassages({ query, store, topK = 4, embedder, filter }) {
  const vector = await embedder.embedOne(query);
  const results = await store.query({ vector, topK, filter });
  return results.map((r) => ({
    id: r.id,
    score: r.score,
    content: r.metadata?.content
  }));
}

// src/utils.ts
/**
 * Reduce HTML (or Markdown containing inline HTML) to plain text.
 *
 * Comments and script/style elements are removed together with their
 * contents, every remaining tag is replaced by a space, and whitespace runs
 * are collapsed. Character entities such as `&amp;` are left untouched.
 *
 * @param {string} html - Markup to strip.
 * @returns {string} Trimmed, single-spaced text.
 */
function stripHtml(html) {
  return html
    // Comments first: the generic tag pattern below would end a comment at
    // the first ">" inside it and leak the remainder into the text.
    .replace(/<!--[\s\S]*?-->/g, " ")
    // Script/style bodies are code, not prose, so drop element and contents.
    .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, " ")
    .replace(/<[^>]*>/g, " ")
    .replace(/\s+/g, " ")
    .trim();
}

// src/PrismaJsonStore.ts
/**
 * Cosine similarity of two numeric vectors, iterating over the length of `a`.
 *
 * @param {number[]} a
 * @param {number[]} b - Expected to have the same length as `a`.
 * @returns {number} dot(a, b) / (|a| * |b|), or 0 when either vector has
 *   zero magnitude (the ratio would otherwise be NaN).
 */
function cosineSimilarity(a, b) {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] ** 2;
    normB += b[i] ** 2;
  }
  const denom = Math.sqrt(normA) * Math.sqrt(normB);
  return denom === 0 ? 0 : dot / denom;
}

// Metadata keys that are written to same-named `articleChunk` fields.
const CHUNK_COLUMNS = ["postId", "chunkIndex", "content", "startChar", "endChar"];

/**
 * Build a vector store on top of a Prisma client's `articleChunk` model,
 * keeping each embedding as a JSON string and ranking in process.
 *
 * @param {object} prisma - Prisma client exposing `articleChunk.upsert` and
 *   `articleChunk.findMany`.
 * @returns {{upsertEmbedding: Function, query: Function}} The store.
 */
function createPrismaJsonStore(prisma) {
  return {
    /**
     * Insert or update the chunk identified by `id`.
     *
     * @param {object} params
     * @param {string} params.id - Chunk id.
     * @param {number[]} params.vector - Embedding; stored via JSON.stringify.
     * @param {object} [params.metadata] - May carry postId, chunkIndex,
     *   content, startChar and endChar.
     * @returns {Promise<void>}
     */
    async upsertEmbedding({ id, vector, metadata }) {
      const embedding = JSON.stringify(vector);
      // Refresh the same fields the create branch writes, so re-embedding an
      // edited chunk does not leave its old `content` next to the new vector
      // (`query` returns that field). Only keys actually supplied are touched.
      const columns = Object.fromEntries(
        CHUNK_COLUMNS
          .filter((key) => metadata?.[key] !== undefined)
          .map((key) => [key, metadata[key]])
      );
      await prisma.articleChunk.upsert({
        where: { id },
        update: {
          embedding,
          ...columns,
          // NOTE(review): the create branch never writes a `metadata` field;
          // kept as in the original update payload - confirm the model
          // actually defines it.
          ...(metadata ? { metadata } : {})
        },
        create: {
          id,
          postId: metadata?.postId ?? "",
          chunkIndex: metadata?.chunkIndex ?? 0,
          content: metadata?.content ?? "",
          startChar: metadata?.startChar ?? 0,
          endChar: metadata?.endChar ?? 0,
          embedding
        }
      });
    },

    /**
     * Rank stored chunks by cosine similarity to `vector`.
     *
     * All rows matching the filter are loaded and scored in memory.
     *
     * @param {object} params
     * @param {number[]} params.vector - Query embedding.
     * @param {number} [params.topK] - Maximum results; all are returned when omitted.
     * @param {{postId?: string}} [params.filter] - Restricts rows to one post.
     * @returns {Promise<{id: *, score: number, metadata: {content: *}}[]>}
     *   Best matches first.
     */
    async query({ vector, topK, filter }) {
      const where = filter?.postId ? { postId: filter.postId } : {};
      const chunks = await prisma.articleChunk.findMany({
        where,
        select: { id: true, content: true, embedding: true }
      });
      const scored = [];
      for (const c of chunks) {
        const vec = JSON.parse(c.embedding);
        // Rows whose stored value is not a vector of the query's dimension
        // cannot be compared; scoring them would yield NaN or a meaningless
        // number and make the sort below unreliable.
        if (!Array.isArray(vec) || vec.length !== vector.length) continue;
        scored.push({
          id: c.id,
          score: cosineSimilarity(vector, vec),
          metadata: { content: c.content }
        });
      }
      scored.sort((a, b) => b.score - a.score);
      return scored.slice(0, topK);
    }
  };
}

// src/embedAndStore.ts
/**
 * Strip markup from an article, chunk it, embed each chunk and upsert the
 * result into `store` under the id `${articleId}-${chunkIndex}`.
 *
 * Chunks are embedded and stored one at a time, in order.
 *
 * NOTE(review): rows left over from an earlier, longer version of the same
 * article are not removed; only `upsertEmbedding` is called on the store.
 *
 * @param {object} params
 * @param {string} params.articleId - Used as `postId` and as the id prefix.
 * @param {string} params.htmlOrMarkdown - Raw article body.
 * @param {object} params.embedder - Object exposing `embedOne(text)`.
 * @param {object} params.store - Object exposing `upsertEmbedding(...)`.
 * @param {number} [params.maxLen=1500] - Forwarded to `chunkText`.
 * @param {number} [params.overlap=200] - Forwarded to `chunkText`.
 * @returns {Promise<void>}
 */
async function embedAndStore({ articleId, htmlOrMarkdown, embedder, store, maxLen = 1500, overlap = 200 }) {
  const plain = stripHtml(htmlOrMarkdown);
  const chunks = chunkText(plain, maxLen, overlap);
  for (const c of chunks) {
    const vector = await embedder.embedOne(c.content);
    await store.upsertEmbedding({
      id: `${articleId}-${c.chunkIndex}`,
      vector,
      metadata: { ...c, postId: articleId }
    });
  }
}
embedAndStore.createPrismaJsonStore = createPrismaJsonStore;

export {
  OpenAIEmbedder,
  chunkText,
  createPrismaJsonStore,
  embedAndStore,
  retrievePassages,
  stripHtml
};