UNPKG

@asktext/core

Version:

Core embedding and vector store utilities for AskText voice Q&A.

181 lines (173 loc) 5.55 kB
"use strict"; var __create = Object.create; var __defProp = Object.defineProperty; var __getOwnPropDesc = Object.getOwnPropertyDescriptor; var __getOwnPropNames = Object.getOwnPropertyNames; var __getProtoOf = Object.getPrototypeOf; var __hasOwnProp = Object.prototype.hasOwnProperty; var __export = (target, all) => { for (var name in all) __defProp(target, name, { get: all[name], enumerable: true }); }; var __copyProps = (to, from, except, desc) => { if (from && typeof from === "object" || typeof from === "function") { for (let key of __getOwnPropNames(from)) if (!__hasOwnProp.call(to, key) && key !== except) __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable }); } return to; }; var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps( // If the importer is in node compatibility mode or this is not an ESM // file that has been converted to a CommonJS file using a Babel- // compatible transform (i.e. "__esModule" has not been set), then set // "default" to the CommonJS "module.exports" for node compatibility. isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target, mod )); var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod); // src/index.ts var src_exports = {}; __export(src_exports, { OpenAIEmbedder: () => OpenAIEmbedder, chunkText: () => chunkText, createPrismaJsonStore: () => createPrismaJsonStore, embedAndStore: () => embedAndStore, retrievePassages: () => retrievePassages, stripHtml: () => stripHtml }); module.exports = __toCommonJS(src_exports); // src/TextSplitter.ts function chunkText(text, maxLen = 1500, overlap = 200) { const chunks = []; if (overlap >= maxLen) throw new Error("overlap must be smaller than maxLen"); const step = maxLen - overlap; let index = 0; for (let start = 0; start < text.length; start += step) { const end = Math.min(start + maxLen, text.length); const content = text.slice(start, end); chunks.push({ content, startChar: start, endChar: end, chunkIndex: index }); index += 1; } return chunks; } // src/Embedder.ts var import_openai = __toESM(require("openai")); var OpenAIEmbedder = class { constructor(opts) { this.openai = new import_openai.default({ apiKey: opts.apiKey }); this.model = opts.model ?? "text-embedding-3-small"; } /** Embed multiple texts in a single API call */ async embed(texts) { const resp = await this.openai.embeddings.create({ model: this.model, input: texts }); return resp.data.map((d) => d.embedding); } /** Embed a single text string */ async embedOne(text) { const [vec] = await this.embed([text]); return vec; } }; // src/Retriever.ts async function retrievePassages({ query, store, topK = 4, embedder, filter }) { const vector = await embedder.embedOne(query); const results = await store.query({ vector, topK, filter }); return results.map((r) => ({ id: r.id, score: r.score, content: r.metadata?.content })); } // src/utils.ts function stripHtml(html) { return html.replace(/<[^>]*>/g, " ").replace(/\s+/g, " ").trim(); } // src/PrismaJsonStore.ts function cosineSimilarity(a, b) { let dot = 0; let normA = 0; let normB = 0; for (let i = 0; i < a.length; i++) { dot += a[i] * b[i]; normA += a[i] ** 2; normB += b[i] ** 2; } return dot / (Math.sqrt(normA) * Math.sqrt(normB)); } function createPrismaJsonStore(prisma) { return { async upsertEmbedding({ id, vector, metadata }) { await prisma.articleChunk.upsert({ where: { id }, update: { embedding: JSON.stringify(vector), ...metadata ? { metadata } : {} }, create: { id, postId: metadata?.postId ?? "", chunkIndex: metadata?.chunkIndex ?? 0, content: metadata?.content ?? "", startChar: metadata?.startChar ?? 0, endChar: metadata?.endChar ?? 0, embedding: JSON.stringify(vector) } }); }, async query({ vector, topK, filter }) { const where = {}; if (filter?.postId) where.postId = filter.postId; const chunks = await prisma.articleChunk.findMany({ where, select: { id: true, content: true, embedding: true } }); const scored = chunks.map((c) => { const vec = JSON.parse(c.embedding); const score = cosineSimilarity(vector, vec); return { id: c.id, score, metadata: { content: c.content } }; }); scored.sort((a, b) => b.score - a.score); return scored.slice(0, topK); } }; } // src/embedAndStore.ts async function embedAndStore({ articleId, htmlOrMarkdown, embedder, store, maxLen = 1500, overlap = 200 }) { const plain = stripHtml(htmlOrMarkdown); const chunks = chunkText(plain, maxLen, overlap); for (const c of chunks) { const vector = await embedder.embedOne(c.content); await store.upsertEmbedding({ id: `${articleId}-${c.chunkIndex}`, vector, metadata: { ...c, postId: articleId } }); } } embedAndStore.createPrismaJsonStore = createPrismaJsonStore; // Annotate the CommonJS export names for ESM import in node: 0 && (module.exports = { OpenAIEmbedder, chunkText, createPrismaJsonStore, embedAndStore, retrievePassages, stripHtml });