/*
 * @asktext/core
 * Version:
 * Core embedding and vector store utilities for AskText voice Q&A.
 * 139 lines (133 loc) • 3.7 kB
 * JavaScript
 */
// src/TextSplitter.ts
/**
 * Split text into fixed-size, overlapping character windows.
 *
 * Consecutive chunks start `maxLen - overlap` characters apart, so each chunk
 * begins with the last `overlap` characters of its predecessor. `startChar`
 * and `endChar` are offsets into `text`; `endChar` is exclusive.
 *
 * @param text - The string to split.
 * @param maxLen - Maximum number of characters per chunk (default 1500).
 * @param overlap - Characters shared by neighbouring chunks (default 200).
 * @returns Chunks as `{ content, startChar, endChar, chunkIndex }` in document
 *   order; an empty array when `text` is empty.
 * @throws {Error} If `overlap` is not smaller than `maxLen`.
 */
function chunkText(text, maxLen = 1500, overlap = 200) {
  if (overlap >= maxLen)
    throw new Error("overlap must be smaller than maxLen");
  const chunks = [];
  const step = maxLen - overlap;
  for (let start = 0; start < text.length; start += step) {
    const end = Math.min(start + maxLen, text.length);
    chunks.push({
      content: text.slice(start, end),
      startChar: start,
      endChar: end,
      chunkIndex: chunks.length
    });
    // Once a chunk reaches the end of the text, every later window would lie
    // entirely inside this chunk, so emitting it would only duplicate content.
    if (end >= text.length)
      break;
  }
  return chunks;
}
// src/Embedder.ts
import OpenAI from "openai";
/**
 * Embedder that turns text into vectors through the OpenAI embeddings endpoint.
 */
var OpenAIEmbedder = class {
  /**
   * @param opts - Options object. `opts.apiKey` is handed to the OpenAI
   *   client; `opts.model` selects the embedding model and falls back to
   *   "text-embedding-3-small" when null or undefined.
   */
  constructor(opts) {
    this.openai = new OpenAI({ apiKey: opts.apiKey });
    this.model = opts.model ?? "text-embedding-3-small";
  }
  /**
   * Embed multiple texts in a single API call.
   *
   * @param texts - Texts to embed.
   * @returns One embedding per entry of the response's `data` array; an empty
   *   array when `texts` is an empty array.
   */
  async embed(texts) {
    // The embeddings endpoint rejects an empty input list, so answer locally
    // instead of spending a request that can only fail.
    if (Array.isArray(texts) && texts.length === 0)
      return [];
    const resp = await this.openai.embeddings.create({
      model: this.model,
      input: texts
    });
    return resp.data.map((d) => d.embedding);
  }
  /**
   * Embed a single text string.
   *
   * @param text - Text to embed.
   * @returns The first embedding returned by `embed([text])`.
   */
  async embedOne(text) {
    const [vec] = await this.embed([text]);
    return vec;
  }
};
// src/Retriever.ts
async function retrievePassages({ query, store, topK = 4, embedder, filter }) {
const vector = await embedder.embedOne(query);
const results = await store.query({ vector, topK, filter });
return results.map((r) => ({
id: r.id,
score: r.score,
content: r.metadata?.content
}));
}
// src/utils.ts
/**
 * Reduce an HTML (or HTML-flavoured) string to plain text.
 *
 * Removes comments, whole `<script>`/`<style>` elements including their
 * bodies, and all remaining tags, then collapses whitespace runs to a single
 * space and trims the ends. Character entities are left as written.
 *
 * @param html - Markup to strip.
 * @returns The text with markup removed and whitespace normalised.
 */
function stripHtml(html) {
  return html
    // Comments first: they may contain ">" which would cut a tag match short.
    .replace(/<!--[\s\S]*?-->/g, " ")
    // Script and style bodies are code, not content; drop the whole element.
    .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, " ")
    .replace(/<[^>]*>/g, " ")
    .replace(/\s+/g, " ")
    .trim();
}
// src/PrismaJsonStore.ts
/**
 * Cosine similarity of two numeric vectors.
 *
 * @param a - First vector.
 * @param b - Second vector.
 * @returns `dot(a, b) / (|a| * |b|)`. Returns 0 when the value is undefined:
 *   the vectors differ in length, either has zero magnitude, or a component
 *   is not a finite number.
 */
function cosineSimilarity(a, b) {
  // Vectors of different dimensionality are not comparable; score them as
  // unrelated rather than producing NaN or a silently truncated result.
  if (a.length !== b.length)
    return 0;
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] ** 2;
    normB += b[i] ** 2;
  }
  const similarity = dot / (Math.sqrt(normA) * Math.sqrt(normB));
  // A zero-magnitude vector gives 0/0 = NaN, which would make any sort
  // comparator built on the score inconsistent.
  return Number.isFinite(similarity) ? similarity : 0;
}
/**
 * Build a vector store on top of a Prisma client's `articleChunk` model,
 * keeping each embedding as a JSON string in the `embedding` field.
 *
 * @param prisma - Prisma client exposing `articleChunk.upsert` and
 *   `articleChunk.findMany`.
 * @returns An object with `upsertEmbedding` and `query` methods.
 */
function createPrismaJsonStore(prisma) {
  // Chunk fields that are persisted from `metadata`.
  // NOTE(review): assumes these are scalar columns on the articleChunk model,
  // as the create branch and the query's select imply - confirm against the
  // Prisma schema.
  const chunkColumns = ["postId", "chunkIndex", "content", "startChar", "endChar"];
  return {
    /**
     * Insert or update the row for one chunk.
     *
     * @param id - Primary key of the chunk row.
     * @param vector - Embedding; stored as `JSON.stringify(vector)`.
     * @param metadata - Optional chunk fields (`postId`, `chunkIndex`,
     *   `content`, `startChar`, `endChar`). On create, missing fields default
     *   to "" or 0; on update, only the fields that are present are written.
     */
    async upsertEmbedding({ id, vector, metadata }) {
      const embedding = JSON.stringify(vector);
      // Update the same columns that create fills in, so re-indexing a chunk
      // refreshes its text and offsets together with its embedding instead of
      // leaving stale content next to a new vector.
      const changed = Object.fromEntries(
        chunkColumns
          .filter((column) => metadata?.[column] != null)
          .map((column) => [column, metadata[column]])
      );
      await prisma.articleChunk.upsert({
        where: { id },
        update: { embedding, ...changed },
        create: {
          id,
          postId: metadata?.postId ?? "",
          chunkIndex: metadata?.chunkIndex ?? 0,
          content: metadata?.content ?? "",
          startChar: metadata?.startChar ?? 0,
          endChar: metadata?.endChar ?? 0,
          embedding
        }
      });
    },
    /**
     * Rank stored chunks by cosine similarity to `vector`.
     *
     * Loads every row matching the filter and scores it in process.
     *
     * @param vector - Query embedding.
     * @param topK - Maximum number of results to return.
     * @param filter - Optional; when `filter.postId` is truthy only that
     *   post's chunks are considered.
     * @returns Up to `topK` items `{ id, score, metadata: { content } }`,
     *   highest score first.
     */
    async query({ vector, topK, filter }) {
      const where = filter?.postId ? { postId: filter.postId } : {};
      const chunks = await prisma.articleChunk.findMany({
        where,
        select: {
          id: true,
          content: true,
          embedding: true
        }
      });
      const scored = chunks.map((c) => {
        const vec = JSON.parse(c.embedding);
        const score = cosineSimilarity(vector, vec);
        return { id: c.id, score, metadata: { content: c.content } };
      });
      scored.sort((a, b) => b.score - a.score);
      return scored.slice(0, topK);
    }
  };
}
// src/embedAndStore.ts
/**
 * Index one article: strip its markup, split it into chunks, embed each chunk
 * and write it to the store.
 *
 * Chunks are processed one at a time, in order; each is embedded and upserted
 * before the next one starts.
 *
 * @param articleId - Used as the `postId` of every chunk and as the prefix of
 *   each chunk id (`<articleId>-<chunkIndex>`).
 * @param htmlOrMarkdown - Source text; passed through `stripHtml` first.
 * @param embedder - Object providing `embedOne(text)`.
 * @param store - Object providing `upsertEmbedding({ id, vector, metadata })`.
 * @param maxLen - Chunk size handed to `chunkText` (default 1500).
 * @param overlap - Chunk overlap handed to `chunkText` (default 200).
 * @returns A promise that resolves once every chunk has been stored.
 */
async function embedAndStore({
  articleId,
  htmlOrMarkdown,
  embedder,
  store,
  maxLen = 1500,
  overlap = 200
}) {
  const pieces = chunkText(stripHtml(htmlOrMarkdown), maxLen, overlap);
  for (let i = 0; i < pieces.length; i += 1) {
    const piece = pieces[i];
    const embedding = await embedder.embedOne(piece.content);
    const record = {
      id: `${articleId}-${piece.chunkIndex}`,
      vector: embedding,
      // Chunk text and offsets travel with the vector, tagged with the article.
      metadata: { ...piece, postId: articleId }
    };
    await store.upsertEmbedding(record);
  }
}
// Attach the Prisma store factory to embedAndStore as a property, so it is
// reachable from that function as well as through the named export below.
embedAndStore.createPrismaJsonStore = createPrismaJsonStore;
// Public API of this module.
export {
OpenAIEmbedder,
chunkText,
createPrismaJsonStore,
embedAndStore,
retrievePassages,
stripHtml
};