@neureus/rag
Version:
AutoRAG - Zero-setup knowledge integration with Cloudflare AI and Vectorize
1,648 lines (1,645 loc) • 153 kB
JavaScript
import { z } from 'zod';
import TurndownService from 'turndown';
import * as mammoth from 'mammoth';
import pdfParse from 'pdf-parse';
import csvParser from 'csv-parser';
import MarkdownIt from 'markdown-it';
import { JSDOM } from 'jsdom';
import crypto2 from 'crypto';
import { createGateway } from '@neureus/ai-gateway';
import { createVectorDB } from '@neureus/vector-db';
// @neureus/rag - Production-ready RAG implementation for Cloudflare Workers
// Bundler-generated shim emulating CommonJS `require` inside this ESM bundle.
// If a real `require` exists (CJS/Node context) it is used directly; otherwise,
// when Proxy is available, property reads are forwarded lazily so `require` can
// appear later; calling the bare fallback throws, because dynamic require is
// unsupported in pure-ESM runtimes (e.g. Cloudflare Workers).
var __require = /* @__PURE__ */ ((x) => typeof require !== "undefined" ? require : typeof Proxy !== "undefined" ? new Proxy(x, {
  get: (a, b) => (typeof require !== "undefined" ? require : a)[b]
}) : x)(function(x) {
  if (typeof require !== "undefined") return require.apply(this, arguments);
  throw Error('Dynamic require of "' + x + '" is not supported');
});
// Zod schema for a fully-loaded document before chunking/embedding.
var DocumentSchema = z.object({
  id: z.string(),
  // Normalized text content (loaders convert all formats to plain text/markdown).
  content: z.string(),
  metadata: z.object({
    // NOTE(review): appears to mirror the top-level id — confirm against writers.
    id: z.string(),
    title: z.string().optional(),
    author: z.string().optional(),
    createdAt: z.number().optional(), // epoch milliseconds
    updatedAt: z.number().optional(), // epoch milliseconds
    url: z.string().optional(),
    filePath: z.string().optional(),
    fileSize: z.number().optional(), // bytes
    format: z.enum(["markdown", "html", "pdf", "txt", "docx", "csv", "json", "xml", "image", "audio", "video"]),
    source: z.enum(["file", "url", "text", "s3", "r2", "github", "webhook", "email", "auto"]),
    tags: z.array(z.string()).optional(),
    language: z.string().optional(),
    // Free-form, caller-defined metadata.
    custom: z.record(z.string(), z.unknown()).optional()
  })
});
// Zod schema for one chunk produced by DocumentChunker.
var ChunkSchema = z.object({
  id: z.string(),
  content: z.string(),
  // Estimated token count (see estimateTokens: ~4 chars per token).
  tokens: z.number(),
  // Present only after the embedding step has run.
  embedding: z.array(z.number()).optional(),
  metadata: z.object({
    chunkId: z.string(),
    documentId: z.string(),
    // Ordinal position of the chunk within its document.
    index: z.number(),
    // Character offsets into the source document content.
    startIndex: z.number(),
    endIndex: z.number(),
    title: z.string().optional(),
    section: z.string().optional(),
    headers: z.array(z.string()).optional(),
    pageNumber: z.number().optional(),
    custom: z.record(z.string(), z.unknown()).optional()
  })
});
// Zod schema for a complete RAG pipeline configuration.
var RAGConfigSchema = z.object({
  name: z.string(),
  description: z.string().optional(),
  // Embedding model settings.
  embedding: z.object({
    model: z.string(),
    provider: z.enum(["openai", "anthropic", "google", "cloudflare", "cohere", "mistral"]),
    dimensions: z.number().default(1536),
    batchSize: z.number().default(100),
    maxRetries: z.number().default(3),
    timeout: z.number().default(3e4) // ms
  }),
  // How documents are split into chunks.
  chunking: z.object({
    strategy: z.enum(["fixed_size", "semantic", "recursive", "sentence", "paragraph", "custom"]).default("recursive"),
    size: z.number().default(512), // target chunk size in characters
    overlap: z.number().default(128),
    minChunkSize: z.number().default(50),
    maxChunkSize: z.number().default(2048),
    separators: z.array(z.string()).optional(),
    preserveStructure: z.boolean().default(false)
  }),
  // Retrieval-time settings.
  retrieval: z.object({
    topK: z.number().default(5),
    minSimilarity: z.number().default(0.7),
    hybridWeight: z.number().default(0.7),
    // Weight for vector vs keyword search
    rerankModel: z.string().optional(),
    maxContextTokens: z.number().default(4e3)
  }),
  // Answer-generation settings.
  generation: z.object({
    model: z.string(),
    provider: z.enum(["openai", "anthropic", "google", "cloudflare", "cohere", "mistral"]),
    temperature: z.number().default(0.1),
    maxTokens: z.number().default(1e3),
    systemPrompt: z.string().optional(),
    includeSource: z.boolean().default(true),
    streaming: z.boolean().default(false)
  }),
  // Query/performance analytics toggles.
  analytics: z.object({
    enabled: z.boolean().default(true),
    trackQueries: z.boolean().default(true),
    trackPerformance: z.boolean().default(true),
    sampleRate: z.number().default(1) // 1 = sample everything
  })
});
// Zod schema for a document-ingestion request handled by DocumentLoader.
var IngestionRequestSchema = z.object({
  source: z.string(),
  // URL, file path, or text content
  type: z.enum(["file", "url", "text", "s3", "r2", "github", "auto", "webhook", "email"]).default("file"),
  // Explicit format override; when omitted, loaders auto-detect from
  // content-type or filename extension.
  format: z.enum(["markdown", "html", "pdf", "txt", "docx", "csv", "json", "xml", "image", "audio", "video"]).optional(),
  metadata: z.record(z.string(), z.unknown()).optional(),
  recursive: z.boolean().default(false),
  // For directories/repos
  filters: z.array(z.string()).optional(),
  // File patterns to include
  excludes: z.array(z.string()).optional()
  // File patterns to exclude
});
// Zod schema for a RAG query request.
var QueryRequestSchema = z.object({
  query: z.string(),
  topK: z.number().default(5),
  minSimilarity: z.number().default(0.7),
  // Metadata filter passed through to the vector store.
  filter: z.record(z.string(), z.unknown()).optional(),
  namespace: z.string().optional(),
  includeSource: z.boolean().default(true),
  streaming: z.boolean().default(false),
  // Prior turns for conversational RAG, in chat-message form.
  conversationHistory: z.array(z.object({
    role: z.enum(["system", "user", "assistant"]),
    content: z.string()
  })).optional(),
  customPrompt: z.string().optional(),
  rerankResults: z.boolean().default(true)
});
// Base error type for every failure raised by the RAG pipeline.
var RAGError = class extends Error {
  /**
   * @param {string} message - Human-readable description.
   * @param {string} code - Machine-readable error code (e.g. "EMBEDDING_ERROR").
   * @param {number} [statusCode=500] - Suggested HTTP status for API responses.
   * @param {string} [pipelineName] - Name of the pipeline that raised the error.
   * @param {Error} [originalError] - Underlying cause, when wrapping.
   */
  constructor(message, code, statusCode = 500, pipelineName, originalError) {
    super(message);
    this.name = "RAGError";
    Object.assign(this, { code, statusCode, pipelineName, originalError });
  }
};
// Raised when a document cannot be loaded, parsed, or chunked (HTTP 422).
var DocumentProcessingError = class extends RAGError {
  /**
   * @param {string} documentId - Id (or source label) of the failing document.
   * @param {string} message - What went wrong.
   * @param {Error} [originalError] - Underlying cause.
   */
  constructor(documentId, message, originalError) {
    const fullMessage = `Failed to process document ${documentId}: ${message}`;
    super(fullMessage, "DOCUMENT_PROCESSING_ERROR", 422, void 0, originalError);
    this.name = "DocumentProcessingError";
  }
};
// Raised when embedding generation fails (HTTP 500).
var EmbeddingError = class extends RAGError {
  /**
   * @param {string} message - What went wrong.
   * @param {Error} [originalError] - Underlying cause.
   */
  constructor(message, originalError) {
    const fullMessage = `Embedding generation failed: ${message}`;
    super(fullMessage, "EMBEDDING_ERROR", 500, void 0, originalError);
    this.name = "EmbeddingError";
  }
};
// Raised when context retrieval from the vector store fails (HTTP 500).
var RetrievalError = class extends RAGError {
  /**
   * @param {string} message - What went wrong.
   * @param {Error} [originalError] - Underlying cause.
   */
  constructor(message, originalError) {
    const fullMessage = `Context retrieval failed: ${message}`;
    super(fullMessage, "RETRIEVAL_ERROR", 500, void 0, originalError);
    this.name = "RetrievalError";
  }
};
// Raised when LLM answer generation fails (HTTP 500).
var GenerationError = class extends RAGError {
  /**
   * @param {string} message - What went wrong.
   * @param {Error} [originalError] - Underlying cause.
   */
  constructor(message, originalError) {
    const fullMessage = `Answer generation failed: ${message}`;
    super(fullMessage, "GENERATION_ERROR", 500, void 0, originalError);
    this.name = "GenerationError";
  }
};
/**
 * Generate a unique id: `[prefix_]<ms-timestamp base36>_<12 hex chars>`.
 * @param {string} [prefix=""] - Optional namespace prefix (e.g. "chunk").
 * @returns {string} Collision-resistant identifier.
 */
function generateId(prefix = "") {
  const parts = [Date.now().toString(36), crypto2.randomBytes(6).toString("hex")];
  if (prefix) {
    parts.unshift(prefix);
  }
  return parts.join("_");
}
/**
 * Normalize whitespace: unify line endings, collapse 3+ blank lines to one
 * blank line, collapse runs of spaces/tabs, trim each line, trim the whole.
 *
 * Bug fix: the per-line trim previously used /^\s+|\s+$/gm. Because \s matches
 * "\n" and multiline $ matches *before* a "\n", the trailing-trim consumed
 * newlines themselves — e.g. "a\n\nb" became "ab", destroying the paragraph
 * breaks the \n{3,} step deliberately preserves. Trimming only horizontal
 * whitespace ([ \t]) keeps line structure intact.
 *
 * @param {string} text - Raw extracted text.
 * @returns {string} Cleaned text.
 */
function sanitizeText(text) {
  return text
    .replace(/\r\n/g, "\n")
    .replace(/\r/g, "\n")
    .replace(/\n{3,}/g, "\n\n")
    .replace(/[ \t]{2,}/g, " ")
    .replace(/^[ \t]+|[ \t]+$/gm, "")
    .trim();
}
/**
 * Extract heading texts from content: markdown ATX headings first; if none
 * are found, fall back to HTML <h1>-<h6> elements.
 * @param {string} content - Document text.
 * @returns {string[]} Non-empty heading strings in document order.
 */
function extractHeaders(content) {
  const found = [];
  const markdownMatches = content.match(/^#{1,6}\s+(.+)$/gm);
  if (markdownMatches) {
    for (const heading of markdownMatches) {
      found.push(heading.replace(/^#+\s+/, "").trim());
    }
  }
  if (found.length === 0) {
    const htmlMatches = content.match(/<h[1-6][^>]*>(.*?)<\/h[1-6]>/gi);
    if (htmlMatches) {
      for (const heading of htmlMatches) {
        found.push(heading.replace(/<[^>]*>/g, "").trim());
      }
    }
  }
  return found.filter((heading) => heading.length > 0);
}
/**
 * Rough token-count estimate using the common ~4-characters-per-token rule.
 * @param {string} text
 * @returns {number} Estimated token count (0 for empty input).
 */
function estimateTokens(text) {
  const CHARS_PER_TOKEN = 4;
  return Math.ceil(text.length / CHARS_PER_TOKEN);
}
/**
 * Split text into pieces of at most maxLength characters, breaking only on
 * whitespace. A single word longer than maxLength is emitted unbroken.
 * @param {string} text
 * @param {number} maxLength
 * @returns {string[]} Non-empty pieces.
 */
function splitOnWordBoundary(text, maxLength) {
  if (text.length <= maxLength) {
    return [text];
  }
  const pieces = [];
  let buffer = "";
  for (const word of text.split(/\s+/)) {
    const candidate = buffer ? `${buffer} ${word}` : word;
    if (candidate.length <= maxLength) {
      buffer = candidate;
      continue;
    }
    if (buffer) {
      pieces.push(buffer.trim());
      buffer = word;
    } else {
      // Oversized single word: pass through as its own piece.
      pieces.push(word);
      buffer = "";
    }
  }
  if (buffer) {
    pieces.push(buffer.trim());
  }
  return pieces.filter((piece) => piece.length > 0);
}
/**
 * Slice text into chunks of ~chunkSize characters where consecutive chunks
 * overlap by ~overlapSize characters.
 *
 * @param {string} text - Source text.
 * @param {number} chunkSize - Target maximum chunk length in characters.
 * @param {number} overlapSize - Desired overlap between consecutive chunks.
 * @param {boolean} [splitOnWords=true] - Prefer ending a chunk at the last
 *   space, but only if that space falls in the final 20% of the window
 *   (avoids producing tiny chunks).
 * @returns {Array<{content: string, startIndex: number, endIndex: number}>}
 *   Chunks with character offsets into the original text.
 */
function createOverlappingChunks(text, chunkSize, overlapSize, splitOnWords = true) {
  // Short input: return a single chunk covering everything.
  if (text.length <= chunkSize) {
    return [{
      content: text,
      startIndex: 0,
      endIndex: text.length
    }];
  }
  const chunks = [];
  let currentIndex = 0;
  while (currentIndex < text.length) {
    const endIndex = Math.min(currentIndex + chunkSize, text.length);
    let chunkText = text.slice(currentIndex, endIndex);
    // Back off to a word boundary, but only if the last space is past 80%
    // of the chunk, so the chunk does not shrink dramatically.
    if (splitOnWords && endIndex < text.length) {
      const lastSpaceIndex = chunkText.lastIndexOf(" ");
      if (lastSpaceIndex > chunkSize * 0.8) {
        chunkText = chunkText.slice(0, lastSpaceIndex);
      }
    }
    chunks.push({
      content: chunkText.trim(),
      // NOTE(review): endIndex is computed from the pre-trim length, so for
      // chunks with leading/trailing whitespace the offsets can be slightly
      // wider than the trimmed content — confirm downstream consumers only
      // need approximate offsets.
      startIndex: currentIndex,
      endIndex: currentIndex + chunkText.length
    });
    // Advance by chunk length minus overlap; the Math.max(..., 1) floor
    // guarantees forward progress even when overlapSize >= chunk length.
    const actualChunkSize = chunkText.length;
    const step = Math.max(actualChunkSize - overlapSize, 1);
    currentIndex += step;
    if (currentIndex >= text.length) {
      break;
    }
  }
  return chunks.filter((chunk) => chunk.content.length > 0);
}
/**
 * Find sentence-start offsets: 0, each position following ".", "!" or "?"
 * plus trailing whitespace, and finally text.length.
 * @param {string} text
 * @returns {number[]} Ascending boundary offsets.
 */
function findSentenceBoundaries(text) {
  const boundaries = [0];
  const sentenceEnd = /[.!?]+\s+/g;
  for (let hit = sentenceEnd.exec(text); hit !== null; hit = sentenceEnd.exec(text)) {
    boundaries.push(hit.index + hit[0].length);
  }
  if (boundaries[boundaries.length - 1] !== text.length) {
    boundaries.push(text.length);
  }
  return boundaries;
}
/**
 * Find paragraph-start offsets: 0, each position following a blank-line break
 * (newline, optional whitespace, newline), and finally text.length.
 * @param {string} text
 * @returns {number[]} Ascending boundary offsets.
 */
function findParagraphBoundaries(text) {
  const boundaries = [0];
  const paragraphBreak = /\n\s*\n/g;
  for (let hit = paragraphBreak.exec(text); hit !== null; hit = paragraphBreak.exec(text)) {
    boundaries.push(hit.index + hit[0].length);
  }
  if (boundaries[boundaries.length - 1] !== text.length) {
    boundaries.push(text.length);
  }
  return boundaries;
}
/**
 * Cosine similarity between two equal-length numeric vectors.
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number} Similarity in [-1, 1]; 0 when either vector is all zeros.
 * @throws {Error} When the vectors differ in length.
 */
function cosineSimilarity(a, b) {
  if (a.length !== b.length) {
    throw new Error("Vectors must have the same length");
  }
  // Left-to-right accumulation matches a plain index loop exactly.
  const dot = a.reduce((sum, value, i) => sum + value * b[i], 0);
  const magnitudeA = Math.sqrt(a.reduce((sum, value) => sum + value * value, 0));
  const magnitudeB = Math.sqrt(b.reduce((sum, value) => sum + value * value, 0));
  if (magnitudeA === 0 || magnitudeB === 0) {
    return 0;
  }
  return dot / (magnitudeA * magnitudeB);
}
/**
 * Jaccard similarity over lowercase whitespace-delimited word sets.
 * @param {string} text1
 * @param {string} text2
 * @returns {number} |intersection| / |union| of the two word sets.
 */
function textSimilarity(text1, text2) {
  const toWordSet = (text) => new Set(text.toLowerCase().split(/\s+/));
  const setA = toWordSet(text1);
  const setB = toWordSet(text2);
  let shared = 0;
  for (const word of setA) {
    if (setB.has(word)) {
      shared++;
    }
  }
  const unionSize = new Set([...setA, ...setB]).size;
  return shared / unionSize;
}
/**
 * True when the content contains at least one non-whitespace character.
 * @param {string} content
 * @returns {boolean}
 */
function isValidContent(content) {
  return /\S/.test(content);
}
/**
 * Truncate text to maxLength characters, appending "..." and preferring a
 * word boundary when the last space falls in the final 20% of the cut.
 * @param {string} text
 * @param {number} maxLength
 * @returns {string} Original text, or a truncated copy ending in "...".
 */
function truncateText(text, maxLength) {
  if (text.length <= maxLength) {
    return text;
  }
  const head = text.slice(0, maxLength);
  const lastSpace = head.lastIndexOf(" ");
  const base = lastSpace > maxLength * 0.8 ? head.slice(0, lastSpace) : head;
  return `${base}...`;
}
/**
 * Normalize a search query: trim, collapse internal whitespace, strip all
 * characters except word chars, whitespace, "-", "_", ".", "?", "!", then
 * lowercase.
 * @param {string} query
 * @returns {string} Normalized query.
 */
function normalizeQuery(query) {
  const collapsed = query.trim().replace(/\s+/g, " ");
  const stripped = collapsed.replace(/[^\w\s\-_.?!]/g, "");
  return stripped.toLowerCase();
}
/**
 * Extract the most frequent keywords from text.
 * @param {string} text - Source text.
 * @param {number} [minLength=3] - Minimum word length to count.
 * @param {number} [maxCount=20] - Maximum number of keywords returned.
 * @returns {string[]} Keywords sorted by descending frequency.
 */
function extractKeywords(text, minLength = 3, maxCount = 20) {
  // A plain object (not a Map) preserves the original's key ordering rules
  // for the stable sort's tie-breaking.
  const frequency = {};
  const tokens = text.toLowerCase().replace(/[^\w\s]/g, " ").split(/\s+/);
  for (const word of tokens) {
    if (word.length >= minLength) {
      frequency[word] = (frequency[word] || 0) + 1;
    }
  }
  return Object.entries(frequency)
    .sort((left, right) => right[1] - left[1])
    .slice(0, maxCount)
    .map(([word]) => word);
}
/**
 * Format a duration for display: "Nms" under 1s, "N.NNs" under 1 minute,
 * otherwise "N.NNm".
 * @param {number} milliseconds
 * @returns {string}
 */
function formatDuration(milliseconds) {
  if (milliseconds < 1e3) {
    return `${milliseconds}ms`;
  }
  const seconds = milliseconds / 1e3;
  return seconds < 60 ? `${seconds.toFixed(2)}s` : `${(seconds / 60).toFixed(2)}m`;
}
/**
 * Format a byte count for display, e.g. 1536 -> "1.5 KB".
 *
 * Bug fix: the unit index is now clamped to [0, sizes.length - 1]. Previously
 * inputs >= 1 PB indexed past the array ("1024 undefined") and fractional
 * inputs in (0, 1) produced a negative index ("N undefined").
 *
 * @param {number} bytes - Non-negative byte count.
 * @returns {string} Human-readable size with up to two decimals.
 */
function formatFileSize(bytes) {
  if (bytes === 0) return "0 B";
  const k = 1024;
  const sizes = ["B", "KB", "MB", "GB", "TB"];
  const rawIndex = Math.floor(Math.log(bytes) / Math.log(k));
  const i = Math.min(Math.max(rawIndex, 0), sizes.length - 1);
  return `${parseFloat((bytes / Math.pow(k, i)).toFixed(2))} ${sizes[i]}`;
}
/**
 * Stable 16-hex-character fingerprint of content (truncated SHA-256),
 * used for deduplication.
 * @param {string} content
 * @returns {string} First 16 hex chars of the SHA-256 digest.
 */
function hashContent(content) {
  const digest = crypto2.createHash("sha256").update(content).digest("hex");
  return digest.slice(0, 16);
}
// src/processing/document-loader.ts
// Loads documents from text, URLs, R2/S3 storage, or GitHub and converts every
// supported format (HTML, PDF, DOCX, markdown, CSV, JSON, XML, txt) into
// sanitized plain text ready for chunking.
//
// Fixes in this revision:
//  - processDocx: pass the Node Buffer to mammoth via `buffer` instead of
//    handing it `bufferData.buffer`. A pooled Buffer's backing ArrayBuffer can
//    be larger than (and offset from) the actual DOCX bytes, corrupting the
//    parse.
//  - loadFromUrl: parseInt of content-length now passes an explicit radix.
var DocumentLoader = class {
  turndownService; // HTML -> markdown converter
  markdownIt; // markdown -> HTML renderer (used to strip markdown syntax)
  env; // Worker bindings; R2 loads expect env.RAG_BUCKET
  constructor(env) {
    this.env = env;
    this.turndownService = new TurndownService({
      headingStyle: "atx",
      hr: "---",
      bulletListMarker: "-",
      codeBlockStyle: "fenced",
      fence: "```",
      emDelimiter: "*",
      strongDelimiter: "**"
    });
    this.markdownIt = new MarkdownIt({
      html: true,
      linkify: true,
      typographer: true
    });
  }
  /**
   * Load a document from various sources.
   * @param request - Ingestion request ({ source, type, format, metadata, ... }).
   * @returns Normalized document { id, content, metadata }.
   * @throws {DocumentProcessingError} When the source type is unsupported or loading fails.
   */
  async loadDocument(request) {
    try {
      const { source, type, format } = request;
      let content;
      let detectedFormat;
      let metadata;
      switch (type) {
        case "text":
          // `source` already is the raw content.
          content = source;
          detectedFormat = format || "txt";
          metadata = this.createMetadata({
            id: generateId(),
            format: detectedFormat,
            source: "text",
            ...request.metadata
          });
          break;
        case "file":
          ({ content, format: detectedFormat, metadata } = await this.loadFromFile(source, request));
          break;
        case "url":
          ({ content, format: detectedFormat, metadata } = await this.loadFromUrl(source, request));
          break;
        case "s3":
        case "r2":
          ({ content, format: detectedFormat, metadata } = await this.loadFromStorage(source, type, request));
          break;
        case "github":
          ({ content, format: detectedFormat, metadata } = await this.loadFromGitHub(source, request));
          break;
        default:
          throw new DocumentProcessingError("unknown", `Unsupported source type: ${type}`);
      }
      const processedContent = await this.processContent(content, detectedFormat);
      return {
        id: metadata.id,
        content: processedContent,
        metadata
      };
    } catch (error) {
      if (error instanceof DocumentProcessingError) {
        throw error;
      }
      throw new DocumentProcessingError("unknown", `Failed to load document: ${error}`, error);
    }
  }
  /**
   * Load multiple documents in batch. Failures are logged and skipped so one
   * bad source does not abort the whole batch.
   * @param requests - Array of ingestion requests.
   * @returns Successfully loaded documents only.
   */
  async loadDocuments(requests) {
    const results = await Promise.allSettled(
      requests.map((request) => this.loadDocument(request))
    );
    const documents = [];
    const errors = [];
    results.forEach((result, index) => {
      if (result.status === "fulfilled") {
        documents.push(result.value);
      } else {
        errors.push(`Document ${index}: ${result.reason.message}`);
      }
    });
    if (errors.length > 0) {
      console.warn("Some documents failed to load:", errors);
    }
    return documents;
  }
  /**
   * Load document from file path.
   * @throws {DocumentProcessingError} Always — Workers have no filesystem.
   */
  async loadFromFile(filePath, request) {
    throw new DocumentProcessingError("file", "File system access not available in Cloudflare Workers. Use R2 storage instead.");
  }
  /**
   * Load document from URL. Format is detected from the response content-type,
   * falling back to the request's explicit format, then "html".
   * @throws {DocumentProcessingError} On network or HTTP errors.
   */
  async loadFromUrl(url, request) {
    try {
      const response = await fetch(url, {
        headers: {
          "User-Agent": "Nexus RAG Pipeline/1.0",
          "Accept": "text/html,application/xhtml+xml,application/xml,text/plain,application/pdf,*/*"
        }
      });
      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }
      const contentType = response.headers.get("content-type") || "";
      const format = this.detectFormatFromContentType(contentType) || request.format || "html";
      let content;
      if (format === "pdf") {
        // PDFs must be parsed from binary; everything else is read as text.
        const buffer = await response.arrayBuffer();
        content = await this.extractFromPDF(buffer);
      } else {
        content = await response.text();
      }
      const metadata = this.createMetadata({
        id: generateId(),
        url,
        format,
        source: "url",
        title: this.extractTitleFromUrl(url),
        // Explicit radix — never rely on parseInt's default.
        fileSize: parseInt(response.headers.get("content-length") || "0", 10),
        ...request.metadata
      });
      return { content, format, metadata };
    } catch (error) {
      throw new DocumentProcessingError("url", `Failed to fetch from URL: ${error}`, error);
    }
  }
  /**
   * Load document from R2/S3 storage. Only R2 (via env.RAG_BUCKET) is
   * implemented; "s3" raises a not-implemented error.
   * @throws {DocumentProcessingError} When the object is missing or loading fails.
   */
  async loadFromStorage(key, storageType, request) {
    try {
      if (storageType === "r2") {
        const object = await this.env.RAG_BUCKET.get(key);
        if (!object) {
          throw new Error(`Object not found: ${key}`);
        }
        const format = request.format || this.detectFormatFromFilename(key);
        let content;
        if (format === "pdf") {
          const buffer = await object.arrayBuffer();
          content = await this.extractFromPDF(buffer);
        } else {
          content = await object.text();
        }
        const metadata = this.createMetadata({
          id: generateId(),
          filePath: key,
          format,
          source: "r2",
          fileSize: object.size,
          title: this.extractTitleFromFilename(key),
          ...request.metadata
        });
        return { content, format, metadata };
      } else {
        throw new DocumentProcessingError("s3", "S3 support not implemented yet");
      }
    } catch (error) {
      throw new DocumentProcessingError(storageType, `Failed to load from ${storageType}: ${error}`, error);
    }
  }
  /**
   * Load document from GitHub via raw.githubusercontent.com.
   * repoPath is "owner/repo/path/to/file".
   * NOTE(review): the branch is hard-coded to "main" — repos whose default
   * branch differs will 404.
   */
  async loadFromGitHub(repoPath, request) {
    try {
      const [owner, repo, ...pathParts] = repoPath.split("/");
      const filePath = pathParts.join("/");
      const url = `https://raw.githubusercontent.com/${owner}/${repo}/main/${filePath}`;
      return await this.loadFromUrl(url, {
        ...request,
        type: "url",
        metadata: {
          ...request.metadata,
          source: "github",
          url
        }
      });
    } catch (error) {
      throw new DocumentProcessingError("github", `Failed to load from GitHub: ${error}`, error);
    }
  }
  /**
   * Dispatch content to the format-specific processor; unknown formats fall
   * through to plain-text sanitization.
   * @throws {DocumentProcessingError} When the format-specific parser fails.
   */
  async processContent(content, format) {
    try {
      switch (format) {
        case "html":
          return this.processHTML(content);
        case "pdf":
          return this.processPDF(content);
        // Content is already extracted
        case "docx":
          return await this.processDocx(content);
        case "markdown":
          return this.processMarkdown(content);
        case "csv":
          return await this.processCSV(content);
        case "json":
          return this.processJSON(content);
        case "xml":
          return this.processXML(content);
        case "txt":
        default:
          return sanitizeText(content);
      }
    } catch (error) {
      throw new DocumentProcessingError(format, `Failed to process ${format} content: ${error}`, error);
    }
  }
  /**
   * Process HTML: drop boilerplate elements, pick the main content region,
   * and convert it to markdown via Turndown.
   */
  processHTML(html) {
    const dom = new JSDOM(html);
    const document = dom.window.document;
    // Remove non-content elements before extraction.
    const scripts = document.querySelectorAll("script, style, nav, footer, aside");
    scripts.forEach((el) => el.remove());
    // Prefer common main-content containers; fall back to <body>.
    let content = document.querySelector("main, article, .content, #content, .post, .entry");
    if (!content) {
      content = document.body;
    }
    const markdown = this.turndownService.turndown(content.innerHTML);
    return sanitizeText(markdown);
  }
  /**
   * Process PDF content (already extracted to text by extractFromPDF).
   */
  processPDF(content) {
    return sanitizeText(content);
  }
  /**
   * Extract text from a PDF binary buffer using pdf-parse.
   */
  async extractFromPDF(buffer) {
    try {
      const data = await pdfParse(Buffer.from(buffer));
      return data.text;
    } catch (error) {
      throw new Error(`PDF parsing failed: ${error}`);
    }
  }
  /**
   * Process DOCX content: extract raw text with mammoth.
   * Passes the Buffer via mammoth's Node `buffer` option. Do NOT pass
   * `Buffer.from(x).buffer`: for pooled Buffers that ArrayBuffer spans the
   * whole pool, not just the DOCX bytes.
   */
  async processDocx(buffer) {
    try {
      const result = await mammoth.extractRawText({ buffer: Buffer.from(buffer) });
      return sanitizeText(result.value);
    } catch (error) {
      throw new Error(`DOCX parsing failed: ${error}`);
    }
  }
  /**
   * Process Markdown: render to HTML, then take the plain text content
   * (strips markdown syntax).
   */
  processMarkdown(markdown) {
    const html = this.markdownIt.render(markdown);
    const dom = new JSDOM(html);
    const text = dom.window.document.body.textContent || "";
    return sanitizeText(text);
  }
  /**
   * Process CSV: parse rows and flatten each to "key: value, ..." lines.
   * NOTE(review): relies on the bundler's __require("stream") shim — confirm
   * this path is exercised only where node streams are available.
   */
  async processCSV(csvContent) {
    return new Promise((resolve, reject) => {
      const rows = [];
      const stream = __require("stream").Readable.from([csvContent]);
      stream.pipe(csvParser()).on("data", (row) => rows.push(row)).on("end", () => {
        const text = rows.map(
          (row) => Object.entries(row).map(([key, value]) => `${key}: ${value}`).join(", ")
        ).join("\n");
        resolve(sanitizeText(text));
      }).on("error", reject);
    });
  }
  /**
   * Process JSON: parse and flatten to "path: value" lines.
   */
  processJSON(jsonContent) {
    try {
      const data = JSON.parse(jsonContent);
      const text = this.jsonToText(data);
      return sanitizeText(text);
    } catch (error) {
      throw new Error(`JSON parsing failed: ${error}`);
    }
  }
  /**
   * Process XML: parse and return the document's concatenated text content.
   */
  processXML(xmlContent) {
    try {
      const dom = new JSDOM(xmlContent, { contentType: "text/xml" });
      const text = dom.window.document.textContent || "";
      return sanitizeText(text);
    } catch (error) {
      throw new Error(`XML parsing failed: ${error}`);
    }
  }
  /**
   * Convert arbitrary JSON into readable "dotted.path: value" text,
   * recursing into nested objects and arrays.
   */
  jsonToText(obj, prefix = "") {
    if (typeof obj !== "object" || obj === null) {
      return String(obj);
    }
    const lines = [];
    if (Array.isArray(obj)) {
      obj.forEach((item, index) => {
        const itemPrefix = `${prefix}[${index}]`;
        lines.push(`${itemPrefix}: ${this.jsonToText(item, itemPrefix)}`);
      });
    } else {
      Object.entries(obj).forEach(([key, value]) => {
        const itemPrefix = prefix ? `${prefix}.${key}` : key;
        if (typeof value === "object" && value !== null) {
          lines.push(`${itemPrefix}:`);
          lines.push(this.jsonToText(value, itemPrefix));
        } else {
          lines.push(`${itemPrefix}: ${value}`);
        }
      });
    }
    return lines.join("\n");
  }
  /**
   * Create document metadata with generated id/timestamps; caller-supplied
   * fields (spread last) win over the defaults.
   */
  createMetadata(partial) {
    const timestamp = Date.now();
    return {
      id: partial.id || generateId(),
      format: partial.format,
      source: partial.source,
      createdAt: timestamp,
      updatedAt: timestamp,
      ...partial
    };
  }
  /**
   * Map an HTTP content-type to a document format, or null when unknown.
   */
  detectFormatFromContentType(contentType) {
    if (contentType.includes("text/html")) return "html";
    if (contentType.includes("text/markdown")) return "markdown";
    if (contentType.includes("text/plain")) return "txt";
    if (contentType.includes("application/pdf")) return "pdf";
    if (contentType.includes("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) return "docx";
    if (contentType.includes("text/csv")) return "csv";
    if (contentType.includes("application/json")) return "json";
    if (contentType.includes("application/xml") || contentType.includes("text/xml")) return "xml";
    return null;
  }
  /**
   * Map a filename extension to a document format; defaults to "txt".
   */
  detectFormatFromFilename(filename) {
    const ext = filename.split(".").pop()?.toLowerCase();
    switch (ext) {
      case "md":
        return "markdown";
      case "html":
      case "htm":
        return "html";
      case "pdf":
        return "pdf";
      case "docx":
        return "docx";
      case "csv":
        return "csv";
      case "json":
        return "json";
      case "xml":
        return "xml";
      case "txt":
      default:
        return "txt";
    }
  }
  /**
   * Derive a display title from a URL's last path segment (extension dropped),
   * falling back to the hostname, then a fixed placeholder.
   */
  extractTitleFromUrl(url) {
    try {
      const urlObj = new URL(url);
      const pathname = urlObj.pathname;
      const filename = pathname.split("/").pop() || "";
      return filename.split(".")[0] || urlObj.hostname;
    } catch {
      return "Unknown Document";
    }
  }
  /**
   * Derive a display title from a file path's basename (extension dropped).
   */
  extractTitleFromFilename(filename) {
    const name = filename.split("/").pop() || filename;
    return name.split(".")[0] || "Unknown Document";
  }
};
// src/processing/chunking.ts
// Splits documents into retrieval-sized chunks using one of several
// strategies (fixed_size, semantic, recursive, sentence, paragraph, custom).
//
// Fix in this revision: createChunk now reads `document.metadata?.title`.
// splitLargeChunks calls createChunk with a bare `{ id }` stand-in document
// (no metadata), which previously threw a TypeError whenever a semantic
// chunk exceeded maxChunkSize and had to be re-split.
var DocumentChunker = class _DocumentChunker {
  /**
   * Chunk a document using the strategy named in config.strategy.
   * @throws {DocumentProcessingError} When chunking fails or the strategy is unknown.
   */
  async chunkDocument(document, config) {
    try {
      switch (config.strategy) {
        case "fixed_size":
          return this.fixedSizeChunking(document, config);
        case "semantic":
          return this.semanticChunking(document, config);
        case "recursive":
          return this.recursiveChunking(document, config);
        case "sentence":
          return this.sentenceChunking(document, config);
        case "paragraph":
          return this.paragraphChunking(document, config);
        case "custom":
          if (config.customSplitter) {
            return config.customSplitter(document.content, config);
          }
          throw new Error("Custom splitter not provided");
        default:
          throw new Error(`Unsupported chunking strategy: ${config.strategy}`);
      }
    } catch (error) {
      throw new DocumentProcessingError(
        document.id,
        `Chunking failed: ${error}`,
        error
      );
    }
  }
  /**
   * Chunk multiple documents. Per-document failures are logged and skipped.
   */
  async chunkDocuments(documents, config) {
    const allChunks = [];
    for (const document of documents) {
      try {
        const chunks = await this.chunkDocument(document, config);
        allChunks.push(...chunks);
      } catch (error) {
        console.error(`Failed to chunk document ${document.id}:`, error);
      }
    }
    return allChunks;
  }
  /**
   * Fixed-size chunking with overlap (delegates to createOverlappingChunks).
   */
  fixedSizeChunking(document, config) {
    const { content } = document;
    const chunks = createOverlappingChunks(content, config.size, config.overlap);
    return chunks.map((chunk, index) => this.createChunk(
      chunk.content,
      document,
      index,
      chunk.startIndex,
      chunk.endIndex
    )).filter((chunk) => this.isValidChunk(chunk, config));
  }
  /**
   * Semantic chunking: split at markdown headers so each chunk is one
   * section. Falls back to paragraph chunking when no headers exist;
   * oversized sections are re-split afterwards (splitLargeChunks).
   */
  semanticChunking(document, config) {
    const { content } = document;
    const headers = extractHeaders(content);
    if (headers.length === 0) {
      return this.paragraphChunking(document, config);
    }
    const chunks = [];
    const lines = content.split("\n");
    let currentSection = "";
    let currentStartIndex = 0;
    let chunkIndex = 0;
    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      const isHeader = /^#{1,6}\s+/.test(line);
      if (isHeader && currentSection.length > 0) {
        // A new header closes the accumulated section.
        const sectionContent = currentSection.trim();
        if (sectionContent.length >= (config.minChunkSize || 50)) {
          const chunk = this.createChunk(
            sectionContent,
            document,
            chunkIndex++,
            currentStartIndex,
            currentStartIndex + currentSection.length
          );
          if (this.isValidChunk(chunk, config)) {
            chunks.push(chunk);
          }
        }
        currentSection = line + "\n";
        // NOTE(review): indexOf returns -1 when the header line (with its
        // appended "\n") is not found verbatim from this offset, which would
        // skew subsequent offsets — confirm offsets are only advisory.
        currentStartIndex = content.indexOf(currentSection, currentStartIndex);
      } else {
        currentSection += line + "\n";
      }
    }
    // Flush the final section.
    if (currentSection.trim().length >= (config.minChunkSize || 50)) {
      const chunk = this.createChunk(
        currentSection.trim(),
        document,
        chunkIndex,
        currentStartIndex,
        currentStartIndex + currentSection.length
      );
      if (this.isValidChunk(chunk, config)) {
        chunks.push(chunk);
      }
    }
    return this.splitLargeChunks(chunks, config);
  }
  /**
   * Recursive chunking with hierarchical splitting: tries coarse separators
   * first (paragraphs), then progressively finer ones.
   */
  recursiveChunking(document, config) {
    const separators = config.separators || [
      "\n\n",
      // Paragraphs
      "\n",
      // Lines
      ". ",
      // Sentences
      " ",
      // Words
      ""
      // Characters
    ];
    return this.recursiveSplit(document.content, separators, document, config);
  }
  /**
   * Recursively split text using hierarchical separators. Pieces that still
   * exceed config.size are re-split with the next (finer) separator.
   * NOTE(review): chunk indices restart at 0 inside recursive sub-calls and
   * offsets are approximate — confirm consumers rely only on relative order
   * within one call.
   */
  recursiveSplit(text, separators, document, config, currentIndex = 0) {
    if (text.length <= config.size || separators.length === 0) {
      const chunk = this.createChunk(text, document, 0, currentIndex, currentIndex + text.length);
      return this.isValidChunk(chunk, config) ? [chunk] : [];
    }
    const separator = separators[0];
    const parts = separator === "" ? text.split("") : text.split(separator);
    if (parts.length === 1) {
      // Current separator did not split anything; try the next finer one.
      return this.recursiveSplit(text, separators.slice(1), document, config, currentIndex);
    }
    const chunks = [];
    let currentText = "";
    let currentStart = currentIndex;
    let chunkIndex = 0;
    for (let i = 0; i < parts.length; i++) {
      const part = parts[i];
      const testText = currentText + (currentText ? separator : "") + part;
      if (testText.length <= config.size) {
        currentText = testText;
      } else {
        if (currentText.trim().length > 0) {
          const chunk = this.createChunk(
            currentText.trim(),
            document,
            chunkIndex++,
            currentStart,
            currentStart + currentText.length
          );
          if (this.isValidChunk(chunk, config)) {
            chunks.push(chunk);
          }
        }
        currentStart += currentText.length + (currentText ? separator.length : 0);
        currentText = part;
        if (part.length > config.size) {
          // Single part still too large: descend with finer separators.
          const subChunks = this.recursiveSplit(
            part,
            separators.slice(1),
            document,
            config,
            currentStart
          );
          chunks.push(...subChunks);
          currentText = "";
          currentStart += part.length;
        }
      }
    }
    if (currentText.trim().length > 0) {
      const chunk = this.createChunk(
        currentText.trim(),
        document,
        chunkIndex,
        currentStart,
        currentStart + currentText.length
      );
      if (this.isValidChunk(chunk, config)) {
        chunks.push(chunk);
      }
    }
    return chunks;
  }
  /**
   * Sentence-based chunking: pack whole sentences up to config.size;
   * sentences longer than config.size are word-wrapped.
   */
  sentenceChunking(document, config) {
    const { content } = document;
    const boundaries = findSentenceBoundaries(content);
    const chunks = [];
    let currentChunk = "";
    let currentStart = 0;
    let chunkIndex = 0;
    for (let i = 1; i < boundaries.length; i++) {
      const sentenceStart = boundaries[i - 1];
      const sentenceEnd = boundaries[i];
      const sentence = content.slice(sentenceStart, sentenceEnd).trim();
      if (!sentence) continue;
      const testChunk = currentChunk + (currentChunk ? " " : "") + sentence;
      if (testChunk.length <= config.size) {
        currentChunk = testChunk;
      } else {
        if (currentChunk) {
          const chunk = this.createChunk(
            currentChunk,
            document,
            chunkIndex++,
            currentStart,
            currentStart + currentChunk.length
          );
          if (this.isValidChunk(chunk, config)) {
            chunks.push(chunk);
          }
        }
        currentStart = sentenceStart;
        currentChunk = sentence;
        if (sentence.length > config.size) {
          // Oversized sentence: break on word boundaries instead.
          const wordChunks = splitOnWordBoundary(sentence, config.size);
          for (const wordChunk of wordChunks) {
            const chunk = this.createChunk(
              wordChunk,
              document,
              chunkIndex++,
              currentStart,
              currentStart + wordChunk.length
            );
            if (this.isValidChunk(chunk, config)) {
              chunks.push(chunk);
            }
            currentStart += wordChunk.length;
          }
          currentChunk = "";
        }
      }
    }
    // Flush the trailing chunk.
    if (currentChunk) {
      const chunk = this.createChunk(
        currentChunk,
        document,
        chunkIndex,
        currentStart,
        currentStart + currentChunk.length
      );
      if (this.isValidChunk(chunk, config)) {
        chunks.push(chunk);
      }
    }
    return chunks;
  }
  /**
   * Paragraph-based chunking: pack whole paragraphs up to config.size;
   * paragraphs longer than config.size fall back to fixed-size chunking.
   */
  paragraphChunking(document, config) {
    const { content } = document;
    const boundaries = findParagraphBoundaries(content);
    const chunks = [];
    let currentChunk = "";
    let currentStart = 0;
    let chunkIndex = 0;
    for (let i = 1; i < boundaries.length; i++) {
      const paragraphStart = boundaries[i - 1];
      const paragraphEnd = boundaries[i];
      const paragraph = content.slice(paragraphStart, paragraphEnd).trim();
      if (!paragraph) continue;
      const testChunk = currentChunk + (currentChunk ? "\n\n" : "") + paragraph;
      if (testChunk.length <= config.size) {
        currentChunk = testChunk;
      } else {
        if (currentChunk) {
          const chunk = this.createChunk(
            currentChunk,
            document,
            chunkIndex++,
            currentStart,
            currentStart + currentChunk.length
          );
          if (this.isValidChunk(chunk, config)) {
            chunks.push(chunk);
          }
        }
        currentStart = paragraphStart;
        currentChunk = paragraph;
        if (paragraph.length > config.size) {
          // Oversized paragraph: re-split with the fixed-size strategy.
          const subChunks = this.fixedSizeChunking(
            { ...document, content: paragraph },
            { ...config, strategy: "fixed_size" }
          );
          chunks.push(...subChunks.map((chunk) => ({
            ...chunk,
            metadata: {
              ...chunk.metadata,
              documentId: document.id
            }
          })));
          currentChunk = "";
          currentStart += paragraph.length;
        }
      }
    }
    // Flush the trailing chunk.
    if (currentChunk) {
      const chunk = this.createChunk(
        currentChunk,
        document,
        chunkIndex,
        currentStart,
        currentStart + currentChunk.length
      );
      if (this.isValidChunk(chunk, config)) {
        chunks.push(chunk);
      }
    }
    return chunks;
  }
  /**
   * Re-split any chunks exceeding config.maxChunkSize into overlapping
   * sub-chunks, preserving the parent chunk's metadata and relative order.
   */
  splitLargeChunks(chunks, config) {
    const result = [];
    for (const chunk of chunks) {
      if (chunk.content.length <= config.maxChunkSize) {
        result.push(chunk);
      } else {
        const subChunks = createOverlappingChunks(
          chunk.content,
          config.size,
          config.overlap
        );
        let subIndex = 0;
        for (const subChunk of subChunks) {
          const newChunk = this.createChunk(
            subChunk.content,
            { id: chunk.metadata.documentId },
            chunk.metadata.index + subIndex / 1e3,
            // Maintain ordering
            chunk.metadata.startIndex + subChunk.startIndex,
            chunk.metadata.startIndex + subChunk.endIndex,
            {
              ...chunk.metadata,
              chunkId: `${chunk.id}_${subIndex}`
            }
          );
          if (this.isValidChunk(newChunk, config)) {
            result.push(newChunk);
          }
          subIndex++;
        }
      }
    }
    return result;
  }
  /**
   * Create a chunk object with generated id, token estimate, and metadata.
   * `document` may be a bare `{ id }` stand-in (see splitLargeChunks), hence
   * the optional chaining on document.metadata.
   */
  createChunk(content, document, index, startIndex, endIndex, metadataOverride) {
    const chunkId = generateId("chunk");
    const headers = extractHeaders(content);
    return {
      id: chunkId,
      content: content.trim(),
      tokens: estimateTokens(content),
      metadata: {
        chunkId,
        documentId: document.id,
        index,
        startIndex,
        endIndex,
        title: document.metadata?.title,
        section: headers[0] || void 0,
        headers: headers.length > 0 ? headers : void 0,
        ...metadataOverride
      }
    };
  }
  /**
   * Validate that a chunk meets the configured min/max size and is non-blank.
   */
  isValidChunk(chunk, config) {
    const contentLength = chunk.content.length;
    if (config.minChunkSize && contentLength < config.minChunkSize) {
      return false;
    }
    if (config.maxChunkSize && contentLength > config.maxChunkSize) {
      return false;
    }
    if (!isValidContent(chunk.content)) {
      return false;
    }
    return true;
  }
  /**
   * Default chunking configuration (recursive, 512 chars, 128 overlap).
   */
  static getDefaultConfig() {
    return {
      strategy: "recursive",
      size: 512,
      overlap: 128,
      minChunkSize: 50,
      maxChunkSize: 2048,
      preserveStructure: false,
      separators: ["\n\n", "\n", ". ", " ", ""]
    };
  }
  /**
   * Per-format tweaks on top of the default configuration.
   */
  static getOptimizedConfig(documentFormat) {
    const baseConfig = _DocumentChunker.getDefaultConfig();
    switch (documentFormat) {
      case "markdown":
        return {
          ...baseConfig,
          strategy: "semantic",
          preserveStructure: true,
          separators: ["\n## ", "\n# ", "\n\n", "\n", ". ", " ", ""]
        };
      case "html":
        return {
          ...baseConfig,
          strategy: "semantic",
          preserveStructure: true
        };
      case "pdf":
        return {
          ...baseConfig,
          strategy: "paragraph",
          size: 768,
          overlap: 150
        };
      case "csv":
      case "json":
        return {
          ...baseConfig,
          strategy: "fixed_size",
          size: 256,
          overlap: 50
        };
      case "txt":
      default:
        return {
          ...baseConfig,
          strategy: "recursive"
        };
    }
  }
};
var EmbeddingService = class _EmbeddingService {
gateway;
env;
config;
requestIdCounter = 0;
constructor(env, config) {
this.env = env;
this.config = config;
this.gateway = createGateway(env, {
routing: {
primary: config.provider,
fallbacks: this.getFallbackProviders(config.provider),
loadBalancing: "least_latency",
failoverThreshold: 2
},
cache: {
enabled: true,
ttl: 86400,
// Cache embeddings for 24 hours
strategy: "exact",
keyPrefix: "emb:"
}
});
}
/**
* Generate embeddings for a single chunk
*/
async generateEmbedding(chunk) {
try {
const embedding = await this.generateEmbeddings([chunk]);
return embedding[0];
} catch (error) {
throw new EmbeddingError(
`Failed to generate embedding for chunk ${chunk.id}: ${error}`,
error
);
}
}
/**
* Generate embeddings for multiple chunks in batch
*/
async generateEmbeddings(chunks) {
if (chunks.length === 0) {
return [];
}
try {
const batchSize = Math.min(this.config.batchSize, 100);
const embeddings = [];
for (let i = 0; i < chunks.length; i += batchSize) {
const batchChunks = chunks.slice(i, i + batchSize);
const batchEmbeddings = await this.processBatch(batchChunks);
embeddings.push(...batchEmbeddings);
}
return embeddings;
} catch (error) {
throw new EmbeddingError(
`Failed to generate embeddings for batch: ${error}`,
error
);
}
}
/**
* Generate embeddings for text chunks with enhanced content
*/
async generateEnhancedEmbeddings(chunks) {
const enhancedChunks = chunks.map((chunk) => ({
...chunk,
content: this.enhanceChunkContent(chunk)
}));
return this.generateEmbeddings(enhancedChunks);
}
/**
* Generate embedding for a query
*/
async generateQueryEmbedding(query) {
try {
const embedding = await this.callEmbeddingModel([query]);
return embedding[0];
} catch (error) {
throw new EmbeddingError(
`Failed to generate embedding for query: ${error}`,
error
);
}
}
/**
* Process a batch of chunks
*/
async processBatch(chunks) {
const texts = chunks.map((chunk) => this.prepareTextForEmbedding(chunk.content));
let retries = 0;
while (retries <= this.config.maxRetries) {
try {
return await this.callEmbeddingModel(texts);
} catch (error) {
retries++;
if (retries > this.config.maxRetries) {
throw error;
}
const delay = Math.min(1e3 * Math.pow(2, retries - 1), 3e4);
await new Promise((resolve) => setTimeout(resolve, delay));
}
}
throw new Error("Max retries exceeded");
}
/**
* Call the embedding model via Cloudflare Workers AI or fallback providers
*/
async callEmbeddingModel(texts) {
try {
if (this.config.provider === "cloudflare" && "AI" in this.env) {
return await this.callCloudflareEmbedding(texts);
}
return await this.callProviderEmbedding(texts);
} catch (error) {
console.warn("Primary embedding provider failed, trying fallbacks:", error);
for (const fallbackProvider of this.getFallbackProviders(this.config.provider)) {
try {
const tempConfig = { ...this.config, provider: fallbackProvider };
const tempService = new _EmbeddingService(this.env, tempConfig);
return await tempService.callEmbeddingModel(texts);
} catch (fallbackError) {
console.warn(`Fallback provider ${fallbackProvider} failed:`, fallbackError);
}
}
throw new EmbeddingError(
`All embedding providers failed. Last error: ${error}`,
error
);
}
}
/**
* Call Cloudflare Workers AI for embeddings
*/
async callCloudflareEmbedding(texts) {
const ai = this.env.AI;
if (!ai) {
throw new Error("Cloudflare AI binding not available");
}
const embeddings = [];
const batchSize = Math.min(this.config.batchSize, 50);
for (let i = 0; i < texts.length; i += batchSize) {
const batch = texts.slice(i, i + batchSize);
try {
const response = await ai.run("@cf/baai/bge-base-en-v1.5", {
text: batch
});
if (!response.success) {
throw new Error(`Cloudflare AI embedding failed: ${JSON.stringify(response.errors)}`);
}
embeddings.push(...response.data);
} catch (error) {
throw new EmbeddingError(
`Cloudflare AI embedding batch failed: ${error}`,
error
);
}
}
return embeddings;
}
/**
* Call provider-specific embedding APIs through AI Gateway
*/
async callProviderEmbedding(texts) {
const embeddings = [];
for (const text of texts) {
try {
let embedding;
switch (this.config.provider) {
case "openai":
embedding = await this.callOpenAIEmbedding(text);
break;
case "cohere":
embedding = await this.callCohereEmbedding(text);
break;
case "google":
embedding = await this.callGoogleEmbedding(text);
break;
default:
embedding = await this.simulateEmbedding(text);
}
embeddings.push(embedding);
} catch (error) {
throw new EmbeddingError(
`Provider ${this.config.provider} embedding failed for text: ${error}`,
error
);
}
}
return embeddings;
}
/**
* Call OpenAI embedding API
*/
async callOpenAIEmbedding(text) {
const response = await fetch("https://api.openai.com/v1/embeddings", {
method: "POST",
headers: {
"Authorization": `Bearer ${this.env.OPENAI_API_KEY}`,
"Content-Type": "application/json"
},
body: JSON.stringify({
model: this.config.model,
input: text,
encoding_format: "float"
})
});
if (!response.ok) {
throw new Error(`OpenAI API error: ${response.status} ${response.statusText}`);
}
const data = await response.json();
return data.data[0].embedding;
}
/**
* Call Cohere embedding API
*/
async callCohereEmbedding(text) {
const response = await fetch("https://api.cohere.ai/v1/embed", {
method: "POST",
headers: {
"Authorization": `Bearer ${this.env.COHERE_API_KEY}`,
"Content-Type": "application/json"
},
body: JSON.stringify({
model: this.config.model,
texts: [text],
input_type: "search_document"
})
});
if (!response.ok) {
throw new Error(`Cohere API error: ${response.status} ${response.statusText}`);
}
const data = await response.json();
return data.embeddings[0];
}
/**
* Call Google embedding API
*/
async callGoogleEmbedding(text) {
const response = await fetch(`https://generativelanguage.googleapis.com/v1beta/models/embedding-001:embedContent?key=${this.env.GOOGLE_API_KEY}`, {
method: "POST",
headers: {
"Content-Type": "application/json"
},
body: JSON.stringify({
model: "models/embedding-001",
content: {
parts: [{ text }]
}
})
});
if (!response.ok) {
throw new Error(`Google API error: ${response.status} ${response.statusText}`);
}
const data = await response.json();
return data.embedding.values;
}
/**
* Simulate embedding generation (fallback for unsupported providers)
*/
async simulateEmbedding(text) {
const dimensions = this.config.dimensions;
const embedding = new Array(dimensions).fill(0);
let hash = 0;
for (let i = 0; i < text.length; i++) {
hash = (hash << 5) - hash + text.charCodeAt(i) & 4294967295;
}
for (let i = 0; i < dimensions; i++) {
const seed = hash ^ i * 2654435761;
embedding[i] = Math.sin(seed) * 0.5;
}
const norm = Math.sqrt(embedding.reduce((sum, val) => sum + val * val, 0));
return embedding.map((val) => val / norm);
}
/**
* Prepare text for embedding by cleaning and truncating
*/
prepareTextForEmbedding(text) {
let cleanText = text.replace(/\s+/g, " ").replace(/\n+/g, " ").trim();
const maxTokens = 8192;
const estimatedTokens = cleanText.length / 4;
if (estimatedTokens > maxTokens) {
const maxChars = maxTokens * 4;
cleanText = cleanText.substring(0, maxChars);
const lastSpaceIndex = cleanText.lastIndexOf(" ");
if (lastSpaceIndex > maxChars * 0.9) {
cleanText = cleanText.substring(0, lastSpaceIndex);
}
}
return cleanText;
}
/**
* Enhance chunk content with metadata for better embeddings
*/
enhanceChunkContent(chunk) {
const parts = [chunk.content];
if (chunk.metadata.title) {
parts.unshift(`Title: ${chunk.metadata.title}`);
}
if (chunk.metadata.headers && chunk.metadata.headers.length > 0) {
const headerText = chunk.metadata.headers.join(" > ");
parts.unshift(`Section: ${headerText}`);
}
const metadata = chunk.metadata;
if (metadata.custom) {
Object.entries(metadata.custom).forEach(([key, value]) => {
if (typeof value === "string" && value.length < 100) {
parts.push(`${key}: ${value}`);
}
});
}
return parts.join("\n\n");
}
/**
* Get fallback providers for embedding generation
*/
getFallbackProviders(primaryProvider) {
const allProviders = ["cloudflare", "openai", "cohere", "google", "mistral"];
return allProviders.filter((provider) => provider !== primaryProvider);
}
/**
* Validate embedding dimensions
*/
validateEmbedding(embedding) {
if (!Array.isArray(embedding)) {
return false;
}
if (embedding.length !== this.config.dimensions) {
return false;
}
return embedding.every(
(val) => typeof val === "number" && isFinite(val) && !isNaN(val)
);
}
/**
* Calculate embedding similarity
*/
calculateSimilarity(embedding1, embedding2) {
if (embedding1.length !== embedding2.length) {
throw new Error("Embeddings must have the same dimensions");
}
let dotProduct = 0;
let norm1 = 0;
let norm2 = 0;
for (let i = 0; i < embedding1.length; i++) {
dotProduct += embedding1[i] * embedding2[i];
norm1 += embedding1[i] * embedding1[i];
norm2 += embedding2[i] * embedding2[i];
}
if (norm1 === 0 || norm2 === 0) {
return 0;
}
return dotProduct / (Math.sqrt(norm1) * Math.sqrt(norm2));
}
/**
* Get embedding statistics for monitoring
*/
getEmbeddingStats() {
return {
provider: this.config.provider,
model: this.config.model,
dimensions: this.config.dimensions,
batchSize: this.config.batchSize
};
}
/**
* Update embedding configuration
*/
updateConfig(newConfig) {
this.config = { ...this.config, ...newConfig };
if (newConfig.provider && newConfig.provider !== this.config.provider) {
this.gateway = createGateway(this.env, {
routing: {
primary: this.config.provider,
fallbacks: this.getFallbackProviders(this.config.provider),
loadBalancing: "least_latency",
failoverThreshold: 2
},
cache: {
enabled: true,
ttl: 86400,
strategy: "exact",
keyPrefix: "emb:"
}
});
}
}
/**
* Create embedding service with default configuration
*/
static createDefault(env) {
const preferCloudflare = "AI" in env;
const defaultConfig = {
model: env.RAG_DEFAULT_EMBEDDING_MODEL || (preferCloudflare ? "@cf/baai/bge-base-en-v1.5" : "text-embedding-ada-002"),
provider: preferCloudflare ? "cloudflare" : "openai",
dimensions: preferCloudflare ? 768 : 1536,
batchSize: parseInt(env.RAG_BATCH_SIZE || (preferCloudflare ? "50" : "100")),
maxRetries: 3,
timeout: 3e4
};
return new _EmbeddingService(env, defaultConfig);
}
/**
* Create embedding service for specific model
*/
static createForModel(env, model, provider) {
const modelConfigs = {
// OpenAI