UNPKG

@neureus/rag

Version:

AutoRAG - Zero-setup knowledge integration with Cloudflare AI and Vectorize

github.com/nexus-ai/nexus-cloud-platform

nexus-ai/nexus-cloud-platform

1,648 lines (1,645 loc) • 153 kB

JavaScript

import { z } from 'zod'; import TurndownService from 'turndown'; import * as mammoth from 'mammoth'; import pdfParse from 'pdf-parse'; import csvParser from 'csv-parser'; import MarkdownIt from 'markdown-it'; import { JSDOM } from 'jsdom'; import crypto2 from 'crypto'; import { createGateway } from '@neureus/ai-gateway'; import { createVectorDB } from '@neureus/vector-db'; // @neureus/rag - Production-ready RAG implementation for Cloudflare Workers var __require = /* @__PURE__ */ ((x) => typeof require !== "undefined" ? require : typeof Proxy !== "undefined" ? new Proxy(x, { get: (a, b) => (typeof require !== "undefined" ? require : a)[b] }) : x)(function(x) { if (typeof require !== "undefined") return require.apply(this, arguments); throw Error('Dynamic require of "' + x + '" is not supported'); }); var DocumentSchema = z.object({ id: z.string(), content: z.string(), metadata: z.object({ id: z.string(), title: z.string().optional(), author: z.string().optional(), createdAt: z.number().optional(), updatedAt: z.number().optional(), url: z.string().optional(), filePath: z.string().optional(), fileSize: z.number().optional(), format: z.enum(["markdown", "html", "pdf", "txt", "docx", "csv", "json", "xml", "image", "audio", "video"]), source: z.enum(["file", "url", "text", "s3", "r2", "github", "webhook", "email", "auto"]), tags: z.array(z.string()).optional(), language: z.string().optional(), custom: z.record(z.string(), z.unknown()).optional() }) }); var ChunkSchema = z.object({ id: z.string(), content: z.string(), tokens: z.number(), embedding: z.array(z.number()).optional(), metadata: z.object({ chunkId: z.string(), documentId: z.string(), index: z.number(), startIndex: z.number(), endIndex: z.number(), title: z.string().optional(), section: z.string().optional(), headers: z.array(z.string()).optional(), pageNumber: z.number().optional(), custom: z.record(z.string(), z.unknown()).optional() }) }); var RAGConfigSchema = z.object({ name: z.string(), description: z.string().optional(), embedding: z.object({ model: z.string(), provider: z.enum(["openai", "anthropic", "google", "cloudflare", "cohere", "mistral"]), dimensions: z.number().default(1536), batchSize: z.number().default(100), maxRetries: z.number().default(3), timeout: z.number().default(3e4) }), chunking: z.object({ strategy: z.enum(["fixed_size", "semantic", "recursive", "sentence", "paragraph", "custom"]).default("recursive"), size: z.number().default(512), overlap: z.number().default(128), minChunkSize: z.number().default(50), maxChunkSize: z.number().default(2048), separators: z.array(z.string()).optional(), preserveStructure: z.boolean().default(false) }), retrieval: z.object({ topK: z.number().default(5), minSimilarity: z.number().default(0.7), hybridWeight: z.number().default(0.7), // Weight for vector vs keyword search rerankModel: z.string().optional(), maxContextTokens: z.number().default(4e3) }), generation: z.object({ model: z.string(), provider: z.enum(["openai", "anthropic", "google", "cloudflare", "cohere", "mistral"]), temperature: z.number().default(0.1), maxTokens: z.number().default(1e3), systemPrompt: z.string().optional(), includeSource: z.boolean().default(true), streaming: z.boolean().default(false) }), analytics: z.object({ enabled: z.boolean().default(true), trackQueries: z.boolean().default(true), trackPerformance: z.boolean().default(true), sampleRate: z.number().default(1) }) }); var IngestionRequestSchema = z.object({ source: z.string(), // URL, file path, or text content type: z.enum(["file", "url", "text", "s3", "r2", "github", "auto", "webhook", "email"]).default("file"), format: z.enum(["markdown", "html", "pdf", "txt", "docx", "csv", "json", "xml", "image", "audio", "video"]).optional(), metadata: z.record(z.string(), z.unknown()).optional(), recursive: z.boolean().default(false), // For directories/repos filters: z.array(z.string()).optional(), // File patterns to include excludes: z.array(z.string()).optional() // File patterns to exclude }); var QueryRequestSchema = z.object({ query: z.string(), topK: z.number().default(5), minSimilarity: z.number().default(0.7), filter: z.record(z.string(), z.unknown()).optional(), namespace: z.string().optional(), includeSource: z.boolean().default(true), streaming: z.boolean().default(false), conversationHistory: z.array(z.object({ role: z.enum(["system", "user", "assistant"]), content: z.string() })).optional(), customPrompt: z.string().optional(), rerankResults: z.boolean().default(true) }); var RAGError = class extends Error { constructor(message, code, statusCode = 500, pipelineName, originalError) { super(message); this.code = code; this.statusCode = statusCode; this.pipelineName = pipelineName; this.originalError = originalError; this.name = "RAGError"; } }; var DocumentProcessingError = class extends RAGError { constructor(documentId, message, originalError) { super( `Failed to process document ${documentId}: ${message}`, "DOCUMENT_PROCESSING_ERROR", 422, void 0, originalError ); this.name = "DocumentProcessingError"; } }; var EmbeddingError = class extends RAGError { constructor(message, originalError) { super( `Embedding generation failed: ${message}`, "EMBEDDING_ERROR", 500, void 0, originalError ); this.name = "EmbeddingError"; } }; var RetrievalError = class extends RAGError { constructor(message, originalError) { super( `Context retrieval failed: ${message}`, "RETRIEVAL_ERROR", 500, void 0, originalError ); this.name = "RetrievalError"; } }; var GenerationError = class extends RAGError { constructor(message, originalError) { super( `Answer generation failed: ${message}`, "GENERATION_ERROR", 500, void 0, originalError ); this.name = "GenerationError"; } }; function generateId(prefix = "") { const timestamp = Date.now().toString(36); const random = crypto2.randomBytes(6).toString("hex"); return prefix ? `${prefix}_${timestamp}_${random}` : `${timestamp}_${random}`; } function sanitizeText(text) { return text.replace(/\r\n/g, "\n").replace(/\r/g, "\n").replace(/\n{3,}/g, "\n\n").replace(/[ \t]{2,}/g, " ").replace(/^\s+|\s+$/gm, "").trim(); } function extractHeaders(content) { const headers = []; const mdHeaders = content.match(/^#{1,6}\s+(.+)$/gm); if (mdHeaders) { headers.push(...mdHeaders.map((h) => h.replace(/^#+\s+/, "").trim())); } if (headers.length === 0) { const htmlHeaders = content.match(/<h[1-6][^>]*>(.*?)<\/h[1-6]>/gi); if (htmlHeaders) { headers.push(...htmlHeaders.map((h) => h.replace(/<[^>]*>/g, "").trim())); } } return headers.filter((h) => h.length > 0); } function estimateTokens(text) { return Math.ceil(text.length / 4); } function splitOnWordBoundary(text, maxLength) { if (text.length <= maxLength) { return [text]; } const chunks = []; let currentChunk = ""; const words = text.split(/\s+/); for (const word of words) { const testChunk = currentChunk ? `${currentChunk} ${word}` : word; if (testChunk.length <= maxLength) { currentChunk = testChunk; } else { if (currentChunk) { chunks.push(currentChunk.trim()); currentChunk = word; } else { chunks.push(word); currentChunk = ""; } } } if (currentChunk) { chunks.push(currentChunk.trim()); } return chunks.filter((chunk) => chunk.length > 0); } function createOverlappingChunks(text, chunkSize, overlapSize, splitOnWords = true) { if (text.length <= chunkSize) { return [{ content: text, startIndex: 0, endIndex: text.length }]; } const chunks = []; let currentIndex = 0; while (currentIndex < text.length) { const endIndex = Math.min(currentIndex + chunkSize, text.length); let chunkText = text.slice(currentIndex, endIndex); if (splitOnWords && endIndex < text.length) { const lastSpaceIndex = chunkText.lastIndexOf(" "); if (lastSpaceIndex > chunkSize * 0.8) { chunkText = chunkText.slice(0, lastSpaceIndex); } } chunks.push({ content: chunkText.trim(), startIndex: currentIndex, endIndex: currentIndex + chunkText.length }); const actualChunkSize = chunkText.length; const step = Math.max(actualChunkSize - overlapSize, 1); currentIndex += step; if (currentIndex >= text.length) { break; } } return chunks.filter((chunk) => chunk.content.length > 0); } function findSentenceBoundaries(text) { const boundaries = [0]; const sentenceEnders = /[.!?]+\s+/g; let match; while ((match = sentenceEnders.exec(text)) !== null) { boundaries.push(match.index + match[0].length); } if (boundaries[boundaries.length - 1] !== text.length) { boundaries.push(text.length); } return boundaries; } function findParagraphBoundaries(text) { const boundaries = [0]; const paragraphBreaks = /\n\s*\n/g; let match; while ((match = paragraphBreaks.exec(text)) !== null) { boundaries.push(match.index + match[0].length); } if (boundaries[boundaries.length - 1] !== text.length) { boundaries.push(text.length); } return boundaries; } function cosineSimilarity(a, b) { if (a.length !== b.length) { throw new Error("Vectors must have the same length"); } let dotProduct = 0; let normA = 0; let normB = 0; for (let i = 0; i < a.length; i++) { dotProduct += a[i] * b[i]; normA += a[i] * a[i]; normB += b[i] * b[i]; } normA = Math.sqrt(normA); normB = Math.sqrt(normB); if (normA === 0 || normB === 0) { return 0; } return dotProduct / (normA * normB); } function textSimilarity(text1, text2) { const words1 = new Set(text1.toLowerCase().split(/\s+/)); const words2 = new Set(text2.toLowerCase().split(/\s+/)); const intersection = new Set([...words1].filter((x) => words2.has(x))); const union = /* @__PURE__ */ new Set([...words1, ...words2]); return intersection.size / union.size; } function isValidContent(content) { return content.trim().length > 0; } function truncateText(text, maxLength) { if (text.length <= maxLength) { return text; } const truncated = text.slice(0, maxLength); const lastSpaceIndex = truncated.lastIndexOf(" "); if (lastSpaceIndex > maxLength * 0.8) { return truncated.slice(0, lastSpaceIndex) + "..."; } return truncated + "..."; } function normalizeQuery(query) { return query.trim().replace(/\s+/g, " ").replace(/[^\w\s\-_.?!]/g, "").toLowerCase(); } function extractKeywords(text, minLength = 3, maxCount = 20) { const words = text.toLowerCase().replace(/[^\w\s]/g, " ").split(/\s+/).filter((word) => word.length >= minLength); const frequency = {}; words.forEach((word) => { frequency[word] = (frequency[word] || 0) + 1; }); return Object.entries(frequency).sort(([, a], [, b]) => b - a).slice(0, maxCount).map(([word]) => word); } function formatDuration(milliseconds) { if (milliseconds < 1e3) { return `${milliseconds}ms`; } const seconds = milliseconds / 1e3; if (seconds < 60) { return `${seconds.toFixed(2)}s`; } const minutes = seconds / 60; return `${minutes.toFixed(2)}m`; } function formatFileSize(bytes) { if (bytes === 0) return "0 B"; const k = 1024; const sizes = ["B", "KB", "MB", "GB", "TB"]; const i = Math.floor(Math.log(bytes) / Math.log(k)); return `${parseFloat((bytes / Math.pow(k, i)).toFixed(2))} ${sizes[i]}`; } function hashContent(content) { return crypto2.createHash("sha256").update(content).digest("hex").slice(0, 16); } // src/processing/document-loader.ts var DocumentLoader = class { turndownService; markdownIt; env; constructor(env) { this.env = env; this.turndownService = new TurndownService({ headingStyle: "atx", hr: "---", bulletListMarker: "-", codeBlockStyle: "fenced", fence: "```", emDelimiter: "*", strongDelimiter: "**" }); this.markdownIt = new MarkdownIt({ html: true, linkify: true, typographer: true }); } /** * Load a document from various sources */ async loadDocument(request) { try { const { source, type, format } = request; let content; let detectedFormat; let metadata; switch (type) { case "text": content = source; detectedFormat = format || "txt"; metadata = this.createMetadata({ id: generateId(), format: detectedFormat, source: "text", ...request.metadata }); break; case "file": ({ content, format: detectedFormat, metadata } = await this.loadFromFile(source, request)); break; case "url": ({ content, format: detectedFormat, metadata } = await this.loadFromUrl(source, request)); break; case "s3": case "r2": ({ content, format: detectedFormat, metadata } = await this.loadFromStorage(source, type, request)); break; case "github": ({ content, format: detectedFormat, metadata } = await this.loadFromGitHub(source, request)); break; default: throw new DocumentProcessingError("unknown", `Unsupported source type: ${type}`); } const processedContent = await this.processContent(content, detectedFormat); return { id: metadata.id, content: processedContent, metadata }; } catch (error) { if (error instanceof DocumentProcessingError) { throw error; } throw new DocumentProcessingError("unknown", `Failed to load document: ${error}`, error); } } /** * Load multiple documents in batch */ async loadDocuments(requests) { const results = await Promise.allSettled( requests.map((request) => this.loadDocument(request)) ); const documents = []; const errors = []; results.forEach((result, index) => { if (result.status === "fulfilled") { documents.push(result.value); } else { errors.push(`Document ${index}: ${result.reason.message}`); } }); if (errors.length > 0) { console.warn("Some documents failed to load:", errors); } return documents; } /** * Load document from file path */ async loadFromFile(filePath, request) { throw new DocumentProcessingError("file", "File system access not available in Cloudflare Workers. Use R2 storage instead."); } /** * Load document from URL */ async loadFromUrl(url, request) { try { const response = await fetch(url, { headers: { "User-Agent": "Nexus RAG Pipeline/1.0", "Accept": "text/html,application/xhtml+xml,application/xml,text/plain,application/pdf,*/*" } }); if (!response.ok) { throw new Error(`HTTP ${response.status}: ${response.statusText}`); } const contentType = response.headers.get("content-type") || ""; const format = this.detectFormatFromContentType(contentType) || request.format || "html"; let content; if (format === "pdf") { const buffer = await response.arrayBuffer(); content = await this.extractFromPDF(buffer); } else { content = await response.text(); } const metadata = this.createMetadata({ id: generateId(), url, format, source: "url", title: this.extractTitleFromUrl(url), fileSize: parseInt(response.headers.get("content-length") || "0"), ...request.metadata }); return { content, format, metadata }; } catch (error) { throw new DocumentProcessingError("url", `Failed to fetch from URL: ${error}`, error); } } /** * Load document from R2/S3 storage */ async loadFromStorage(key, storageType, request) { try { if (storageType === "r2") { const object = await this.env.RAG_BUCKET.get(key); if (!object) { throw new Error(`Object not found: ${key}`); } const format = request.format || this.detectFormatFromFilename(key); let content; if (format === "pdf") { const buffer = await object.arrayBuffer(); content = await this.extractFromPDF(buffer); } else { content = await object.text(); } const metadata = this.createMetadata({ id: generateId(), filePath: key, format, source: "r2", fileSize: object.size, title: this.extractTitleFromFilename(key), ...request.metadata }); return { content, format, metadata }; } else { throw new DocumentProcessingError("s3", "S3 support not implemented yet"); } } catch (error) { throw new DocumentProcessingError(storageType, `Failed to load from ${storageType}: ${error}`, error); } } /** * Load document from GitHub */ async loadFromGitHub(repoPath, request) { try { const [owner, repo, ...pathParts] = repoPath.split("/"); const filePath = pathParts.join("/"); const url = `https://raw.githubusercontent.com/${owner}/${repo}/main/${filePath}`; return await this.loadFromUrl(url, { ...request, type: "url", metadata: { ...request.metadata, source: "github", url } }); } catch (error) { throw new DocumentProcessingError("github", `Failed to load from GitHub: ${error}`, error); } } /** * Process content based on format */ async processContent(content, format) { try { switch (format) { case "html": return this.processHTML(content); case "pdf": return this.processPDF(content); // Content is already extracted case "docx": return await this.processDocx(content); case "markdown": return this.processMarkdown(content); case "csv": return await this.processCSV(content); case "json": return this.processJSON(content); case "xml": return this.processXML(content); case "txt": default: return sanitizeText(content); } } catch (error) { throw new DocumentProcessingError(format, `Failed to process ${format} content: ${error}`, error); } } /** * Process HTML content */ processHTML(html) { const dom = new JSDOM(html); const document = dom.window.document; const scripts = document.querySelectorAll("script, style, nav, footer, aside"); scripts.forEach((el) => el.remove()); let content = document.querySelector("main, article, .content, #content, .post, .entry"); if (!content) { content = document.body; } const markdown = this.turndownService.turndown(content.innerHTML); return sanitizeText(markdown); } /** * Process PDF content (already extracted) */ processPDF(content) { return sanitizeText(content); } /** * Extract text from PDF buffer */ async extractFromPDF(buffer) { try { const data = await pdfParse(Buffer.from(buffer)); return data.text; } catch (error) { throw new Error(`PDF parsing failed: ${error}`); } } /** * Process DOCX content */ async processDocx(buffer) { try { const bufferData = Buffer.from(buffer); const result = await mammoth.extractRawText({ arrayBuffer: bufferData.buffer }); return sanitizeText(result.value); } catch (error) { throw new Error(`DOCX parsing failed: ${error}`); } } /** * Process Markdown content */ processMarkdown(markdown) { const html = this.markdownIt.render(markdown); const dom = new JSDOM(html); const text = dom.window.document.body.textContent || ""; return sanitizeText(text); } /** * Process CSV content */ async processCSV(csvContent) { return new Promise((resolve, reject) => { const rows = []; const stream = __require("stream").Readable.from([csvContent]); stream.pipe(csvParser()).on("data", (row) => rows.push(row)).on("end", () => { const text = rows.map( (row) => Object.entries(row).map(([key, value]) => `${key}: ${value}`).join(", ") ).join("\n"); resolve(sanitizeText(text)); }).on("error", reject); }); } /** * Process JSON content */ processJSON(jsonContent) { try { const data = JSON.parse(jsonContent); const text = this.jsonToText(data); return sanitizeText(text); } catch (error) { throw new Error(`JSON parsing failed: ${error}`); } } /** * Process XML content */ processXML(xmlContent) { try { const dom = new JSDOM(xmlContent, { contentType: "text/xml" }); const text = dom.window.document.textContent || ""; return sanitizeText(text); } catch (error) { throw new Error(`XML parsing failed: ${error}`); } } /** * Convert JSON to readable text */ jsonToText(obj, prefix = "") { if (typeof obj !== "object" || obj === null) { return String(obj); } const lines = []; if (Array.isArray(obj)) { obj.forEach((item, index) => { const itemPrefix = `${prefix}[${index}]`; lines.push(`${itemPrefix}: ${this.jsonToText(item, itemPrefix)}`); }); } else { Object.entries(obj).forEach(([key, value]) => { const itemPrefix = prefix ? `${prefix}.${key}` : key; if (typeof value === "object" && value !== null) { lines.push(`${itemPrefix}:`); lines.push(this.jsonToText(value, itemPrefix)); } else { lines.push(`${itemPrefix}: ${value}`); } }); } return lines.join("\n"); } /** * Create document metadata */ createMetadata(partial) { const timestamp = Date.now(); return { id: partial.id || generateId(), format: partial.format, source: partial.source, createdAt: timestamp, updatedAt: timestamp, ...partial }; } /** * Detect format from content type */ detectFormatFromContentType(contentType) { if (contentType.includes("text/html")) return "html"; if (contentType.includes("text/markdown")) return "markdown"; if (contentType.includes("text/plain")) return "txt"; if (contentType.includes("application/pdf")) return "pdf"; if (contentType.includes("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) return "docx"; if (contentType.includes("text/csv")) return "csv"; if (contentType.includes("application/json")) return "json"; if (contentType.includes("application/xml") || contentType.includes("text/xml")) return "xml"; return null; } /** * Detect format from filename */ detectFormatFromFilename(filename) { const ext = filename.split(".").pop()?.toLowerCase(); switch (ext) { case "md": return "markdown"; case "html": case "htm": return "html"; case "pdf": return "pdf"; case "docx": return "docx"; case "csv": return "csv"; case "json": return "json"; case "xml": return "xml"; case "txt": default: return "txt"; } } /** * Extract title from URL */ extractTitleFromUrl(url) { try { const urlObj = new URL(url); const pathname = urlObj.pathname; const filename = pathname.split("/").pop() || ""; return filename.split(".")[0] || urlObj.hostname; } catch { return "Unknown Document"; } } /** * Extract title from filename */ extractTitleFromFilename(filename) { const name = filename.split("/").pop() || filename; return name.split(".")[0] || "Unknown Document"; } }; // src/processing/chunking.ts var DocumentChunker = class _DocumentChunker { /** * Chunk a document using the specified strategy */ async chunkDocument(document, config) { try { switch (config.strategy) { case "fixed_size": return this.fixedSizeChunking(document, config); case "semantic": return this.semanticChunking(document, config); case "recursive": return this.recursiveChunking(document, config); case "sentence": return this.sentenceChunking(document, config); case "paragraph": return this.paragraphChunking(document, config); case "custom": if (config.customSplitter) { return config.customSplitter(document.content, config); } throw new Error("Custom splitter not provided"); default: throw new Error(`Unsupported chunking strategy: ${config.strategy}`); } } catch (error) { throw new DocumentProcessingError( document.id, `Chunking failed: ${error}`, error ); } } /** * Chunk multiple documents */ async chunkDocuments(documents, config) { const allChunks = []; for (const document of documents) { try { const chunks = await this.chunkDocument(document, config); allChunks.push(...chunks); } catch (error) { console.error(`Failed to chunk document ${document.id}:`, error); } } return allChunks; } /** * Fixed-size chunking with overlap */ fixedSizeChunking(document, config) { const { content } = document; const chunks = createOverlappingChunks(content, config.size, config.overlap); return chunks.map((chunk, index) => this.createChunk( chunk.content, document, index, chunk.startIndex, chunk.endIndex )).filter((chunk) => this.isValidChunk(chunk, config)); } /** * Semantic chunking based on content structure */ semanticChunking(document, config) { const { content } = document; const headers = extractHeaders(content); if (headers.length === 0) { return this.paragraphChunking(document, config); } const chunks = []; const lines = content.split("\n"); let currentSection = ""; let currentStartIndex = 0; let chunkIndex = 0; for (let i = 0; i < lines.length; i++) { const line = lines[i]; const isHeader = /^#{1,6}\s+/.test(line); if (isHeader && currentSection.length > 0) { const sectionContent = currentSection.trim(); if (sectionContent.length >= (config.minChunkSize || 50)) { const chunk = this.createChunk( sectionContent, document, chunkIndex++, currentStartIndex, currentStartIndex + currentSection.length ); if (this.isValidChunk(chunk, config)) { chunks.push(chunk); } } currentSection = line + "\n"; currentStartIndex = content.indexOf(currentSection, currentStartIndex); } else { currentSection += line + "\n"; } } if (currentSection.trim().length >= (config.minChunkSize || 50)) { const chunk = this.createChunk( currentSection.trim(), document, chunkIndex, currentStartIndex, currentStartIndex + currentSection.length ); if (this.isValidChunk(chunk, config)) { chunks.push(chunk); } } return this.splitLargeChunks(chunks, config); } /** * Recursive chunking with hierarchical splitting */ recursiveChunking(document, config) { const separators = config.separators || [ "\n\n", // Paragraphs "\n", // Lines ". ", // Sentences " ", // Words "" // Characters ]; return this.recursiveSplit(document.content, separators, document, config); } /** * Recursively split text using hierarchical separators */ recursiveSplit(text, separators, document, config, currentIndex = 0) { if (text.length <= config.size || separators.length === 0) { const chunk = this.createChunk(text, document, 0, currentIndex, currentIndex + text.length); return this.isValidChunk(chunk, config) ? [chunk] : []; } const separator = separators[0]; const parts = separator === "" ? text.split("") : text.split(separator); if (parts.length === 1) { return this.recursiveSplit(text, separators.slice(1), document, config, currentIndex); } const chunks = []; let currentText = ""; let currentStart = currentIndex; let chunkIndex = 0; for (let i = 0; i < parts.length; i++) { const part = parts[i]; const testText = currentText + (currentText ? separator : "") + part; if (testText.length <= config.size) { currentText = testText; } else { if (currentText.trim().length > 0) { const chunk = this.createChunk( currentText.trim(), document, chunkIndex++, currentStart, currentStart + currentText.length ); if (this.isValidChunk(chunk, config)) { chunks.push(chunk); } } currentStart += currentText.length + (currentText ? separator.length : 0); currentText = part; if (part.length > config.size) { const subChunks = this.recursiveSplit( part, separators.slice(1), document, config, currentStart ); chunks.push(...subChunks); currentText = ""; currentStart += part.length; } } } if (currentText.trim().length > 0) { const chunk = this.createChunk( currentText.trim(), document, chunkIndex, currentStart, currentStart + currentText.length ); if (this.isValidChunk(chunk, config)) { chunks.push(chunk); } } return chunks; } /** * Sentence-based chunking */ sentenceChunking(document, config) { const { content } = document; const boundaries = findSentenceBoundaries(content); const chunks = []; let currentChunk = ""; let currentStart = 0; let chunkIndex = 0; for (let i = 1; i < boundaries.length; i++) { const sentenceStart = boundaries[i - 1]; const sentenceEnd = boundaries[i]; const sentence = content.slice(sentenceStart, sentenceEnd).trim(); if (!sentence) continue; const testChunk = currentChunk + (currentChunk ? " " : "") + sentence; if (testChunk.length <= config.size) { currentChunk = testChunk; } else { if (currentChunk) { const chunk = this.createChunk( currentChunk, document, chunkIndex++, currentStart, currentStart + currentChunk.length ); if (this.isValidChunk(chunk, config)) { chunks.push(chunk); } } currentStart = sentenceStart; currentChunk = sentence; if (sentence.length > config.size) { const wordChunks = splitOnWordBoundary(sentence, config.size); for (const wordChunk of wordChunks) { const chunk = this.createChunk( wordChunk, document, chunkIndex++, currentStart, currentStart + wordChunk.length ); if (this.isValidChunk(chunk, config)) { chunks.push(chunk); } currentStart += wordChunk.length; } currentChunk = ""; } } } if (currentChunk) { const chunk = this.createChunk( currentChunk, document, chunkIndex, currentStart, currentStart + currentChunk.length ); if (this.isValidChunk(chunk, config)) { chunks.push(chunk); } } return chunks; } /** * Paragraph-based chunking */ paragraphChunking(document, config) { const { content } = document; const boundaries = findParagraphBoundaries(content); const chunks = []; let currentChunk = ""; let currentStart = 0; let chunkIndex = 0; for (let i = 1; i < boundaries.length; i++) { const paragraphStart = boundaries[i - 1]; const paragraphEnd = boundaries[i]; const paragraph = content.slice(paragraphStart, paragraphEnd).trim(); if (!paragraph) continue; const testChunk = currentChunk + (currentChunk ? "\n\n" : "") + paragraph; if (testChunk.length <= config.size) { currentChunk = testChunk; } else { if (currentChunk) { const chunk = this.createChunk( currentChunk, document, chunkIndex++, currentStart, currentStart + currentChunk.length ); if (this.isValidChunk(chunk, config)) { chunks.push(chunk); } } currentStart = paragraphStart; currentChunk = paragraph; if (paragraph.length > config.size) { const subChunks = this.fixedSizeChunking( { ...document, content: paragraph }, { ...config, strategy: "fixed_size" } ); chunks.push(...subChunks.map((chunk) => ({ ...chunk, metadata: { ...chunk.metadata, documentId: document.id } }))); currentChunk = ""; currentStart += paragraph.length; } } } if (currentChunk) { const chunk = this.createChunk( currentChunk, document, chunkIndex, currentStart, currentStart + currentChunk.length ); if (this.isValidChunk(chunk, config)) { chunks.push(chunk); } } return chunks; } /** * Split chunks that are too large */ splitLargeChunks(chunks, config) { const result = []; for (const chunk of chunks) { if (chunk.content.length <= config.maxChunkSize) { result.push(chunk); } else { const subChunks = createOverlappingChunks( chunk.content, config.size, config.overlap ); let subIndex = 0; for (const subChunk of subChunks) { const newChunk = this.createChunk( subChunk.content, { id: chunk.metadata.documentId }, chunk.metadata.index + subIndex / 1e3, // Maintain ordering chunk.metadata.startIndex + subChunk.startIndex, chunk.metadata.startIndex + subChunk.endIndex, { ...chunk.metadata, chunkId: `${chunk.id}_${subIndex}` } ); if (this.isValidChunk(newChunk, config)) { result.push(newChunk); } subIndex++; } } } return result; } /** * Create a chunk object */ createChunk(content, document, index, startIndex, endIndex, metadataOverride) { const chunkId = generateId("chunk"); const headers = extractHeaders(content); return { id: chunkId, content: content.trim(), tokens: estimateTokens(content), metadata: { chunkId, documentId: document.id, index, startIndex, endIndex, title: document.metadata.title, section: headers[0] || void 0, headers: headers.length > 0 ? headers : void 0, ...metadataOverride } }; } /** * Validate that a chunk meets the configuration requirements */ isValidChunk(chunk, config) { const contentLength = chunk.content.length; if (config.minChunkSize && contentLength < config.minChunkSize) { return false; } if (config.maxChunkSize && contentLength > config.maxChunkSize) { return false; } if (!isValidContent(chunk.content)) { return false; } return true; } /** * Get default chunking configuration */ static getDefaultConfig() { return { strategy: "recursive", size: 512, overlap: 128, minChunkSize: 50, maxChunkSize: 2048, preserveStructure: false, separators: ["\n\n", "\n", ". ", " ", ""] }; } /** * Get optimized configuration for specific document types */ static getOptimizedConfig(documentFormat) { const baseConfig = _DocumentChunker.getDefaultConfig(); switch (documentFormat) { case "markdown": return { ...baseConfig, strategy: "semantic", preserveStructure: true, separators: ["\n## ", "\n# ", "\n\n", "\n", ". ", " ", ""] }; case "html": return { ...baseConfig, strategy: "semantic", preserveStructure: true }; case "pdf": return { ...baseConfig, strategy: "paragraph", size: 768, overlap: 150 }; case "csv": case "json": return { ...baseConfig, strategy: "fixed_size", size: 256, overlap: 50 }; case "txt": default: return { ...baseConfig, strategy: "recursive" }; } } }; var EmbeddingService = class _EmbeddingService { gateway; env; config; requestIdCounter = 0; constructor(env, config) { this.env = env; this.config = config; this.gateway = createGateway(env, { routing: { primary: config.provider, fallbacks: this.getFallbackProviders(config.provider), loadBalancing: "least_latency", failoverThreshold: 2 }, cache: { enabled: true, ttl: 86400, // Cache embeddings for 24 hours strategy: "exact", keyPrefix: "emb:" } }); } /** * Generate embeddings for a single chunk */ async generateEmbedding(chunk) { try { const embedding = await this.generateEmbeddings([chunk]); return embedding[0]; } catch (error) { throw new EmbeddingError( `Failed to generate embedding for chunk ${chunk.id}: ${error}`, error ); } } /** * Generate embeddings for multiple chunks in batch */ async generateEmbeddings(chunks) { if (chunks.length === 0) { return []; } try { const batchSize = Math.min(this.config.batchSize, 100); const embeddings = []; for (let i = 0; i < chunks.length; i += batchSize) { const batchChunks = chunks.slice(i, i + batchSize); const batchEmbeddings = await this.processBatch(batchChunks); embeddings.push(...batchEmbeddings); } return embeddings; } catch (error) { throw new EmbeddingError( `Failed to generate embeddings for batch: ${error}`, error ); } } /** * Generate embeddings for text chunks with enhanced content */ async generateEnhancedEmbeddings(chunks) { const enhancedChunks = chunks.map((chunk) => ({ ...chunk, content: this.enhanceChunkContent(chunk) })); return this.generateEmbeddings(enhancedChunks); } /** * Generate embedding for a query */ async generateQueryEmbedding(query) { try { const embedding = await this.callEmbeddingModel([query]); return embedding[0]; } catch (error) { throw new EmbeddingError( `Failed to generate embedding for query: ${error}`, error ); } } /** * Process a batch of chunks */ async processBatch(chunks) { const texts = chunks.map((chunk) => this.prepareTextForEmbedding(chunk.content)); let retries = 0; while (retries <= this.config.maxRetries) { try { return await this.callEmbeddingModel(texts); } catch (error) { retries++; if (retries > this.config.maxRetries) { throw error; } const delay = Math.min(1e3 * Math.pow(2, retries - 1), 3e4); await new Promise((resolve) => setTimeout(resolve, delay)); } } throw new Error("Max retries exceeded"); } /** * Call the embedding model via Cloudflare Workers AI or fallback providers */ async callEmbeddingModel(texts) { try { if (this.config.provider === "cloudflare" && "AI" in this.env) { return await this.callCloudflareEmbedding(texts); } return await this.callProviderEmbedding(texts); } catch (error) { console.warn("Primary embedding provider failed, trying fallbacks:", error); for (const fallbackProvider of this.getFallbackProviders(this.config.provider)) { try { const tempConfig = { ...this.config, provider: fallbackProvider }; const tempService = new _EmbeddingService(this.env, tempConfig); return await tempService.callEmbeddingModel(texts); } catch (fallbackError) { console.warn(`Fallback provider ${fallbackProvider} failed:`, fallbackError); } } throw new EmbeddingError( `All embedding providers failed. Last error: ${error}`, error ); } } /** * Call Cloudflare Workers AI for embeddings */ async callCloudflareEmbedding(texts) { const ai = this.env.AI; if (!ai) { throw new Error("Cloudflare AI binding not available"); } const embeddings = []; const batchSize = Math.min(this.config.batchSize, 50); for (let i = 0; i < texts.length; i += batchSize) { const batch = texts.slice(i, i + batchSize); try { const response = await ai.run("@cf/baai/bge-base-en-v1.5", { text: batch }); if (!response.success) { throw new Error(`Cloudflare AI embedding failed: ${JSON.stringify(response.errors)}`); } embeddings.push(...response.data); } catch (error) { throw new EmbeddingError( `Cloudflare AI embedding batch failed: ${error}`, error ); } } return embeddings; } /** * Call provider-specific embedding APIs through AI Gateway */ async callProviderEmbedding(texts) { const embeddings = []; for (const text of texts) { try { let embedding; switch (this.config.provider) { case "openai": embedding = await this.callOpenAIEmbedding(text); break; case "cohere": embedding = await this.callCohereEmbedding(text); break; case "google": embedding = await this.callGoogleEmbedding(text); break; default: embedding = await this.simulateEmbedding(text); } embeddings.push(embedding); } catch (error) { throw new EmbeddingError( `Provider ${this.config.provider} embedding failed for text: ${error}`, error ); } } return embeddings; } /** * Call OpenAI embedding API */ async callOpenAIEmbedding(text) { const response = await fetch("https://api.openai.com/v1/embeddings", { method: "POST", headers: { "Authorization": `Bearer ${this.env.OPENAI_API_KEY}`, "Content-Type": "application/json" }, body: JSON.stringify({ model: this.config.model, input: text, encoding_format: "float" }) }); if (!response.ok) { throw new Error(`OpenAI API error: ${response.status} ${response.statusText}`); } const data = await response.json(); return data.data[0].embedding; } /** * Call Cohere embedding API */ async callCohereEmbedding(text) { const response = await fetch("https://api.cohere.ai/v1/embed", { method: "POST", headers: { "Authorization": `Bearer ${this.env.COHERE_API_KEY}`, "Content-Type": "application/json" }, body: JSON.stringify({ model: this.config.model, texts: [text], input_type: "search_document" }) }); if (!response.ok) { throw new Error(`Cohere API error: ${response.status} ${response.statusText}`); } const data = await response.json(); return data.embeddings[0]; } /** * Call Google embedding API */ async callGoogleEmbedding(text) { const response = await fetch(`https://generativelanguage.googleapis.com/v1beta/models/embedding-001:embedContent?key=${this.env.GOOGLE_API_KEY}`, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ model: "models/embedding-001", content: { parts: [{ text }] } }) }); if (!response.ok) { throw new Error(`Google API error: ${response.status} ${response.statusText}`); } const data = await response.json(); return data.embedding.values; } /** * Simulate embedding generation (fallback for unsupported providers) */ async simulateEmbedding(text) { const dimensions = this.config.dimensions; const embedding = new Array(dimensions).fill(0); let hash = 0; for (let i = 0; i < text.length; i++) { hash = (hash << 5) - hash + text.charCodeAt(i) & 4294967295; } for (let i = 0; i < dimensions; i++) { const seed = hash ^ i * 2654435761; embedding[i] = Math.sin(seed) * 0.5; } const norm = Math.sqrt(embedding.reduce((sum, val) => sum + val * val, 0)); return embedding.map((val) => val / norm); } /** * Prepare text for embedding by cleaning and truncating */ prepareTextForEmbedding(text) { let cleanText = text.replace(/\s+/g, " ").replace(/\n+/g, " ").trim(); const maxTokens = 8192; const estimatedTokens = cleanText.length / 4; if (estimatedTokens > maxTokens) { const maxChars = maxTokens * 4; cleanText = cleanText.substring(0, maxChars); const lastSpaceIndex = cleanText.lastIndexOf(" "); if (lastSpaceIndex > maxChars * 0.9) { cleanText = cleanText.substring(0, lastSpaceIndex); } } return cleanText; } /** * Enhance chunk content with metadata for better embeddings */ enhanceChunkContent(chunk) { const parts = [chunk.content]; if (chunk.metadata.title) { parts.unshift(`Title: ${chunk.metadata.title}`); } if (chunk.metadata.headers && chunk.metadata.headers.length > 0) { const headerText = chunk.metadata.headers.join(" > "); parts.unshift(`Section: ${headerText}`); } const metadata = chunk.metadata; if (metadata.custom) { Object.entries(metadata.custom).forEach(([key, value]) => { if (typeof value === "string" && value.length < 100) { parts.push(`${key}: ${value}`); } }); } return parts.join("\n\n"); } /** * Get fallback providers for embedding generation */ getFallbackProviders(primaryProvider) { const allProviders = ["cloudflare", "openai", "cohere", "google", "mistral"]; return allProviders.filter((provider) => provider !== primaryProvider); } /** * Validate embedding dimensions */ validateEmbedding(embedding) { if (!Array.isArray(embedding)) { return false; } if (embedding.length !== this.config.dimensions) { return false; } return embedding.every( (val) => typeof val === "number" && isFinite(val) && !isNaN(val) ); } /** * Calculate embedding similarity */ calculateSimilarity(embedding1, embedding2) { if (embedding1.length !== embedding2.length) { throw new Error("Embeddings must have the same dimensions"); } let dotProduct = 0; let norm1 = 0; let norm2 = 0; for (let i = 0; i < embedding1.length; i++) { dotProduct += embedding1[i] * embedding2[i]; norm1 += embedding1[i] * embedding1[i]; norm2 += embedding2[i] * embedding2[i]; } if (norm1 === 0 || norm2 === 0) { return 0; } return dotProduct / (Math.sqrt(norm1) * Math.sqrt(norm2)); } /** * Get embedding statistics for monitoring */ getEmbeddingStats() { return { provider: this.config.provider, model: this.config.model, dimensions: this.config.dimensions, batchSize: this.config.batchSize }; } /** * Update embedding configuration */ updateConfig(newConfig) { this.config = { ...this.config, ...newConfig }; if (newConfig.provider && newConfig.provider !== this.config.provider) { this.gateway = createGateway(this.env, { routing: { primary: this.config.provider, fallbacks: this.getFallbackProviders(this.config.provider), loadBalancing: "least_latency", failoverThreshold: 2 }, cache: { enabled: true, ttl: 86400, strategy: "exact", keyPrefix: "emb:" } }); } } /** * Create embedding service with default configuration */ static createDefault(env) { const preferCloudflare = "AI" in env; const defaultConfig = { model: env.RAG_DEFAULT_EMBEDDING_MODEL || (preferCloudflare ? "@cf/baai/bge-base-en-v1.5" : "text-embedding-ada-002"), provider: preferCloudflare ? "cloudflare" : "openai", dimensions: preferCloudflare ? 768 : 1536, batchSize: parseInt(env.RAG_BATCH_SIZE || (preferCloudflare ? "50" : "100")), maxRetries: 3, timeout: 3e4 }; return new _EmbeddingService(env, defaultConfig); } /** * Create embedding service for specific model */ static createForModel(env, model, provider) { const modelConfigs = { // OpenAI