UNPKG

@elizaos/plugin-knowledge

Version:
695 lines (680 loc) 22.3 kB
// Bundled chunk: src/docs-loader.ts + src/utils.ts + vendored uuid (v4 / v5).
//
// NOTE(review): this listing was reconstructed from a whitespace-collapsed copy
// of the published chunk. Whitespace inside long string/template literals
// (log and error messages) was restored as single spaces — confirm against the
// published artifact if exact message bytes matter.

import { Buffer } from "buffer";
import { createHash, randomFillSync, randomUUID } from "crypto";
import * as fs from "fs";
import * as path from "path";
import { logger } from "@elizaos/core";
import * as mammoth from "mammoth";
import { getDocument } from "pdfjs-dist/legacy/build/pdf.mjs";

// ---------------------------------------------------------------------------
// node_modules/uuid/dist/esm/regex.js
// ---------------------------------------------------------------------------

const UUID_REGEX =
  /^(?:[0-9a-f]{8}-[0-9a-f]{4}-[1-8][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}|00000000-0000-0000-0000-000000000000|ffffffff-ffff-ffff-ffff-ffffffffffff)$/i;

// ---------------------------------------------------------------------------
// node_modules/uuid/dist/esm/validate.js
// ---------------------------------------------------------------------------

/**
 * @param {unknown} uuid
 * @returns {boolean} true when `uuid` is a string matching the UUID pattern.
 */
function validate(uuid) {
  return typeof uuid === "string" && UUID_REGEX.test(uuid);
}

// ---------------------------------------------------------------------------
// node_modules/uuid/dist/esm/parse.js
// ---------------------------------------------------------------------------

/**
 * Converts a UUID string into its 16 bytes.
 *
 * @param {string} uuid
 * @returns {Uint8Array} 16-byte array.
 * @throws {TypeError} when `uuid` does not pass `validate`.
 */
function parse(uuid) {
  if (!validate(uuid)) {
    throw TypeError("Invalid UUID");
  }
  let v;
  return Uint8Array.of(
    (v = parseInt(uuid.slice(0, 8), 16)) >>> 24,
    (v >>> 16) & 255,
    (v >>> 8) & 255,
    v & 255,
    (v = parseInt(uuid.slice(9, 13), 16)) >>> 8,
    v & 255,
    (v = parseInt(uuid.slice(14, 18), 16)) >>> 8,
    v & 255,
    (v = parseInt(uuid.slice(19, 23), 16)) >>> 8,
    v & 255,
    // The last group is 48 bits wide, so the two top bytes are extracted by
    // division (bitwise operators only see 32 bits).
    ((v = parseInt(uuid.slice(24, 36), 16)) / 1099511627776) & 255,
    (v / 4294967296) & 255,
    (v >>> 24) & 255,
    (v >>> 16) & 255,
    (v >>> 8) & 255,
    v & 255
  );
}

// ---------------------------------------------------------------------------
// node_modules/uuid/dist/esm/stringify.js
// ---------------------------------------------------------------------------

// Lookup table: byte value -> two-digit lowercase hex string.
const byteToHex = [];
for (let i = 0; i < 256; ++i) {
  byteToHex.push((i + 256).toString(16).slice(1));
}

/**
 * Formats 16 bytes starting at `offset` as a hyphenated UUID string.
 * Performs no validation of the input.
 *
 * @param {ArrayLike<number>} arr
 * @param {number} [offset=0]
 * @returns {string}
 */
function unsafeStringify(arr, offset = 0) {
  return (
    byteToHex[arr[offset + 0]] +
    byteToHex[arr[offset + 1]] +
    byteToHex[arr[offset + 2]] +
    byteToHex[arr[offset + 3]] +
    "-" +
    byteToHex[arr[offset + 4]] +
    byteToHex[arr[offset + 5]] +
    "-" +
    byteToHex[arr[offset + 6]] +
    byteToHex[arr[offset + 7]] +
    "-" +
    byteToHex[arr[offset + 8]] +
    byteToHex[arr[offset + 9]] +
    "-" +
    byteToHex[arr[offset + 10]] +
    byteToHex[arr[offset + 11]] +
    byteToHex[arr[offset + 12]] +
    byteToHex[arr[offset + 13]] +
    byteToHex[arr[offset + 14]] +
    byteToHex[arr[offset + 15]]
  ).toLowerCase();
}

// ---------------------------------------------------------------------------
// node_modules/uuid/dist/esm/rng.js
// ---------------------------------------------------------------------------

const rnds8Pool = new Uint8Array(256);
let poolPtr = rnds8Pool.length;

/**
 * Returns 16 random bytes, refilling the shared pool via `randomFillSync`
 * whenever fewer than 16 unread bytes remain.
 *
 * @returns {Uint8Array}
 */
function rng() {
  if (poolPtr > rnds8Pool.length - 16) {
    randomFillSync(rnds8Pool);
    poolPtr = 0;
  }
  return rnds8Pool.slice(poolPtr, (poolPtr += 16));
}

// ---------------------------------------------------------------------------
// node_modules/uuid/dist/esm/v35.js
// ---------------------------------------------------------------------------

/**
 * Encodes a string as UTF-8 bytes.
 *
 * @param {string} str
 * @returns {Uint8Array}
 */
function stringToBytes(str) {
  str = unescape(encodeURIComponent(str));
  const bytes = new Uint8Array(str.length);
  for (let i = 0; i < str.length; ++i) {
    bytes[i] = str.charCodeAt(i);
  }
  return bytes;
}

const DNS_NAMESPACE = "6ba7b810-9dad-11d1-80b4-00c04fd430c8";
const URL_NAMESPACE = "6ba7b811-9dad-11d1-80b4-00c04fd430c8";

/**
 * Shared implementation for name-based UUIDs: hashes namespace bytes followed
 * by value bytes, then stamps the version and variant bits.
 *
 * @param {number} version - Version nibble already shifted into the high bits (e.g. 0x50).
 * @param {(bytes: Uint8Array) => Uint8Array} hash - Hash function.
 * @param {string | Uint8Array} value - Name to hash.
 * @param {string | ArrayLike<number>} namespace - UUID string or 16 bytes.
 * @param {ArrayLike<number>} [buf] - When given, the 16 bytes are written here and `buf` is returned.
 * @param {number} [offset] - Start index in `buf` (defaults to 0).
 * @returns {string | ArrayLike<number>} UUID string, or `buf` when provided.
 * @throws {TypeError} when the namespace is not 16 bytes long (or is an invalid UUID string).
 */
function v35(version, hash, value, namespace, buf, offset) {
  const valueBytes = typeof value === "string" ? stringToBytes(value) : value;
  // Parse once; the previous code parsed a string namespace a second time just
  // to run the length check.
  const namespaceBytes = typeof namespace === "string" ? parse(namespace) : namespace;
  if (namespaceBytes?.length !== 16) {
    throw TypeError("Namespace must be array-like (16 iterable integer values, 0-255)");
  }
  let bytes = new Uint8Array(16 + valueBytes.length);
  bytes.set(namespaceBytes);
  bytes.set(valueBytes, namespaceBytes.length);
  bytes = hash(bytes);
  bytes[6] = (bytes[6] & 15) | version;
  bytes[8] = (bytes[8] & 63) | 128;
  if (buf) {
    offset = offset || 0;
    for (let i = 0; i < 16; ++i) {
      buf[offset + i] = bytes[i];
    }
    return buf;
  }
  return unsafeStringify(bytes);
}

// ---------------------------------------------------------------------------
// node_modules/uuid/dist/esm/v4.js (with native.js inlined)
// ---------------------------------------------------------------------------

/**
 * Generates a random (version 4) UUID.
 *
 * @param {{ random?: Uint8Array, rng?: () => Uint8Array }} [options]
 * @param {ArrayLike<number>} [buf] - When given, bytes are written here and `buf` is returned.
 * @param {number} [offset] - Start index in `buf` (defaults to 0).
 * @returns {string | ArrayLike<number>}
 * @throws {Error} when fewer than 16 random bytes are supplied.
 * @throws {RangeError} when `offset`..`offset + 15` does not fit in `buf`.
 */
function v4(options, buf, offset) {
  // Fast path: defer to crypto.randomUUID when no customisation is requested.
  if (randomUUID && !buf && !options) {
    return randomUUID();
  }
  options = options || {};
  const rnds = options.random ?? options.rng?.() ?? rng();
  if (rnds.length < 16) {
    throw new Error("Random bytes length must be >= 16");
  }
  rnds[6] = (rnds[6] & 15) | 64;
  rnds[8] = (rnds[8] & 63) | 128;
  if (buf) {
    offset = offset || 0;
    if (offset < 0 || offset + 16 > buf.length) {
      throw new RangeError(`UUID byte range ${offset}:${offset + 15} is out of buffer bounds`);
    }
    for (let i = 0; i < 16; ++i) {
      buf[offset + i] = rnds[i];
    }
    return buf;
  }
  return unsafeStringify(rnds);
}
const v4_default = v4;

// ---------------------------------------------------------------------------
// node_modules/uuid/dist/esm/sha1.js
// ---------------------------------------------------------------------------

/**
 * SHA-1 digest of the given bytes (arrays and strings are converted to Buffers first).
 *
 * @param {Uint8Array | number[] | string} bytes
 * @returns {Buffer}
 */
function sha1(bytes) {
  if (Array.isArray(bytes)) {
    bytes = Buffer.from(bytes);
  } else if (typeof bytes === "string") {
    bytes = Buffer.from(bytes, "utf8");
  }
  return createHash("sha1").update(bytes).digest();
}

// ---------------------------------------------------------------------------
// node_modules/uuid/dist/esm/v5.js
// ---------------------------------------------------------------------------

/**
 * Generates a name-based UUID using SHA-1 (version nibble 0x50).
 * Parameters are forwarded to `v35`.
 */
function v5(value, namespace, buf, offset) {
  return v35(80, sha1, value, namespace, buf, offset);
}
v5.DNS = DNS_NAMESPACE;
v5.URL = URL_NAMESPACE;

// ---------------------------------------------------------------------------
// src/utils.ts
// ---------------------------------------------------------------------------

// Non-"text/*" content types that are still decoded directly as UTF-8.
const PLAIN_TEXT_CONTENT_TYPES = [
  "application/typescript",
  "text/typescript",
  "text/x-python",
  "application/x-python-code",
  "application/yaml",
  "text/yaml",
  "application/x-yaml",
  "application/json",
  "text/markdown",
  "text/csv"
];
const MAX_FALLBACK_SIZE_BYTES = 5 * 1024 * 1024;
const BINARY_CHECK_BYTES = 1024;
const FETCH_TIMEOUT_MS = 3e4;

/**
 * Extracts text from a file buffer according to its content type.
 *
 * - DOCX is converted with mammoth.
 * - `application/msword` or a `.doc` filename yields a fixed placeholder string.
 * - `text/*` and the types in PLAIN_TEXT_CONTENT_TYPES are decoded as UTF-8.
 * - Anything else is decoded as UTF-8 only if it is at most
 *   MAX_FALLBACK_SIZE_BYTES, has no NUL byte in its first BINARY_CHECK_BYTES
 *   bytes, and decodes without U+FFFD.
 *
 * @param {Buffer} fileBuffer
 * @param {string} contentType
 * @param {string} originalFilename
 * @returns {Promise<string>}
 * @throws {Error} when DOCX parsing fails or the fallback rejects the content.
 */
async function extractTextFromFileBuffer(fileBuffer, contentType, originalFilename) {
  const lowerContentType = contentType.toLowerCase();
  logger.debug(
    `[TextUtil] Attempting to extract text from ${originalFilename} (type: ${contentType})`
  );

  if (
    lowerContentType ===
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
  ) {
    logger.debug(`[TextUtil] Extracting text from DOCX ${originalFilename} via mammoth.`);
    try {
      const result = await mammoth.extractRawText({ buffer: fileBuffer });
      logger.debug(
        `[TextUtil] DOCX text extraction complete for ${originalFilename}. Text length: ${result.value.length}`
      );
      return result.value;
    } catch (docxError) {
      const errorMsg = `[TextUtil] Failed to parse DOCX file ${originalFilename}: ${docxError.message}`;
      logger.error(errorMsg, docxError.stack);
      // Keep the underlying error reachable for callers that inspect `cause`.
      throw new Error(errorMsg, { cause: docxError });
    }
  }

  if (
    lowerContentType === "application/msword" ||
    originalFilename.toLowerCase().endsWith(".doc")
  ) {
    logger.debug(`[TextUtil] Handling Microsoft Word .doc file: ${originalFilename}`);
    return `[Microsoft Word Document: ${originalFilename}] This document was indexed for search but cannot be displayed directly in the browser. The original document content is preserved for retrieval purposes.`;
  }

  if (
    lowerContentType.startsWith("text/") ||
    PLAIN_TEXT_CONTENT_TYPES.includes(lowerContentType)
  ) {
    logger.debug(
      `[TextUtil] Extracting text from plain text compatible file ${originalFilename} (type: ${contentType})`
    );
    return fileBuffer.toString("utf-8");
  }

  logger.warn(
    `[TextUtil] Unsupported content type: "${contentType}" for ${originalFilename}. Attempting fallback to plain text.`
  );
  if (fileBuffer.length > MAX_FALLBACK_SIZE_BYTES) {
    const sizeErrorMsg = `[TextUtil] File ${originalFilename} (type: ${contentType}) exceeds maximum size for fallback (${MAX_FALLBACK_SIZE_BYTES} bytes). Cannot process as plain text.`;
    logger.error(sizeErrorMsg);
    throw new Error(sizeErrorMsg);
  }
  const initialBytes = fileBuffer.subarray(0, Math.min(fileBuffer.length, BINARY_CHECK_BYTES));
  if (initialBytes.includes(0)) {
    const binaryHeuristicMsg = `[TextUtil] File ${originalFilename} (type: ${contentType}) appears to be binary based on initial byte check. Cannot process as plain text.`;
    logger.error(binaryHeuristicMsg);
    throw new Error(binaryHeuristicMsg);
  }
  try {
    const textContent = fileBuffer.toString("utf-8");
    if (textContent.includes("\uFFFD")) {
      const binaryErrorMsg = `[TextUtil] File ${originalFilename} (type: ${contentType}) seems to be binary or has encoding issues after fallback to plain text (detected \uFFFD).`;
      logger.error(binaryErrorMsg);
      // Deliberately caught just below so callers always see the final message.
      throw new Error(binaryErrorMsg);
    }
    logger.debug(
      `[TextUtil] Successfully processed unknown type ${contentType} as plain text after fallback for ${originalFilename}.`
    );
    return textContent;
  } catch (fallbackError) {
    const finalErrorMsg = `[TextUtil] Unsupported content type: ${contentType} for ${originalFilename}. Fallback to plain text also failed or indicated binary content.`;
    logger.error(finalErrorMsg, fallbackError.message ? fallbackError.stack : undefined);
    throw new Error(finalErrorMsg, { cause: fallbackError });
  }
}

/**
 * Converts a PDF buffer to a single whitespace-normalised string.
 *
 * Text items on each page are grouped by their rounded `transform[5]` value
 * (used here as the line's vertical position), lines are ordered by that value
 * descending, and items within a line by `transform[4]` ascending. Finally all
 * runs of whitespace — including the line and page separators — are collapsed
 * to a single space.
 *
 * @param {Buffer | Uint8Array | ArrayBuffer} pdfBuffer
 * @param {string} [filename] - Used only in log messages.
 * @returns {Promise<string>}
 * @throws {Error} "Failed to convert PDF to text: …" on any failure.
 */
async function convertPdfToTextFromBuffer(pdfBuffer, filename) {
  const docName = filename || "unnamed-document";
  logger.debug(`[PdfService] Starting conversion for ${docName}`);
  try {
    const uint8Array = new Uint8Array(pdfBuffer);
    const pdf = await getDocument({ data: uint8Array }).promise;
    const numPages = pdf.numPages;
    const textPages = [];
    for (let pageNum = 1; pageNum <= numPages; pageNum++) {
      logger.debug(`[PdfService] Processing page ${pageNum}/${numPages}`);
      const page = await pdf.getPage(pageNum);
      const textContent = await page.getTextContent();

      const lineMap = new Map();
      for (const item of textContent.items.filter(isTextItem)) {
        const yPos = Math.round(item.transform[5]);
        if (!lineMap.has(yPos)) {
          lineMap.set(yPos, []);
        }
        lineMap.get(yPos).push(item);
      }

      const sortedLines = Array.from(lineMap.entries())
        .sort((a, b) => b[0] - a[0])
        .map(([, items]) =>
          items
            .sort((a, b) => a.transform[4] - b.transform[4])
            .map((item) => item.str)
            .join(" ")
        );
      textPages.push(sortedLines.join("\n"));
    }
    const fullText = textPages.join("\n\n").replace(/\s+/g, " ").trim();
    logger.debug(`[PdfService] Conversion complete for ${docName}, length: ${fullText.length}`);
    return fullText;
  } catch (error) {
    logger.error(`[PdfService] Error converting PDF ${docName}:`, error.message);
    throw new Error(`Failed to convert PDF to text: ${error.message}`, { cause: error });
  }
}

// Lookup tables for isBinaryContentType, hoisted so they are built once.
const TEXT_MIME_MARKERS = [
  "text/",
  "application/json",
  "application/xml",
  "application/javascript",
  "application/typescript",
  "application/x-yaml",
  "application/x-sh"
];
const BINARY_MIME_MARKERS = [
  "application/pdf",
  "application/msword",
  "application/vnd.openxmlformats-officedocument",
  "application/vnd.ms-excel",
  "application/vnd.ms-powerpoint",
  "application/zip",
  "application/x-zip-compressed",
  "application/octet-stream",
  "image/",
  "audio/",
  "video/"
];
const TEXT_EXTENSIONS = new Set([
  "txt", "md", "markdown", "json", "xml", "html", "htm", "css", "js", "ts",
  "jsx", "tsx", "yaml", "yml", "toml", "ini", "cfg", "conf", "sh", "bash",
  "zsh", "fish", "py", "rb", "go", "rs", "java", "c", "cpp", "h", "hpp", "cs",
  "php", "sql", "r", "swift", "kt", "scala", "clj", "ex", "exs", "vim", "env",
  "gitignore", "dockerignore", "editorconfig", "log", "csv", "tsv",
  "properties", "gradle", "sbt", "makefile", "dockerfile", "vagrantfile",
  "gemfile", "rakefile", "podfile", "csproj", "vbproj", "fsproj", "sln", "pom"
]);
const BINARY_EXTENSIONS = new Set([
  "pdf", "docx", "doc", "xls", "xlsx", "ppt", "pptx", "zip", "rar", "7z",
  "tar", "gz", "bz2", "xz", "jpg", "jpeg", "png", "gif", "bmp", "svg", "ico",
  "webp", "mp3", "mp4", "avi", "mov", "wmv", "flv", "wav", "flac", "ogg",
  "exe", "dll", "so", "dylib", "bin", "dat", "db", "sqlite"
]);

/**
 * Decides whether content should be treated as binary.
 *
 * Checks run in this order and the first match wins: known text MIME markers
 * (-> false), known binary MIME markers (-> true), known text extensions
 * (-> false), known binary extensions (-> true). Anything else is false.
 * MIME matching is a case-sensitive substring test. For a filename without a
 * dot, the whole lowercased filename is used as the "extension".
 *
 * @param {string} contentType
 * @param {string} filename
 * @returns {boolean}
 */
function isBinaryContentType(contentType, filename) {
  if (TEXT_MIME_MARKERS.some((marker) => contentType.includes(marker))) {
    return false;
  }
  if (BINARY_MIME_MARKERS.some((marker) => contentType.includes(marker))) {
    return true;
  }
  const fileExt = filename.split(".").pop()?.toLowerCase() || "";
  if (TEXT_EXTENSIONS.has(fileExt)) {
    return false;
  }
  return BINARY_EXTENSIONS.has(fileExt);
}

/**
 * Type guard used to keep only PDF content items that carry a `str` property.
 */
function isTextItem(item) {
  return "str" in item;
}

/**
 * Strips query string and fragment from a URL, returning `origin + pathname`.
 * Returns the input unchanged (after logging a warning) if it cannot be parsed.
 *
 * @param {string} url
 * @returns {string}
 */
function normalizeS3Url(url) {
  try {
    const urlObj = new URL(url);
    return `${urlObj.origin}${urlObj.pathname}`;
  } catch {
    logger.warn(`[URL NORMALIZER] Failed to parse URL: ${url}. Returning original.`);
    return url;
  }
}

/**
 * Downloads a URL and returns its body base64-encoded together with the
 * server-reported content type ("application/octet-stream" when absent).
 *
 * The request is aborted after FETCH_TIMEOUT_MS. The timer stays armed until
 * the body has been fully read and is always cleared in `finally`, so it
 * neither leaks when `fetch` rejects nor lets a stalled body download hang.
 *
 * @param {string} url
 * @returns {Promise<{ content: string, contentType: string }>}
 * @throws {Error} "Failed to fetch content from URL: …" on network failure,
 *   timeout, or a non-2xx response.
 */
async function fetchUrlContent(url) {
  logger.debug(`[URL FETCHER] Fetching content from URL: ${url}`);
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
  try {
    const response = await fetch(url, {
      signal: controller.signal,
      headers: {
        "User-Agent": "Eliza-Knowledge-Plugin/1.0"
      }
    });
    if (!response.ok) {
      throw new Error(`Failed to fetch URL: ${response.status} ${response.statusText}`);
    }
    const contentType = response.headers.get("content-type") || "application/octet-stream";
    logger.debug(`[URL FETCHER] Content type from server: ${contentType} for URL: ${url}`);
    const arrayBuffer = await response.arrayBuffer();
    const buffer = Buffer.from(arrayBuffer);
    const base64Content = buffer.toString("base64");
    logger.debug(
      `[URL FETCHER] Successfully fetched content from URL: ${url} (${buffer.length} bytes)`
    );
    return {
      content: base64Content,
      contentType
    };
  } catch (error) {
    logger.error(`[URL FETCHER] Error fetching content from URL ${url}: ${error.message}`);
    throw new Error(`Failed to fetch content from URL: ${error.message}`, { cause: error });
  } finally {
    clearTimeout(timeoutId);
  }
}

const BASE64_REGEX = /^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?$/;

/**
 * Heuristic: does `content` consist solely of well-formed base64 once
 * whitespace is removed?
 *
 * Non-string, empty, and whitespace-only input return false. (Whitespace-only
 * input used to return true because the stripped, empty string satisfies the
 * pattern.) Short plain words whose length is a multiple of four, such as
 * "test", still match — this is only a heuristic.
 *
 * @param {unknown} content
 * @returns {boolean}
 */
function looksLikeBase64(content) {
  if (typeof content !== "string") {
    return false;
  }
  const compact = content.replace(/\s/g, "");
  return compact.length > 0 && BASE64_REGEX.test(compact);
}

/**
 * Derives a deterministic UUID (v5) for a document from the agent id, a prefix
 * of the content, and optionally a filename.
 *
 * The selected content prefix has CRLF / CR normalised to LF and is trimmed;
 * the non-empty components are joined with "::", hashed with SHA-256, and the
 * hex digest is used as the v5 name.
 *
 * @param {string} content - Plain text or base64 content.
 * @param {string} agentId
 * @param {{ maxChars?: number, includeFilename?: string, contentType?: string }} [options]
 *   `maxChars` defaults to 2000.
 * @returns {string} UUID string.
 */
function generateContentBasedId(content, agentId, options) {
  const {
    maxChars = 2e3, // Use first 2000 chars by default
    includeFilename,
    contentType
  } = options || {};

  let contentForHashing;
  if (looksLikeBase64(content)) {
    try {
      const decoded = Buffer.from(content, "base64").toString("utf8");
      // NOTE(review): this hashes the raw base64 when it decodes to clean
      // UTF-8 (or is a PDF) and the decoded text when decoding produced
      // U+FFFD, which looks inverted. Left untouched because changing it
      // would change the ids of already-stored documents — confirm intent.
      if (!decoded.includes("\uFFFD") || contentType?.includes("pdf")) {
        contentForHashing = content.slice(0, maxChars);
      } else {
        contentForHashing = decoded.slice(0, maxChars);
      }
    } catch {
      contentForHashing = content.slice(0, maxChars);
    }
  } else {
    contentForHashing = content.slice(0, maxChars);
  }
  contentForHashing = contentForHashing.replace(/\r\n/g, "\n").replace(/\r/g, "\n").trim();

  const componentsToHash = [
    agentId, // Namespace by agent
    contentForHashing, // The actual content
    includeFilename || "" // Optional filename for additional uniqueness
  ]
    .filter(Boolean)
    .join("::");
  const hash = createHash("sha256").update(componentsToHash).digest("hex");
  const DOCUMENT_NAMESPACE = "6ba7b810-9dad-11d1-80b4-00c04fd430c8";
  const uuid = v5(hash, DOCUMENT_NAMESPACE);
  logger.debug(
    `[generateContentBasedId] Generated UUID ${uuid} for document with content hash ${hash.slice(0, 8)}...`
  );
  return uuid;
}

// ---------------------------------------------------------------------------
// src/docs-loader.ts
// ---------------------------------------------------------------------------

/**
 * Resolves the directory documents are loaded from: the KNOWLEDGE_PATH
 * environment variable (resolved to an absolute path) when set, otherwise
 * `<cwd>/docs`. A missing directory is only logged; the path is returned
 * regardless.
 *
 * @returns {string}
 */
function getKnowledgePath() {
  const envPath = process.env.KNOWLEDGE_PATH;
  if (envPath) {
    const resolvedPath = path.resolve(envPath);
    if (!fs.existsSync(resolvedPath)) {
      logger.warn(`Knowledge path from environment variable does not exist: ${resolvedPath}`);
      logger.warn("Please create the directory or update KNOWLEDGE_PATH environment variable");
    }
    return resolvedPath;
  }

  const defaultPath = path.join(process.cwd(), "docs");
  if (!fs.existsSync(defaultPath)) {
    logger.info(`Default docs folder does not exist at: ${defaultPath}`);
    logger.info("To use the knowledge plugin, either:");
    logger.info('1. Create a "docs" folder in your project root');
    logger.info("2. Set KNOWLEDGE_PATH environment variable to your documents folder");
  }
  return defaultPath;
}

/**
 * Walks the knowledge directory and submits every supported file to
 * `service.addKnowledge`, one at a time.
 *
 * Dot-files and files whose extension has no known content type are skipped
 * silently; they count towards `total` but towards neither `successful` nor
 * `failed`. Binary content is passed base64-encoded, text as UTF-8.
 *
 * @param {{ addKnowledge: (options: object) => Promise<{ fragmentCount: number }> }} service
 * @param {string} agentId - Also used as roomId and entityId.
 * @param {string} [worldId] - Falls back to `agentId` when falsy.
 * @returns {Promise<{ total: number, successful: number, failed: number }>}
 */
async function loadDocsFromPath(service, agentId, worldId) {
  const docsPath = getKnowledgePath();
  if (!fs.existsSync(docsPath)) {
    logger.warn(`Knowledge path does not exist: ${docsPath}`);
    return { total: 0, successful: 0, failed: 0 };
  }

  logger.info(`Loading documents from: ${docsPath}`);
  const files = getAllFiles(docsPath);
  if (files.length === 0) {
    logger.info("No files found in knowledge path");
    return { total: 0, successful: 0, failed: 0 };
  }
  logger.info(`Found ${files.length} files to process`);

  let successful = 0;
  let failed = 0;
  for (const filePath of files) {
    try {
      const fileName = path.basename(filePath);
      const fileExt = path.extname(filePath).toLowerCase();
      if (fileName.startsWith(".")) {
        continue;
      }
      const contentType = getContentType(fileExt);
      if (!contentType) {
        logger.debug(`Skipping unsupported file type: ${filePath}`);
        continue;
      }

      const fileBuffer = fs.readFileSync(filePath);
      const isBinary = isBinaryContentType(contentType, fileName);
      const content = isBinary ? fileBuffer.toString("base64") : fileBuffer.toString("utf-8");
      const knowledgeOptions = {
        clientDocumentId: "", // Will be generated by the service based on content
        contentType,
        originalFilename: fileName,
        worldId: worldId || agentId,
        content,
        roomId: agentId,
        entityId: agentId
      };

      logger.debug(`Processing document: ${fileName}`);
      const result = await service.addKnowledge(knowledgeOptions);
      logger.info(`\u2705 "${fileName}": ${result.fragmentCount} fragments created`);
      successful++;
    } catch (error) {
      // One bad file must not abort the whole load.
      logger.error(`Failed to process file ${filePath}:`, error);
      failed++;
    }
  }

  logger.info(
    `Document loading complete: ${successful} successful, ${failed} failed out of ${files.length} total`
  );
  return {
    total: files.length,
    successful,
    failed
  };
}

// Directory names that are never descended into.
const IGNORED_DIRECTORIES = ["node_modules", ".git", ".vscode", "dist", "build"];

/**
 * Recursively collects the paths of all regular files under `dirPath`.
 * A directory that cannot be read is logged and skipped.
 *
 * @param {string} dirPath
 * @param {string[]} [files] - Accumulator; appended to and returned.
 * @returns {string[]}
 */
function getAllFiles(dirPath, files = []) {
  try {
    const entries = fs.readdirSync(dirPath, { withFileTypes: true });
    for (const entry of entries) {
      const fullPath = path.join(dirPath, entry.name);
      if (entry.isDirectory()) {
        if (!IGNORED_DIRECTORIES.includes(entry.name)) {
          getAllFiles(fullPath, files);
        }
      } else if (entry.isFile()) {
        files.push(fullPath);
      }
    }
  } catch (error) {
    logger.error(`Error reading directory ${dirPath}:`, error);
  }
  return files;
}

// Extension (with leading dot) -> content type. Kept in a Map so lookups can
// never resolve to Object.prototype members.
const CONTENT_TYPES_BY_EXTENSION = new Map(
  Object.entries({
    // Text documents
    ".txt": "text/plain",
    ".md": "text/markdown",
    ".markdown": "text/markdown",
    ".tson": "text/plain",
    ".xml": "application/xml",
    ".csv": "text/csv",
    ".tsv": "text/tab-separated-values",
    ".log": "text/plain",
    // Web files
    ".html": "text/html",
    ".htm": "text/html",
    ".css": "text/css",
    ".scss": "text/x-scss",
    ".sass": "text/x-sass",
    ".less": "text/x-less",
    // JavaScript/TypeScript
    ".js": "text/javascript",
    ".jsx": "text/javascript",
    ".ts": "text/typescript",
    ".tsx": "text/typescript",
    ".mjs": "text/javascript",
    ".cjs": "text/javascript",
    ".vue": "text/x-vue",
    ".svelte": "text/x-svelte",
    ".astro": "text/x-astro",
    // Python
    ".py": "text/x-python",
    ".pyw": "text/x-python",
    ".pyi": "text/x-python",
    // Java/Kotlin/Scala
    ".java": "text/x-java",
    ".kt": "text/x-kotlin",
    ".kts": "text/x-kotlin",
    ".scala": "text/x-scala",
    // C/C++/C#
    ".c": "text/x-c",
    ".cpp": "text/x-c++",
    ".cc": "text/x-c++",
    ".cxx": "text/x-c++",
    ".h": "text/x-c",
    ".hpp": "text/x-c++",
    ".cs": "text/x-csharp",
    // Other languages
    ".php": "text/x-php",
    ".rb": "text/x-ruby",
    ".go": "text/x-go",
    ".rs": "text/x-rust",
    ".swift": "text/x-swift",
    ".r": "text/x-r",
    ".R": "text/x-r",
    ".m": "text/x-objectivec",
    ".mm": "text/x-objectivec",
    ".clj": "text/x-clojure",
    ".cljs": "text/x-clojure",
    ".ex": "text/x-elixir",
    ".exs": "text/x-elixir",
    ".lua": "text/x-lua",
    ".pl": "text/x-perl",
    ".pm": "text/x-perl",
    ".dart": "text/x-dart",
    ".hs": "text/x-haskell",
    ".elm": "text/x-elm",
    ".ml": "text/x-ocaml",
    ".fs": "text/x-fsharp",
    ".fsx": "text/x-fsharp",
    ".vb": "text/x-vb",
    ".pas": "text/x-pascal",
    ".d": "text/x-d",
    ".nim": "text/x-nim",
    ".zig": "text/x-zig",
    ".jl": "text/x-julia",
    ".tcl": "text/x-tcl",
    ".awk": "text/x-awk",
    ".sed": "text/x-sed",
    // Shell scripts
    ".sh": "text/x-sh",
    ".bash": "text/x-sh",
    ".zsh": "text/x-sh",
    ".fish": "text/x-fish",
    ".ps1": "text/x-powershell",
    ".bat": "text/x-batch",
    ".cmd": "text/x-batch",
    // Config files
    ".json": "application/json",
    ".yaml": "text/x-yaml",
    ".yml": "text/x-yaml",
    ".toml": "text/x-toml",
    ".ini": "text/x-ini",
    ".cfg": "text/x-ini",
    ".conf": "text/x-ini",
    ".env": "text/plain",
    ".gitignore": "text/plain",
    ".dockerignore": "text/plain",
    ".editorconfig": "text/plain",
    ".properties": "text/x-properties",
    // Database
    ".sql": "text/x-sql",
    // Binary documents
    ".pdf": "application/pdf",
    ".doc": "application/msword",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
  })
);

/**
 * Maps a file extension (including the leading dot) to a content type.
 * The lookup is case-sensitive.
 *
 * @param {string} extension
 * @returns {string | null} null when the extension is not supported.
 */
function getContentType(extension) {
  return CONTENT_TYPES_BY_EXTENSION.get(extension) ?? null;
}

export {
  v4_default,
  extractTextFromFileBuffer,
  convertPdfToTextFromBuffer,
  isBinaryContentType,
  normalizeS3Url,
  fetchUrlContent,
  looksLikeBase64,
  generateContentBasedId,
  getKnowledgePath,
  loadDocsFromPath
};
//# sourceMappingURL=chunk-RFXW7QQK.js.map