UNPKG

@vjlanguage/mcp-vj-docs

Version:

MCP server for documentation crawling, indexing, and retrieval

1,009 lines 176 kB
#!/usr/bin/env node
"use strict";
// ---------------------------------------------------------------------------
// Down-level runtime helpers.
// NOTE(review): these have the shape of the helpers the TypeScript compiler
// emits for an ES5 target (object spread, async/await, rest, array spread,
// default-import interop). They are presumably generated - fix problems in the
// original sources / compiler settings rather than by hand-editing here.
// ---------------------------------------------------------------------------

// __assign(target, ...sources): uses Object.assign when available, otherwise
// copies the own enumerable string-keyed properties of each source onto the
// first argument and returns it.
var __assign = (this && this.__assign) || function () {
    __assign = Object.assign || function(t) {
        for (var s, i = 1, n = arguments.length; i < n; i++) {
            s = arguments[i];
            for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p))
                t[p] = s[p];
        }
        return t;
    };
    return __assign.apply(this, arguments);
};
// __awaiter(thisArg, _arguments, P, generator): drives a generator-like object
// to completion and returns a promise (P, defaulting to the global Promise).
// Each yielded value is adopted as a promise; its fulfilment is fed back in
// through next() and its rejection through throw().
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
// __generator(thisArg, body): emulates a generator with a label-based state
// machine. `body` is called repeatedly with the state object `_` and returns an
// instruction array `op` whose first element selects the action handled in
// step() below:
//   0 / 1  next / throw (as passed in by verb())
//   2      finish; the result is { value: op[1], done: true }
//   3      jump to label op[1]
//   4      yield op[1] to the caller ({ done: false })
//   5      delegate to the iterator in op[1]
//   6      an exception was caught; route it to the innermost trys entry
//   7      pop the pending op and the innermost trys entry
// Entries pushed onto `_.trys` are read below as
// [start label, catch label, finally label, end label].
var __generator = (this && this.__generator) || function (thisArg, body) {
    var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g = Object.create((typeof Iterator === "function" ? Iterator : Object).prototype);
    return g.next = verb(0), g["throw"] = verb(1), g["return"] = verb(2), typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
    function verb(n) { return function (v) { return step([n, v]); }; }
    function step(op) {
        // `f` is the re-entrancy flag: set while body is running.
        if (f) throw new TypeError("Generator is already executing.");
        while (g && (g = 0, op[0] && (_ = 0)), _) try {
            // While delegating (`y` set), forward the operation to the inner iterator first.
            if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
            if (y = 0, t) op = [op[0] & 2, t.value];
            switch (op[0]) {
                case 0: case 1: t = op; break;
                case 4: _.label++; return { value: op[1], done: false };
                case 5: _.label++; y = op[1]; op = [0]; continue;
                case 7: op = _.ops.pop(); _.trys.pop(); continue;
                default:
                    if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
                    if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
                    if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
                    if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
                    if (t[2]) _.ops.pop();
                    _.trys.pop(); continue;
            }
            op = body.call(thisArg, _);
        } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
        if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
    }
};
// __rest(s, e): returns a shallow copy of `s` without the keys listed in `e`
// (own enumerable string keys, plus own enumerable symbol keys when
// Object.getOwnPropertySymbols exists).
var __rest = (this && this.__rest) || function (s, e) {
    var t = {};
    for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p) && e.indexOf(p) < 0)
        t[p] = s[p];
    if (s != null && typeof Object.getOwnPropertySymbols === "function")
        for (var i = 0, p = Object.getOwnPropertySymbols(s); i < p.length; i++) {
            if (e.indexOf(p[i]) < 0 && Object.prototype.propertyIsEnumerable.call(s, p[i]))
                t[p[i]] = s[p[i]];
        }
    return t;
};
// __spreadArray(to, from, pack): returns `to` concatenated with `from`. When
// `pack` is truthy (or only two arguments are given) holes in `from` are
// materialised as explicit entries before concatenating.
var __spreadArray = (this && this.__spreadArray) || function (to, from, pack) {
    if (pack || arguments.length === 2) for (var i = 0, l = from.length, ar; i < l; i++) {
        if (ar || !(i in from)) {
            if (!ar) ar = Array.prototype.slice.call(from, 0, i);
            ar[i] = from[i];
        }
    }
    return to.concat(ar || Array.prototype.slice.call(from));
};
// __importDefault(mod): returns `mod` unchanged when it is flagged __esModule,
// otherwise wraps it as { default: mod }.
// (the conditional expression is completed on the next physical line)
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ?
mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); var express_1 = __importDefault(require("express")); var index_js_1 = require("@modelcontextprotocol/sdk/server/index.js"); var stdio_js_1 = require("@modelcontextprotocol/sdk/server/stdio.js"); var sse_js_1 = require("@modelcontextprotocol/sdk/server/sse.js"); var streamableHttp_js_1 = require("@modelcontextprotocol/sdk/server/streamableHttp.js"); var types_js_1 = require("@modelcontextprotocol/sdk/types.js"); var path_1 = __importDefault(require("path")); var fs_1 = __importDefault(require("fs")); var lowdb_1 = __importDefault(require("lowdb")); var FileSync_js_1 = __importDefault(require("lowdb/adapters/FileSync.js")); var natural_1 = __importDefault(require("natural")); var TfIdf = natural_1.default.TfIdf, PorterStemmer = natural_1.default.PorterStemmer, WordTokenizer = natural_1.default.WordTokenizer; var firecrawl_js_1 = __importDefault(require("@mendable/firecrawl-js")); var os_1 = __importDefault(require("os")); var winston_1 = __importDefault(require("winston")); var pdf_parse_1 = __importDefault(require("pdf-parse")); var buffer_1 = require("buffer"); var date_fns_1 = require("date-fns"); var logDir = expandTildePath(process.env.VJDOC_LOG_DIR || './logs'); if (!fs_1.default.existsSync(logDir)) { try { fs_1.default.mkdirSync(logDir, { recursive: true }); } catch (error) { console.error("Error creating log directory: ".concat(error)); } } var logLevel = process.env.VJDOC_LOG_LEVEL || 'info'; var logToFile = process.env.VJDOC_LOG_TO_FILE !== 'false'; var enableStdioTransport = process.env.ENABLE_STDIO_TRANSPORT !== 'false'; var enableStreamableHttp = process.env.ENABLE_STREAMABLE_HTTP === 'true'; var enableLegacySse = process.env.ENABLE_LEGACY_SSE === 'true'; var streamableHttpPort = parseInt(process.env.STREAMABLE_HTTP_PORT || '3000', 10); var legacySsePort = parseInt(process.env.LEGACY_SSE_PORT || '3001', 10); console.log(' process.env:', process.env); var 
timestampFormat = winston_1.default.format(function (info) { info.timestamp = (0, date_fns_1.format)(new Date(), 'yyyy-MM-dd HH:mm:ss'); return info; })(); var logger = winston_1.default.createLogger({ level: logLevel, format: winston_1.default.format.combine(timestampFormat, winston_1.default.format.json()), defaultMeta: { service: 'mcp-vj-docs' }, transports: [ new winston_1.default.transports.Console({ format: winston_1.default.format.combine(winston_1.default.format.colorize(), winston_1.default.format.printf(function (info) { var timestamp = info.timestamp, level = info.level, message = info.message, meta = __rest(info, ["timestamp", "level", "message"]); return "".concat(timestamp, " ").concat(level, ": ").concat(message, " ").concat(Object.keys(meta).length ? JSON.stringify(meta, null, 2) : ''); })) }) ] }); if (logToFile) { logger.add(new winston_1.default.transports.File({ filename: path_1.default.join(logDir, 'error.log'), level: 'error' })); logger.add(new winston_1.default.transports.File({ filename: path_1.default.join(logDir, 'combined.log') })); } logger.info('Logger initialized', { logLevel: logLevel, logToFile: logToFile, logDir: logDir }); function expandTildePath(filePath) { if (!filePath || typeof filePath !== 'string') { return filePath; } if (filePath.startsWith('~/') || filePath === '~') { return path_1.default.join(os_1.default.homedir(), filePath.substring(1)); } else if (filePath.startsWith('~')) { var userEnd = filePath.indexOf('/'); if (userEnd === -1) { return filePath; } var username = filePath.substring(1, userEnd); return filePath; } return filePath; } var CRAWL_TOOL = { name: "vjdoc_crawl", description: "Crawl a website and index its content for search", inputSchema: { type: "object", properties: { url: { type: "string", description: "URL to crawl", }, maxDepth: { type: "number", description: "Maximum depth to crawl", }, maxPages: { type: "number", description: "Maximum number of pages to crawl", }, includePatterns: { type: "array", 
// (continuation of CRAWL_TOOL.inputSchema.properties.includePatterns, opened on the previous physical line)
                items: {
                    type: "string",
                },
                description: "Patterns to include in crawl",
            },
            excludePatterns: {
                type: "array",
                items: {
                    type: "string",
                },
                description: "Patterns to exclude from crawl",
            },
            defaultCategory: {
                type: "string",
                description: "Default category for documents if not detected automatically",
            },
        },
        // Only the start URL is mandatory.
        required: ["url"],
    },
};
// Tool descriptor for searching indexed documents. `filters` and `userId` are
// optional; only `query` is required.
var SEARCH_TOOL = {
    name: "vjdoc_search",
    description: "Search indexed documents",
    inputSchema: {
        type: "object",
        properties: {
            query: { type: "string", description: "Search query" },
            limit: { type: "number", description: "Maximum number of results to return" },
            filters: {
                type: "object",
                description: "Optional filters to narrow down search results",
                properties: {
                    categories: {
                        type: "array",
                        description: "Filter by document categories (e.g., 'API Documentation', 'Tutorial')",
                        items: { type: "string" }
                    },
                    dateFrom: { type: "number", description: "Filter documents created after this timestamp (in milliseconds)" },
                    dateTo: { type: "number", description: "Filter documents created before this timestamp (in milliseconds)" },
                    metadata: {
                        type: "object",
                        description: "Filter by metadata fields (key-value pairs)",
                        additionalProperties: true
                    }
                }
            },
            userId: { type: "string", description: "Optional user ID for personalized results" }
        },
        required: ["query"]
    },
};
// Tool descriptor for adding a corpus file. The schema itself requires nothing;
// the "content or filePath" rule is enforced by isAddCorpusFileArgs.
var ADD_CORPUS_FILE_TOOL = {
    name: "vjdoc_add_corpus_file",
    description: "Add a corpus file to the TF-IDF files directory",
    inputSchema: {
        type: "object",
        properties: {
            content: { type: "string", description: "Content to add to the corpus file (alternative to filePath)" },
            filePath: { type: "string", description: "Absolute path to a file to add to the corpus (alternative to content)" },
            filename: { type: "string", description: "Optional filename for the corpus file (without extension)" },
            category: { type: "string", description: "Optional category for the corpus file (e.g., 'Code Snippet', 'API Documentation', 'Error Solution', 'Technical Note')" },
            contentType: { type: "string", description: "Optional content type for the corpus file (e.g., 'text', 'markdown', 'pdf-base64')" }
        },
        required: []
    }
};
// Tool descriptor for listing document / corpus metadata; `query` is required.
var GET_DOCS_META_TOOL = {
    name: "vjdoc_get_docs_meta",
    description: "Get metadata information about all documents and corpus files",
    inputSchema: {
        type: "object",
        properties: {
            query: { type: "string", description: "Natural language query or requirement" }
        },
        required: ["query"]
    }
};
// Tool descriptor for fetching one document. The schema declares no required
// field; the "url or title" rule is enforced by isGetDocumentArgs.
var GET_DOCUMENT_TOOL = {
    name: "vjdoc_get_document",
    description: "Get the full content of a specific document by URL or title",
    inputSchema: {
        type: "object",
        properties: {
            url: { type: "string", description: "URL of the document to retrieve (optional if title is provided)" },
            title: { type: "string", description: "Title of the document to retrieve (optional if url is provided)" }
        }
    }
};
/**
 * Validates arguments for the crawl tool.
 * Logs a debug record describing the argument shape, then returns true only
 * when `args` is a non-null object whose `url` property is a string.
 */
function isCrawlArgs(args) {
    logger.debug('Validating crawl args', {
        argsType: typeof args,
        args: args,
        isObject: typeof args === "object",
        isNull: args === null,
        hasUrlProp: args !== null && typeof args === "object" && "url" in args,
        urlType: args !== null && typeof args === "object" && "url" in args ? typeof args.url : "N/A"
    });
    if (args === undefined || args === null) {
        return false;
    }
    return (typeof args === "object" &&
        args !== null &&
        "url" in args &&
        typeof args.url === "string");
}
/**
 * Validates arguments for the search tool; the debug record below describes
 * the argument shape.
 * (the function body continues on the next physical line)
 */
function isSearchArgs(args) {
    logger.debug('Validating search args', {
        argsType: typeof args,
        args: args,
        isObject: typeof args === "object",
        isNull: args === null,
        hasQueryProp: args !== null && typeof args === "object" && "query" in args,
        queryType: args !== null && typeof args === "object" && "query" in args ?
typeof args.query : "N/A" }); return (args !== null && typeof args === "object" && "query" in args && typeof args.query === "string" && (!("limit" in args) || typeof args.limit === "number") && (!("filters" in args) || typeof args.filters === "object") && (!("userId" in args) || typeof args.userId === "string")); } function isAddCorpusFileArgs(args) { return (args !== null && typeof args === 'object' && ((typeof args.content === 'string' && args.content.trim() !== '') || (typeof args.filePath === 'string' && args.filePath.trim() !== '')) && (args.filename === undefined || typeof args.filename === 'string') && (args.category === undefined || typeof args.category === 'string') && (args.contentType === undefined || typeof args.contentType === 'string')); } function isGetDocsMetaArgs(args) { return (args !== null && typeof args === "object" && "query" in args && typeof args.query === "string"); } function isGetDocumentArgs(args) { logger.debug('Validating get document args', { argsType: typeof args, args: args, isObject: typeof args === "object", isNull: args === null, hasUrlProp: args !== null && typeof args === "object" && "url" in args, urlType: args !== null && typeof args === "object" && "url" in args ? typeof args.url : "N/A", hasTitleProp: args !== null && typeof args === "object" && "title" in args, titleType: args !== null && typeof args === "object" && "title" in args ? 
typeof args.title : "N/A" }); if (args === undefined || args === null) { return false; } return (typeof args === "object" && args !== null && (("url" in args && typeof args.url === "string") || ("title" in args && typeof args.title === "string"))); } function extractKeywords(content, limit) { if (limit === void 0) { limit = 10; } if (!content) return []; var tokenizer = new WordTokenizer(); var tokens = tokenizer.tokenize(content.toLowerCase()) || []; var stopWords = ['a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'of', 'is', 'are', 'was', 'were']; var filteredTokens = tokens.filter(function (token) { return token.length > 2 && !stopWords.includes(token) && !/^\d+$/.test(token); }); var wordFreq = {}; filteredTokens.forEach(function (token) { wordFreq[token] = (wordFreq[token] || 0) + 1; }); return Object.entries(wordFreq) .sort(function (a, b) { return b[1] - a[1]; }) .slice(0, limit) .map(function (entry) { return entry[0]; }); } function generateSummary(content, maxLength) { if (maxLength === void 0) { maxLength = 150; } if (!content) return ''; var summary = content.trim().substring(0, maxLength); var lastSpaceIndex = summary.lastIndexOf(' '); if (lastSpaceIndex > 0 && summary.length === maxLength) { return summary.substring(0, lastSpaceIndex) + '...'; } return summary.length < content.length ? summary + '...' : summary; } var DocumentStorage = (function () { function DocumentStorage(options) { this.db = null; this.tfidf = null; this.tokenizer = new WordTokenizer(); this.paragraphMap = new Map(); this.options = __assign(__assign({}, options), { dbPath: expandTildePath(options.dbPath), tfidfFilesDir: options.tfidfFilesDir ? 
// (tail of the DocumentStorage constructor, opened on the previous physical line)
                expandTildePath(options.tfidfFilesDir) : undefined });
        logger.info("DocumentStorage initialized with dbPath: ".concat(this.options.dbPath, ", tfidfFilesDir: ").concat(this.options.tfidfFilesDir || 'not set'));
        // Make sure the directory that will hold the database file exists;
        // failure to create it aborts construction.
        var dbDir = path_1.default.dirname(this.options.dbPath);
        logger.info("Ensuring database directory exists: ".concat(dbDir));
        if (!fs_1.default.existsSync(dbDir)) {
            try {
                fs_1.default.mkdirSync(dbDir, { recursive: true });
                logger.info("Created database directory: ".concat(dbDir));
            }
            catch (error) {
                logger.error("Error creating database directory", { error: error, dbDir: dbDir });
                throw new Error("Failed to create database directory: ".concat(error));
            }
        }
        // Same for the optional TF-IDF corpus directory.
        if (this.options.tfidfFilesDir) {
            logger.info("Ensuring TF-IDF files directory exists: ".concat(this.options.tfidfFilesDir));
            if (!fs_1.default.existsSync(this.options.tfidfFilesDir)) {
                try {
                    fs_1.default.mkdirSync(this.options.tfidfFilesDir, { recursive: true });
                    logger.info("Created TF-IDF files directory: ".concat(this.options.tfidfFilesDir));
                }
                catch (error) {
                    logger.error("Error creating TF-IDF files directory", { error: error, dir: this.options.tfidfFilesDir });
                    throw new Error("Failed to create TF-IDF files directory: ".concat(error));
                }
            }
        }
    }
    /**
     * Opens the lowdb database at options.dbPath, seeds its default
     * collections, creates a fresh TF-IDF index, loads corpus files when a
     * corpus directory is configured and exists, and indexes every paragraph
     * of every stored document under the key "doc_<docIndex>_p<paragraphIndex>".
     * Any failure is logged and rethrown as "Failed to initialize database: ...".
     */
    DocumentStorage.prototype.initialize = function () {
        return __awaiter(this, void 0, void 0, function () {
            var adapter, documents, error_1;
            var _this = this;
            return __generator(this, function (_a) {
                switch (_a.label) {
                    case 0:
                        logger.info("Initializing database from: ".concat(this.options.dbPath));
                        _a.label = 1;
                    case 1:
                        // try region: labels 1-3, catch handler at label 4, continuation at label 5
                        _a.trys.push([1, 4, , 5]);
                        adapter = new FileSync_js_1.default(this.options.dbPath);
                        this.db = (0, lowdb_1.default)(adapter);
                        this.db.defaults({ documents: [], links: [], searchHistory: [], interactions: [], userPreferences: [] }).write();
                        this.tfidf = new TfIdf();
                        // Skip corpus loading when no corpus directory is configured / present.
                        if (!(this.options.tfidfFilesDir && fs_1.default.existsSync(this.options.tfidfFilesDir))) return [3, 3];
                        return [4, this.loadTfidfFiles()];
                    case 2:
                        _a.sent();
                        _a.label = 3;
                    case 3:
                        documents = this.db.get('documents').value();
                        if (documents.length > 0) {
                            logger.info("Indexing ".concat(documents.length, " existing documents"));
                            documents.forEach(function (doc, index) {
                                var paragraphs = _this.processDocumentParagraphs(doc, index);
                                paragraphs.forEach(function (paragraph, paragraphId) {
                                    var docKey = "doc_".concat(index, "_p").concat(paragraphId);
                                    if (_this.tfidf) {
                                        _this.tfidf.addDocument(_this.processTextForSearch(paragraph.content), docKey);
                                    }
                                    _this.paragraphMap.set(docKey, paragraph);
                                });
                            });
                            logger.info("Indexed ".concat(this.paragraphMap.size, " paragraphs for search"));
                        }
                        else {
                            logger.info('No existing documents to index');
                        }
                        logger.info('Database and search index initialized successfully');
                        return [3, 5];
                    case 4:
                        error_1 = _a.sent();
                        logger.error('Error initializing database', { error: error_1 });
                        throw new Error("Failed to initialize database: ".concat(error_1));
                    case 5: return [2];
                }
            });
        });
    };
    /**
     * Synchronously walks `dir` and all sub-directories and returns the full
     * paths of regular files whose name ends (case-insensitively) with
     * '.txt', '.md' or '.pdf'.
     */
    DocumentStorage.prototype.scanFilesRecursive = function (dir) {
        var supportedExts = ['.txt', '.md', '.pdf'];
        var results = [];
        var walk = function (currentDir) {
            var entries = fs_1.default.readdirSync(currentDir, { withFileTypes: true });
            var _loop_1 = function (entry) {
                var fullPath = path_1.default.join(currentDir, entry.name);
                if (entry.isDirectory()) {
                    walk(fullPath);
                }
                else if (entry.isFile() && supportedExts.some(function (ext) { return entry.name.toLowerCase().endsWith(ext); })) {
                    results.push(fullPath);
                }
            };
            for (var _i = 0, entries_1 = entries; _i < entries_1.length; _i++) {
                var entry = entries_1[_i];
                _loop_1(entry);
            }
        };
        walk(dir);
        return results;
    };
    /**
     * Adds every supported file found under options.tfidfFilesDir to the
     * TF-IDF index, keyed by file path. Does nothing when no corpus directory
     * or no index is set.
     * (the function body continues on the next physical line)
     */
    DocumentStorage.prototype.loadTfidfFiles = function () {
        return __awaiter(this, void 0, void 0, function () {
            var files, _i, files_1, file, pdfBuffer, pdfData, markdownText, pdfError_1, fileContent, error_2, error_3;
            return __generator(this, function (_a) {
                switch (_a.label) {
                    case 0:
                        if (!this.options.tfidfFilesDir || !this.tfidf) {
                            return [2];
                        }
                        _a.label = 1;
                    case 1:
                        _a.trys.push([1, 13,
, 14]); files = this.scanFilesRecursive(this.options.tfidfFilesDir); if (files.length === 0) { logger.info("No text files found in TF-IDF directory: ".concat(this.options.tfidfFilesDir)); return [2]; } logger.info("Loading ".concat(files.length, " text files for TF-IDF calculation")); _i = 0, files_1 = files; _a.label = 2; case 2: if (!(_i < files_1.length)) return [3, 12]; file = files_1[_i]; _a.label = 3; case 3: _a.trys.push([3, 10, , 11]); if (!file.endsWith('.pdf')) return [3, 8]; pdfBuffer = fs_1.default.readFileSync(file); _a.label = 4; case 4: _a.trys.push([4, 6, , 7]); return [4, (0, pdf_parse_1.default)(pdfBuffer)]; case 5: pdfData = _a.sent(); try { markdownText = this.convertPdfTextToMarkdown(pdfData.text); this.tfidf.addDocument(markdownText, file); logger.info("Added PDF (converted to Markdown) to TF-IDF index: ".concat(file)); } catch (mdError) { this.tfidf.addDocument(pdfData.text, file); logger.info("Added PDF (plain text) to TF-IDF index: ".concat(file)); } return [3, 7]; case 6: pdfError_1 = _a.sent(); logger.error("Error parsing PDF file: ".concat(file), { error: pdfError_1 }); return [3, 11]; case 7: return [3, 9]; case 8: fileContent = fs_1.default.readFileSync(file, 'utf8'); this.tfidf.addDocument(fileContent, file); logger.info("Added TF-IDF file: ".concat(file)); _a.label = 9; case 9: return [3, 11]; case 10: error_2 = _a.sent(); logger.error("Error adding TF-IDF file: ".concat(file), { error: error_2 }); return [3, 11]; case 11: _i++; return [3, 2]; case 12: return [3, 14]; case 13: error_3 = _a.sent(); logger.error("Error loading TF-IDF files", { error: error_3, dir: this.options.tfidfFilesDir }); return [3, 14]; case 14: return [2]; } }); }); }; DocumentStorage.prototype.processTextForSearch = function (text) { if (!text) return []; var containsChinese = /[\u4e00-\u9fa5]/.test(text); if (containsChinese) { var chineseTokens_1 = []; var chineseMatches = text.match(/[\u4e00-\u9fa5]+/g) || []; chineseMatches.forEach(function (match) { for 
(var i = 0; i < match.length; i++) { chineseTokens_1.push(match[i]); } for (var i = 0; i < match.length - 1; i++) { chineseTokens_1.push(match.substring(i, i + 2)); } for (var i = 0; i < match.length - 2; i++) { chineseTokens_1.push(match.substring(i, i + 3)); } if (match.length <= 10) { chineseTokens_1.push(match); } }); var nonChineseText = text.replace(/[\u4e00-\u9fa5]+/g, ' '); var nonChineseTokens = this.tokenizer.tokenize(nonChineseText.toLowerCase()) || []; var allTokens = __spreadArray(__spreadArray([], chineseTokens_1, true), nonChineseTokens, true); return allTokens.filter(function (token) { return /[\u4e00-\u9fa5]/.test(token) || token.length > 2; }); } else { var tokens = this.tokenizer.tokenize(text.toLowerCase()) || []; var processedTokens = tokens .filter(function (token) { return token.length > 2; }) .map(function (token) { return PorterStemmer.stem(token); }); return processedTokens; } }; DocumentStorage.prototype.processDocumentParagraphs = function (doc, docIndex) { if (!doc.content) return []; var paragraphs = doc.content.split('\n\n').filter(function (p) { return p.trim().length > 0; }); return paragraphs.map(function (content, paragraphId) { return ({ docId: docIndex, paragraphId: paragraphId, content: content, url: doc.url, title: doc.title }); }); }; DocumentStorage.prototype.storeDocuments = function (documents_1) { return __awaiter(this, arguments, void 0, function (documents, links) { var existingDocs, existingLinks, updatedDocs, updatedLinks; var _this = this; if (links === void 0) { links = []; } return __generator(this, function (_a) { switch (_a.label) { case 0: if (!!this.db) return [3, 2]; logger.info('Database not initialized, initializing now'); return [4, this.initialize()]; case 1: _a.sent(); _a.label = 2; case 2: if (!this.db) { logger.error('Failed to initialize database'); throw new Error('Failed to initialize database'); } existingDocs = this.db.get('documents').value(); existingLinks = this.db.get('links').value(); 
updatedDocs = __spreadArray(__spreadArray([], existingDocs, true), documents, true); updatedLinks = __spreadArray(__spreadArray([], existingLinks, true), links, true); this.db.set('documents', updatedDocs).write(); this.db.set('links', updatedLinks).write(); this.tfidf = new TfIdf(); this.paragraphMap.clear(); updatedDocs.forEach(function (doc, docIndex) { var paragraphs = _this.processDocumentParagraphs(doc, docIndex); paragraphs.forEach(function (paragraph, paragraphId) { var docKey = "doc_".concat(docIndex, "_p").concat(paragraphId); if (_this.tfidf) { _this.tfidf.addDocument(_this.processTextForSearch(paragraph.content), docKey); } _this.paragraphMap.set(docKey, paragraph); }); }); logger.info("Stored ".concat(documents.length, " documents and ").concat(links.length, " links. Total: ").concat(updatedDocs.length, " documents, ").concat(updatedLinks.length, " links")); logger.info("Indexed ".concat(this.paragraphMap.size, " paragraphs for search")); return [2]; } }); }); }; DocumentStorage.prototype.addDocuments = function (documents) { return __awaiter(this, void 0, void 0, function () { var _i, documents_1, doc; return __generator(this, function (_a) { switch (_a.label) { case 0: if (!!this.db) return [3, 2]; logger.info('Database not initialized, initializing now'); return [4, this.initialize()]; case 1: _a.sent(); _a.label = 2; case 2: if (!this.db) { logger.error('Failed to initialize database'); throw new Error('Failed to initialize database'); } _i = 0, documents_1 = documents; _a.label = 3; case 3: if (!(_i < documents_1.length)) return [3, 6]; doc = documents_1[_i]; return [4, this.addDocument(doc)]; case 4: _a.sent(); _a.label = 5; case 5: _i++; return [3, 3]; case 6: logger.info("Added ".concat(documents.length, " documents to storage")); return [2]; } }); }); }; DocumentStorage.prototype.clearDocuments = function () { return __awaiter(this, void 0, void 0, function () { return __generator(this, function (_a) { switch (_a.label) { case 0: if 
(!!this.db) return [3, 2]; logger.info('Database not initialized, initializing now'); return [4, this.initialize()]; case 1: _a.sent(); _a.label = 2; case 2: if (!this.db) { logger.error('Failed to initialize database'); throw new Error('Failed to initialize database'); } this.db.set('documents', []).write(); this.tfidf = new TfIdf(); this.paragraphMap.clear(); logger.info('Cleared all documents'); return [2]; } }); }); }; DocumentStorage.prototype.highlightKeywords = function (text, keywords, format) { if (format === void 0) { format = 'markdown'; } if (!text || keywords.length === 0) return text; var highlightedText = text; var sortedKeywords = __spreadArray([], keywords, true).sort(function (a, b) { return b.length - a.length; }); var keywordPatterns = sortedKeywords.map(function (keyword) { var escapedKeyword = keyword.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); return "\\b".concat(escapedKeyword, "\\b"); }); var combinedPattern = new RegExp(keywordPatterns.join('|'), 'gi'); if (format === 'markdown') { highlightedText = highlightedText.replace(combinedPattern, function (match) { return "**".concat(match, "**"); }); } else { highlightedText = highlightedText.replace(combinedPattern, function (match) { return "<span class=\"highlight\">".concat(match, "</span>"); }); } return highlightedText; }; DocumentStorage.prototype.search = function (query_1) { return __awaiter(this, arguments, void 0, function (query, limit, filters, userId) { var dbDocuments, corpusDocuments, documentMap, documents, userPreferences, userInteractions, urlScores, queryTerms, similarQueries, userHistory, results; var _this = this; if (limit === void 0) { limit = 10; } return __generator(this, function (_a) { switch (_a.label) { case 0: if (!this.db || !this.tfidf) { logger.error('Failed to search: database or TF-IDF not initialized'); throw new Error('Failed to search: database or TF-IDF not initialized'); } if (!userId) return [3, 2]; return [4, this.recordSearchQuery(userId, query, filters)]; 
case 1: _a.sent(); _a.label = 2; case 2: dbDocuments = this.db.get('documents').value(); logger.debug("Retrieved ".concat(dbDocuments.length, " documents from database")); return [4, this.getCorpusDocuments()]; case 3: corpusDocuments = _a.sent(); logger.debug("Retrieved ".concat(corpusDocuments.length, " documents from corpus")); documentMap = new Map(); dbDocuments.forEach(function (doc) { documentMap.set(doc.url, doc); }); corpusDocuments.forEach(function (doc) { documentMap.set(doc.url, doc); }); documents = Array.from(documentMap.values()); logger.debug("Total unique documents after merging: ".concat(documents.length)); if (documents.length === 0) { logger.info('No documents in database or corpus directory to search'); return [2, []]; } userPreferences = null; userInteractions = []; if (!userId) return [3, 5]; return [4, this.getUserPreferences(userId)]; case 4: userPreferences = _a.sent(); userInteractions = this.db.get('interactions') .filter({ userId: userId }) .sortBy('timestamp') .reverse() .take(50) .value(); _a.label = 5; case 5: urlScores = new Map(); queryTerms = this.tokenizer.tokenize(query.toLowerCase()) || []; similarQueries = []; if (!userId) return [3, 7]; return [4, this.getUserSearchHistory(userId, 20)]; case 6: userHistory = _a.sent(); similarQueries = userHistory .filter(function (entry) { var entryTerms = _this.tokenizer.tokenize(entry.query.toLowerCase()) || []; var overlap = queryTerms.filter(function (term) { return entryTerms.includes(term); }).length; return overlap > 0 && overlap / Math.max(queryTerms.length, entryTerms.length) > 0.3; }) .map(function (entry) { return entry.query; }); _a.label = 7; case 7: documents.forEach(function (doc) { if (filters) { if (filters.categories && filters.categories.length > 0) { if (!doc.category || !filters.categories.includes(doc.category)) { return; } } if (filters.dateFrom && doc.timestamp && doc.timestamp < filters.dateFrom) { return; } if (filters.dateTo && doc.timestamp && doc.timestamp > 
filters.dateTo) { return; } if (filters.metadata) { for (var _i = 0, _a = Object.entries(filters.metadata); _i < _a.length; _i++) { var _b = _a[_i], key = _b[0], value = _b[1]; if (!doc.metadata || doc.metadata[key] !== value) { return; } } } } var score = _this.calculateDocumentScore(doc, query); if (userId && userPreferences) { if (userPreferences.preferredCategories && doc.category && userPreferences.preferredCategories.includes(doc.category)) { score *= 1.2; } var docInteractions = userInteractions.filter(function (i) { return i.documentUrl === doc.url; }); if (docInteractions.length > 0) { var mostRecent = docInteractions[0].timestamp; var now = Date.now(); var daysSinceInteraction = (now - mostRecent) / (1000 * 60 * 60 * 24); var recencyFactor = Math.max(0.1, Math.min(1, 1 - (daysSinceInteraction / 30))); var interactionBoost_1 = 0; docInteractions.forEach(function (interaction) { switch (interaction.interactionType) { case 'bookmark': interactionBoost_1 += 0.3; break; case 'view': if (interaction.durationMs) { interactionBoost_1 += Math.min(0.2, interaction.durationMs / 60000 * 0.1); } else { interactionBoost_1 += 0.1; } break; case 'click': interactionBoost_1 += 0.15; break; } }); score *= (1 + interactionBoost_1 * recencyFactor); } if (similarQueries.length > 0) { var relevantForSimilarQueries = userInteractions.some(function (i) { return i.documentUrl === doc.url && similarQueries.includes(i.query); }); if (relevantForSimilarQueries) { score *= 1.15; } } } urlScores.set(doc.url, score); }); results = []; Array.from(urlScores.entries()).forEach(function (_a) { var url = _a[0], score = _a[1]; var doc = documents.find(function (d) { return d.url === url; }); if (doc && score > 0) { var snippet = _this.createSnippet(doc.content, query, true); results.push({ url: doc.url, title: doc.title, snippet: _this.createSnippet(doc.content, query, false), highlightedSnippet: snippet, score: score, category: doc.category, paragraph: doc.content, highlightedContent: 
_this.highlightKeywords(doc.content, queryTerms), fullDocument: doc.content }); } }); results.sort(function (a, b) { return b.score - a.score; }); return [2, results.slice(0, limit)]; } }); }); }; DocumentStorage.prototype.searchForLLM = function (query_1) { return __awaiter(this, arguments, void 0, function (query, limit, filters, userId) { var searchResults, resultsByDocument, documentContents, documentEntries, i, _a, url, results, topResult, fullDocumentContent, dbDoc, filePath, fileExt, pdfBuffer, pdfData, error_4, error_5, queryTerms, relevantContext, topDocUrl, topDocFullContent, dbDoc, filePath, fileExt, pdfBuffer, pdfData, error_6, error_7, sources, topDocument; var _b, _c; if (limit === void 0) { limit = 10; } return __generator(this, function (_d) { switch (_d.label) { case 0: return [4, this.search(query, limit, filters, userId)]; case 1: searchResults = _d.sent(); logger.debug("Search returned ".concat(searchResults.length, " results")); if (searchResults.length === 0) { return [2, { content: "No results found for your query.", sources: [] }]; } resultsByDocument = new Map(); searchResults.forEach(function (result) { var results = resultsByDocument.get(result.url) || []; results.push(result); resultsByDocument.set(result.url, results); }); documentContents = []; documentEntries = Array.from(resultsByDocument.entries()); i = 0; _d.label = 2; case 2: if (!(i < documentEntries.length)) return [3, 13]; _a = documentEntries[i], url = _a[0], results = _a[1]; topResult = results.reduce(function (prev, current) { return (current.score > prev.score) ? current : prev; }); fullDocumentContent = void 0; dbDoc = (_b = this.db) === null || _b === void 0 ? 
void 0 : _b.get('documents').find({ url: url }).value(); if (!(dbDoc && dbDoc.content)) return [3, 3]; fullDocumentContent = dbDoc.content; return [3, 11]; case 3: if (!url.startsWith('file://')) return [3, 11]; _d.label = 4; case 4: _d.trys.push([4, 10, , 11]); filePath = url.replace('file://', ''); if (!fs_1.default.existsSync(filePath)) return [3, 9]; fileExt = path_1.default.extname(filePath).toLowerCase(); if (!(fileExt === '.txt' || fileExt === '.md')) return [3, 5]; fullDocumentContent = fs_1.default.readFileSync(filePath, 'utf8'); return [3, 9]; case 5: if (!(fileExt === '.pdf')) return [3, 9]; _d.label = 6; case 6: _d.trys.push([6, 8, , 9]); pdfBuffer = fs_1.default.readFileSync(filePath); return [4, (0, pdf_parse_1.default)(pdfBuffer)]; case 7: pdfData = _d.sent(); fullDocumentContent = pdfData.text; return [3, 9]; case 8: error_4 = _d.sent(); logger.error("Error reading PDF file for fullDocument: ".concat(filePath), { error: error_4 }); return [3, 9]; case 9: return [3, 11]; case 10: error_5 = _d.sent(); logger.error("Error reading file for fullDocument: ".concat(url), { error: error_5 }); return [3, 11]; case 11: if (!fullDocumentContent) { fullDocumentContent = topResult.snippet; } queryTerms = this.tokenizer.tokenize(query.toLowerCase()) || []; relevantContext = fullDocumentContent ? this.extractRelevantContext(fullDocumentContent, queryTerms, query, 500) : topResult.snippet; documentContents.push({ url: url, title: topResult.title, paragraph: relevantContext, highlightedParagraph: this.highlightKeywords(relevantContext, queryTerms), score: topResult.score, category: topResult.category, fullDocument: undefined }); _d.label = 12; case 12: i++; return [3, 2]; case 13: documentContents.sort(function (a, b) { return b.score - a.score; }); if