UNPKG

@vjlanguage/mcp-vj-docs

Version:

MCP server for documentation crawling, indexing, and retrieval

1,002 lines 140 kB
#!/usr/bin/env node "use strict"; var __assign = (this && this.__assign) || function () { __assign = Object.assign || function(t) { for (var s, i = 1, n = arguments.length; i < n; i++) { s = arguments[i]; for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p)) t[p] = s[p]; } return t; }; return __assign.apply(this, arguments); }; var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; var __generator = (this && this.__generator) || function (thisArg, body) { var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g = Object.create((typeof Iterator === "function" ? Iterator : Object).prototype); return g.next = verb(0), g["throw"] = verb(1), g["return"] = verb(2), typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g; function verb(n) { return function (v) { return step([n, v]); }; } function step(op) { if (f) throw new TypeError("Generator is already executing."); while (g && (g = 0, op[0] && (_ = 0)), _) try { if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t; if (y = 0, t) op = [op[0] & 2, t.value]; switch (op[0]) { case 0: case 1: t = op; break; case 4: _.label++; return { value: op[1], done: false }; case 5: _.label++; y = op[1]; op = [0]; continue; case 7: op = _.ops.pop(); _.trys.pop(); continue; default: if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; } if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; } if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; } if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; } if (t[2]) _.ops.pop(); _.trys.pop(); continue; } op = body.call(thisArg, _); } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; } if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true }; } }; var __rest = (this && this.__rest) || function (s, e) { var t = {}; for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p) && e.indexOf(p) < 0) t[p] = s[p]; if (s != null && typeof Object.getOwnPropertySymbols === "function") for (var i = 0, p = Object.getOwnPropertySymbols(s); i < p.length; i++) { if (e.indexOf(p[i]) < 0 && Object.prototype.propertyIsEnumerable.call(s, p[i])) t[p[i]] = s[p[i]]; } return t; }; var __spreadArray = (this && this.__spreadArray) || function (to, from, pack) { if (pack || arguments.length === 2) for (var i = 0, l = from.length, ar; i < l; i++) { if (ar || !(i in from)) { if (!ar) ar = Array.prototype.slice.call(from, 0, i); ar[i] = from[i]; } } return to.concat(ar || Array.prototype.slice.call(from)); }; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); var index_js_1 = require("@modelcontextprotocol/sdk/server/index.js"); var stdio_js_1 = require("@modelcontextprotocol/sdk/server/stdio.js"); var types_js_1 = require("@modelcontextprotocol/sdk/types.js"); var path_1 = __importDefault(require("path")); var fs_1 = __importDefault(require("fs")); var lowdb_1 = __importDefault(require("lowdb")); var FileSync_js_1 = __importDefault(require("lowdb/adapters/FileSync.js")); var natural_1 = __importDefault(require("natural")); var TfIdf = natural_1.default.TfIdf, PorterStemmer = natural_1.default.PorterStemmer, WordTokenizer = natural_1.default.WordTokenizer; var firecrawl_js_1 = __importDefault(require("@mendable/firecrawl-js")); var os_1 = __importDefault(require("os")); var winston_1 = __importDefault(require("winston")); var pdf_parse_1 = __importDefault(require("pdf-parse")); var buffer_1 = require("buffer"); // Configure logger var logDir = expandTildePath(process.env.VJDOC_LOG_DIR || './logs'); // Ensure log directory exists if (!fs_1.default.existsSync(logDir)) { try { fs_1.default.mkdirSync(logDir, { recursive: true }); } catch (error) { console.error("Error creating log directory: ".concat(error)); } } var logLevel = process.env.VJDOC_LOG_LEVEL || 'info'; var logToFile = process.env.VJDOC_LOG_TO_FILE !== 'false'; // Create logger instance var logger = winston_1.default.createLogger({ level: logLevel, format: winston_1.default.format.combine(winston_1.default.format.timestamp(), winston_1.default.format.json()), defaultMeta: { service: 'mcp-vj-docs' }, transports: [ new winston_1.default.transports.Console({ format: winston_1.default.format.combine(winston_1.default.format.colorize(), winston_1.default.format.printf(function (info) { var timestamp = info.timestamp, level = info.level, message = info.message, meta = __rest(info, ["timestamp", "level", "message"]); return "".concat(timestamp, " ").concat(level, ": ").concat(message, " ").concat(Object.keys(meta).length ? JSON.stringify(meta, null, 2) : ''); })) }) ] }); // Add file transport if enabled if (logToFile) { logger.add(new winston_1.default.transports.File({ filename: path_1.default.join(logDir, 'error.log'), level: 'error' })); logger.add(new winston_1.default.transports.File({ filename: path_1.default.join(logDir, 'combined.log') })); } logger.info('Logger initialized', { logLevel: logLevel, logToFile: logToFile, logDir: logDir }); // Utility function to expand tilde in paths function expandTildePath(filePath) { if (!filePath || typeof filePath !== 'string') { return filePath; } // Check if path starts with tilde if (filePath.startsWith('~/') || filePath === '~') { return path_1.default.join(os_1.default.homedir(), filePath.substring(1)); } else if (filePath.startsWith('~')) { // Handle ~username/path format (not common in Node.js contexts) var userEnd = filePath.indexOf('/'); if (userEnd === -1) { return filePath; // Can't handle this case easily in Node.js } var username = filePath.substring(1, userEnd); // This is a simplification; proper handling would require OS-specific logic return filePath; } return filePath; } // Define tools var CRAWL_TOOL = { name: "vjdoc_crawl", description: "Crawl a website and index its content for search", inputSchema: { type: "object", properties: { url: { type: "string", description: "URL to crawl", }, maxDepth: { type: "number", description: "Maximum depth to crawl", }, maxPages: { type: "number", description: "Maximum number of pages to crawl", }, includePatterns: { type: "array", items: { type: "string", }, description: "Patterns to include in crawl", }, excludePatterns: { type: "array", items: { type: "string", }, description: "Patterns to exclude from crawl", }, defaultCategory: { type: "string", description: "Default category for documents if not detected automatically", }, }, required: ["url"], }, }; var SEARCH_TOOL = { name: "vjdoc_search", description: "Search indexed documents", inputSchema: { type: "object", properties: { query: { type: "string", description: "Search query" }, limit: { type: "number", description: "Maximum number of results to return" }, filters: { type: "object", description: "Optional filters to narrow down search results", properties: { categories: { type: "array", description: "Filter by document categories (e.g., 'API Documentation', 'Tutorial')", items: { type: "string" } }, dateFrom: { type: "number", description: "Filter documents created after this timestamp (in milliseconds)" }, dateTo: { type: "number", description: "Filter documents created before this timestamp (in milliseconds)" }, metadata: { type: "object", description: "Filter by metadata fields (key-value pairs)", additionalProperties: true } } }, userId: { type: "string", description: "Optional user ID for personalized results" } }, required: ["query"] }, }; var ADD_CORPUS_FILE_TOOL = { name: "vjdoc_add_corpus_file", description: "Add a corpus file to the TF-IDF files directory", inputSchema: { type: "object", properties: { content: { type: "string", description: "Content to add to the corpus file (alternative to filePath)" }, filePath: { type: "string", description: "Absolute path to a file to add to the corpus (alternative to content)" }, filename: { type: "string", description: "Optional filename for the corpus file (without extension)" }, category: { type: "string", description: "Optional category for the corpus file (e.g., 'Code Snippet', 'API Documentation', 'Error Solution', 'Technical Note')" }, contentType: { type: "string", description: "Optional content type for the corpus file (e.g., 'text', 'markdown', 'pdf-base64')" } }, required: [] } }; var GET_DOCS_META_TOOL = { name: "vjdoc_get_docs_meta", description: "Get metadata information about all documents and corpus files", inputSchema: { type: "object", properties: { query: { type: "string", description: "Natural language query or requirement" } }, required: ["query"] } }; var GET_DOCUMENT_TOOL = { name: "vjdoc_get_document", description: "Get the full content of a specific document by URL or title", inputSchema: { type: "object", properties: { url: { type: "string", description: "URL of the document to retrieve (optional if title is provided)" }, title: { type: "string", description: "Title of the document to retrieve (optional if url is provided)" } } } }; // Type guard functions for parameter validation function isCrawlArgs(args) { logger.debug('Validating crawl args', { argsType: typeof args, args: args, isObject: typeof args === "object", isNull: args === null, hasUrlProp: args !== null && typeof args === "object" && "url" in args, urlType: args !== null && typeof args === "object" && "url" in args ? typeof args.url : "N/A" }); if (args === undefined || args === null) { return false; } return (typeof args === "object" && args !== null && "url" in args && typeof args.url === "string"); } function isSearchArgs(args) { logger.debug('Validating search args', { argsType: typeof args, args: args, isObject: typeof args === "object", isNull: args === null, hasQueryProp: args !== null && typeof args === "object" && "query" in args, queryType: args !== null && typeof args === "object" && "query" in args ? typeof args.query : "N/A" }); return (args !== null && typeof args === "object" && "query" in args && typeof args.query === "string" && (!("limit" in args) || typeof args.limit === "number") && (!("filters" in args) || typeof args.filters === "object") && (!("userId" in args) || typeof args.userId === "string")); } function isAddCorpusFileArgs(args) { return (args !== null && typeof args === 'object' && ((typeof args.content === 'string' && args.content.trim() !== '') || (typeof args.filePath === 'string' && args.filePath.trim() !== '')) && (args.filename === undefined || typeof args.filename === 'string') && (args.category === undefined || typeof args.category === 'string') && (args.contentType === undefined || typeof args.contentType === 'string')); } function isGetDocsMetaArgs(args) { return (args !== null && typeof args === "object" && "query" in args && typeof args.query === "string"); } function isGetDocumentArgs(args) { logger.debug('Validating get document args', { argsType: typeof args, args: args, isObject: typeof args === "object", isNull: args === null, hasUrlProp: args !== null && typeof args === "object" && "url" in args, urlType: args !== null && typeof args === "object" && "url" in args ? typeof args.url : "N/A", hasTitleProp: args !== null && typeof args === "object" && "title" in args, titleType: args !== null && typeof args === "object" && "title" in args ? typeof args.title : "N/A" }); if (args === undefined || args === null) { return false; } return (typeof args === "object" && args !== null && (("url" in args && typeof args.url === "string") || ("title" in args && typeof args.title === "string"))); } // Utility functions for document metadata extraction /** * Extract keywords from document content * @param content Document content * @param limit Maximum number of keywords to extract * @returns Array of keywords */ function extractKeywords(content, limit) { if (limit === void 0) { limit = 10; } if (!content) return []; // Use simple TF-IDF idea to extract keywords var tokenizer = new WordTokenizer(); var tokens = tokenizer.tokenize(content.toLowerCase()) || []; // Filter out stop words and short words var stopWords = ['a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'of', 'is', 'are', 'was', 'were']; var filteredTokens = tokens.filter(function (token) { return token.length > 2 && !stopWords.includes(token) && !/^\d+$/.test(token); }); // Calculate word frequency var wordFreq = {}; filteredTokens.forEach(function (token) { wordFreq[token] = (wordFreq[token] || 0) + 1; }); // Sort by word frequency and return top N return Object.entries(wordFreq) .sort(function (a, b) { return b[1] - a[1]; }) .slice(0, limit) .map(function (entry) { return entry[0]; }); } /** * Generate a summary of document content * @param content Document content * @param maxLength Maximum length of summary * @returns Summary text */ function generateSummary(content, maxLength) { if (maxLength === void 0) { maxLength = 150; } if (!content) return ''; // Simple method: take the first N characters as the summary var summary = content.trim().substring(0, maxLength); // Ensure the summary doesn't cut off in the middle of a word var lastSpaceIndex = summary.lastIndexOf(' '); if (lastSpaceIndex > 0 && summary.length === maxLength) { return summary.substring(0, lastSpaceIndex) + '...'; } return summary.length < content.length ? summary + '...' : summary; } // DocumentStorage class var DocumentStorage = /** @class */ (function () { function DocumentStorage(options) { this.db = null; this.tfidf = null; this.tokenizer = new WordTokenizer(); // Add a map to store paragraph information this.paragraphMap = new Map(); // Ensure paths have tilde expanded this.options = __assign(__assign({}, options), { dbPath: expandTildePath(options.dbPath), tfidfFilesDir: options.tfidfFilesDir ? expandTildePath(options.tfidfFilesDir) : undefined }); logger.info("DocumentStorage initialized with dbPath: ".concat(this.options.dbPath, ", tfidfFilesDir: ").concat(this.options.tfidfFilesDir || 'not set')); // Ensure database directory exists var dbDir = path_1.default.dirname(this.options.dbPath); logger.info("Ensuring database directory exists: ".concat(dbDir)); if (!fs_1.default.existsSync(dbDir)) { try { fs_1.default.mkdirSync(dbDir, { recursive: true }); logger.info("Created database directory: ".concat(dbDir)); } catch (error) { logger.error("Error creating database directory", { error: error, dbDir: dbDir }); throw new Error("Failed to create database directory: ".concat(error)); } } // Ensure TF-IDF files directory exists if specified if (this.options.tfidfFilesDir) { logger.info("Ensuring TF-IDF files directory exists: ".concat(this.options.tfidfFilesDir)); if (!fs_1.default.existsSync(this.options.tfidfFilesDir)) { try { fs_1.default.mkdirSync(this.options.tfidfFilesDir, { recursive: true }); logger.info("Created TF-IDF files directory: ".concat(this.options.tfidfFilesDir)); } catch (error) { logger.error("Error creating TF-IDF files directory", { error: error, dir: this.options.tfidfFilesDir }); throw new Error("Failed to create TF-IDF files directory: ".concat(error)); } } } } /** * Initialize the database and search index */ DocumentStorage.prototype.initialize = function () { return __awaiter(this, void 0, void 0, function () { var adapter, documents, error_1; var _this = this; return __generator(this, function (_a) { switch (_a.label) { case 0: logger.info("Initializing database from: ".concat(this.options.dbPath)); _a.label = 1; case 1: _a.trys.push([1, 4, , 5]); adapter = new FileSync_js_1.default(this.options.dbPath); this.db = (0, lowdb_1.default)(adapter); // Initialize the database with default values if empty this.db.defaults({ documents: [], links: [], searchHistory: [], interactions: [], userPreferences: [] }).write(); // Initialize TF-IDF this.tfidf = new TfIdf(); if (!(this.options.tfidfFilesDir && fs_1.default.existsSync(this.options.tfidfFilesDir))) return [3 /*break*/, 3]; return [4 /*yield*/, this.loadTfidfFiles()]; case 2: _a.sent(); _a.label = 3; case 3: documents = this.db.get('documents').value(); if (documents.length > 0) { logger.info("Indexing ".concat(documents.length, " existing documents")); documents.forEach(function (doc, index) { // Process document into paragraphs var paragraphs = _this.processDocumentParagraphs(doc, index); // Add each paragraph to the TF-IDF index paragraphs.forEach(function (paragraph, paragraphId) { var docKey = "doc_".concat(index, "_p").concat(paragraphId); if (_this.tfidf) { _this.tfidf.addDocument(_this.processTextForSearch(paragraph.content), docKey); } // Store paragraph information for later retrieval _this.paragraphMap.set(docKey, paragraph); }); }); logger.info("Indexed ".concat(this.paragraphMap.size, " paragraphs for search")); } else { logger.info('No existing documents to index'); } logger.info('Database and search index initialized successfully'); return [3 /*break*/, 5]; case 4: error_1 = _a.sent(); logger.error('Error initializing database', { error: error_1 }); throw new Error("Failed to initialize database: ".concat(error_1)); case 5: return [2 /*return*/]; } }); }); }; /** * Load text files from the TF-IDF files directory */ DocumentStorage.prototype.loadTfidfFiles = function () { return __awaiter(this, void 0, void 0, function () { var files, _i, files_1, file, pdfBuffer, pdfData, markdownText, pdfError_1, fileContent, error_2, error_3; var _this = this; return __generator(this, function (_a) { switch (_a.label) { case 0: if (!this.options.tfidfFilesDir || !this.tfidf) { return [2 /*return*/]; } _a.label = 1; case 1: _a.trys.push([1, 13, , 14]); files = fs_1.default.readdirSync(this.options.tfidfFilesDir) .filter(function (file) { return ['.txt', '.md', '.pdf'].some(function (ext) { return file.endsWith(ext); }); }) .map(function (file) { return path_1.default.join(_this.options.tfidfFilesDir, file); }); if (files.length === 0) { logger.info("No text files found in TF-IDF directory: ".concat(this.options.tfidfFilesDir)); return [2 /*return*/]; } logger.info("Loading ".concat(files.length, " text files for TF-IDF calculation")); _i = 0, files_1 = files; _a.label = 2; case 2: if (!(_i < files_1.length)) return [3 /*break*/, 12]; file = files_1[_i]; _a.label = 3; case 3: _a.trys.push([3, 10, , 11]); if (!file.endsWith('.pdf')) return [3 /*break*/, 8]; pdfBuffer = fs_1.default.readFileSync(file); _a.label = 4; case 4: _a.trys.push([4, 6, , 7]); return [4 /*yield*/, (0, pdf_parse_1.default)(pdfBuffer)]; case 5: pdfData = _a.sent(); // Use our custom converter to generate markdown try { markdownText = this.convertPdfTextToMarkdown(pdfData.text); this.tfidf.addDocument(markdownText, file); logger.info("Added PDF (converted to Markdown) to TF-IDF index: ".concat(file)); } catch (mdError) { // Fallback to plain text this.tfidf.addDocument(pdfData.text, file); logger.info("Added PDF (plain text) to TF-IDF index: ".concat(file)); } return [3 /*break*/, 7]; case 6: pdfError_1 = _a.sent(); logger.error("Error parsing PDF file: ".concat(file), { error: pdfError_1 }); // Skip this file and continue with others return [3 /*break*/, 11]; case 7: return [3 /*break*/, 9]; case 8: fileContent = fs_1.default.readFileSync(file, 'utf8'); this.tfidf.addDocument(fileContent, file); logger.info("Added TF-IDF file: ".concat(file)); _a.label = 9; case 9: return [3 /*break*/, 11]; case 10: error_2 = _a.sent(); logger.error("Error adding TF-IDF file: ".concat(file), { error: error_2 }); return [3 /*break*/, 11]; case 11: _i++; return [3 /*break*/, 2]; case 12: return [3 /*break*/, 14]; case 13: error_3 = _a.sent(); logger.error("Error loading TF-IDF files", { error: error_3, dir: this.options.tfidfFilesDir }); return [3 /*break*/, 14]; case 14: return [2 /*return*/]; } }); }); }; /** * Process text for search (tokenize, stem, etc.) */ DocumentStorage.prototype.processTextForSearch = function (text) { if (!text) return []; // Tokenize the text var tokens = this.tokenizer.tokenize(text.toLowerCase()) || []; // Remove stopwords and stem tokens var processedTokens = tokens .filter(function (token) { return token.length > 2; }) // Filter out very short tokens .map(function (token) { return PorterStemmer.stem(token); }); return processedTokens; }; /** * Split document content into paragraphs and process for indexing * @param doc Document to process * @param docIndex Index of the document * @returns Array of processed paragraphs */ DocumentStorage.prototype.processDocumentParagraphs = function (doc, docIndex) { if (!doc.content) return []; // Split content into paragraphs (split by double newline) var paragraphs = doc.content.split('\n\n').filter(function (p) { return p.trim().length > 0; }); // Create paragraph objects return paragraphs.map(function (content, paragraphId) { return ({ docId: docIndex, paragraphId: paragraphId, content: content, url: doc.url, title: doc.title }); }); }; /** * Store documents in the database */ DocumentStorage.prototype.storeDocuments = function (documents_1) { return __awaiter(this, arguments, void 0, function (documents, links) { var existingDocs, existingLinks, updatedDocs, updatedLinks; var _this = this; if (links === void 0) { links = []; } return __generator(this, function (_a) { switch (_a.label) { case 0: if (!!this.db) return [3 /*break*/, 2]; logger.info('Database not initialized, initializing now'); return [4 /*yield*/, this.initialize()]; case 1: _a.sent(); _a.label = 2; case 2: if (!this.db) { logger.error('Failed to initialize database'); throw new Error('Failed to initialize database'); } existingDocs = this.db.get('documents').value(); existingLinks = this.db.get('links').value(); updatedDocs = __spreadArray(__spreadArray([], existingDocs, true), documents, true); updatedLinks = __spreadArray(__spreadArray([], existingLinks, true), links, true); // Update database this.db.set('documents', updatedDocs).write(); this.db.set('links', updatedLinks).write(); // Reset TF-IDF index this.tfidf = new TfIdf(); this.paragraphMap.clear(); // Index all documents at paragraph level updatedDocs.forEach(function (doc, docIndex) { // Process document into paragraphs var paragraphs = _this.processDocumentParagraphs(doc, docIndex); // Add each paragraph to the TF-IDF index paragraphs.forEach(function (paragraph, paragraphId) { var docKey = "doc_".concat(docIndex, "_p").concat(paragraphId); if (_this.tfidf) { // Add document to TF-IDF index // Note: TfIdf.addDocument accepts either a string or string[] as the first parameter // and a document identifier (string) as the second parameter _this.tfidf.addDocument(_this.processTextForSearch(paragraph.content), docKey); } // Store paragraph information for later retrieval _this.paragraphMap.set(docKey, paragraph); }); }); logger.info("Stored ".concat(documents.length, " documents and ").concat(links.length, " links. Total: ").concat(updatedDocs.length, " documents, ").concat(updatedLinks.length, " links")); logger.info("Indexed ".concat(this.paragraphMap.size, " paragraphs for search")); return [2 /*return*/]; } }); }); }; /** * Add multiple documents to the storage */ DocumentStorage.prototype.addDocuments = function (documents) { return __awaiter(this, void 0, void 0, function () { var _i, documents_1, doc; return __generator(this, function (_a) { switch (_a.label) { case 0: if (!!this.db) return [3 /*break*/, 2]; logger.info('Database not initialized, initializing now'); return [4 /*yield*/, this.initialize()]; case 1: _a.sent(); _a.label = 2; case 2: if (!this.db) { logger.error('Failed to initialize database'); throw new Error('Failed to initialize database'); } _i = 0, documents_1 = documents; _a.label = 3; case 3: if (!(_i < documents_1.length)) return [3 /*break*/, 6]; doc = documents_1[_i]; return [4 /*yield*/, this.addDocument(doc)]; case 4: _a.sent(); _a.label = 5; case 5: _i++; return [3 /*break*/, 3]; case 6: logger.info("Added ".concat(documents.length, " documents to storage")); return [2 /*return*/]; } }); }); }; /** * Clear all documents */ DocumentStorage.prototype.clearDocuments = function () { return __awaiter(this, void 0, void 0, function () { return __generator(this, function (_a) { switch (_a.label) { case 0: if (!!this.db) return [3 /*break*/, 2]; logger.info('Database not initialized, initializing now'); return [4 /*yield*/, this.initialize()]; case 1: _a.sent(); _a.label = 2; case 2: if (!this.db) { logger.error('Failed to initialize database'); throw new Error('Failed to initialize database'); } this.db.set('documents', []).write(); this.tfidf = new TfIdf(); this.paragraphMap.clear(); logger.info('Cleared all documents'); return [2 /*return*/]; } }); }); }; /** * Highlight keywords in text using Markdown or HTML * @param text Text to highlight keywords in * @param keywords Keywords to highlight * @param format Format to use for highlighting ('markdown' or 'html') * @returns Text with highlighted keywords */ DocumentStorage.prototype.highlightKeywords = function (text, keywords, format) { if (format === void 0) { format = 'markdown'; } if (!text || keywords.length === 0) return text; // Create a copy of the text to avoid modifying the original var highlightedText = text; // Sort keywords by length (descending) to avoid highlighting parts of longer keywords var sortedKeywords = __spreadArray([], keywords, true).sort(function (a, b) { return b.length - a.length; }); // Create a regex pattern that matches whole words only // This avoids highlighting parts of words (e.g., "cat" in "category") var keywordPatterns = sortedKeywords.map(function (keyword) { // Escape special regex characters var escapedKeyword = keyword.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); // Match whole words only return "\\b".concat(escapedKeyword, "\\b"); }); // Create a combined pattern with word boundaries var combinedPattern = new RegExp(keywordPatterns.join('|'), 'gi'); // Replace keywords with highlighted versions if (format === 'markdown') { // Use Markdown bold syntax for highlighting highlightedText = highlightedText.replace(combinedPattern, function (match) { return "**".concat(match, "**"); }); } else { // Use HTML span tags with a highlight class for highlighting highlightedText = highlightedText.replace(combinedPattern, function (match) { return "<span class=\"highlight\">".concat(match, "</span>"); }); } return highlightedText; }; /** * Search for documents matching a query */ DocumentStorage.prototype.search = function (query_1) { return __awaiter(this, arguments, void 0, function (query, limit, filters, userId) { var dbDocuments, corpusDocuments, documentMap, documents, userPreferences, userInteractions, urlScores, queryTerms, similarQueries, userHistory, results; var _this = this; if (limit === void 0) { limit = 10; } return __generator(this, function (_a) { switch (_a.label) { case 0: if (!this.db || !this.tfidf) { logger.error('Failed to search: database or TF-IDF not initialized'); throw new Error('Failed to search: database or TF-IDF not initialized'); } if (!userId) return [3 /*break*/, 2]; return [4 /*yield*/, this.recordSearchQuery(userId, query, filters)]; case 1: _a.sent(); _a.label = 2; case 2: dbDocuments = this.db.get('documents').value(); logger.debug("Retrieved ".concat(dbDocuments.length, " documents from database")); return [4 /*yield*/, this.getCorpusDocuments()]; case 3: corpusDocuments = _a.sent(); logger.debug("Retrieved ".concat(corpusDocuments.length, " documents from corpus")); documentMap = new Map(); // Add database documents to map dbDocuments.forEach(function (doc) { documentMap.set(doc.url, doc); }); // Add corpus documents to map (will overwrite db documents with same URL if any) corpusDocuments.forEach(function (doc) { documentMap.set(doc.url, doc); }); documents = Array.from(documentMap.values()); logger.debug("Total unique documents after merging: ".concat(documents.length)); if (documents.length === 0) { logger.info('No documents in database or corpus directory to search'); return [2 /*return*/, []]; } userPreferences = null; userInteractions = []; if (!userId) return [3 /*break*/, 5]; return [4 /*yield*/, this.getUserPreferences(userId)]; case 4: userPreferences = _a.sent(); // Get recent interactions for this user userInteractions = this.db.get('interactions') .filter({ userId: userId }) .sortBy('timestamp') .reverse() .take(50) // Consider last 50 interactions .value(); _a.label = 5; case 5: urlScores = new Map(); queryTerms = this.tokenizer.tokenize(query.toLowerCase()) || []; similarQueries = []; if (!userId) return [3 /*break*/, 7]; return [4 /*yield*/, this.getUserSearchHistory(userId, 20)]; case 6: userHistory = _a.sent(); // Find similar past queries using simple term overlap similarQueries = userHistory .filter(function (entry) { var entryTerms = _this.tokenizer.tokenize(entry.query.toLowerCase()) || []; // Calculate term overlap var overlap = queryTerms.filter(function (term) { return entryTerms.includes(term); }).length; return overlap > 0 && overlap / Math.max(queryTerms.length, entryTerms.length) > 0.3; }) .map(function (entry) { return entry.query; }); _a.label = 7; case 7: // Calculate TF-IDF scores for documents documents.forEach(function (doc) { // Skip documents that don't match filters if (filters) { // Filter by categories if specified if (filters.categories && filters.categories.length > 0) { if (!doc.category || !filters.categories.includes(doc.category)) { return; } } // Filter by date range if specified if (filters.dateFrom && doc.timestamp && doc.timestamp < filters.dateFrom) { return; } if (filters.dateTo && doc.timestamp && doc.timestamp > filters.dateTo) { return; } // Filter by metadata if specified if (filters.metadata) { for (var _i = 0, _a = Object.entries(filters.metadata); _i < _a.length; _i++) { var _b = _a[_i], key = _b[0], value = _b[1]; if (!doc.metadata || doc.metadata[key] !== value) { return; } } } } // Calculate base relevance score var score = _this.calculateDocumentScore(doc, query); // Apply personalization if userId is provided if (userId && userPreferences) { // Boost score based on user's preferred categories if (userPreferences.preferredCategories && doc.category && userPreferences.preferredCategories.includes(doc.category)) { score *= 1.2; // 20% boost for preferred categories } // Boost score based on past interactions var docInteractions = userInteractions.filter(function (i) { return i.documentUrl === doc.url; }); if (docInteractions.length > 0) { // Calculate recency factor (more recent = higher boost) var mostRecent = docInteractions[0].timestamp; var now = Date.now(); var daysSinceInteraction = (now - mostRecent) / (1000 * 60 * 60 * 24); var recencyFactor = Math.max(0.1, Math.min(1, 1 - (daysSinceInteraction / 30))); // Decay over 30 days // Calculate interaction boost var interactionBoost_1 = 0; docInteractions.forEach(function (interaction) { switch (interaction.interactionType) { case 'bookmark': interactionBoost_1 += 0.3; // Strong signal break; case 'view': // Longer views get higher boost if (interaction.durationMs) { interactionBoost_1 += Math.min(0.2, interaction.durationMs / 60000 * 0.1); // Up to 0.2 for 2+ minute views } else { interactionBoost_1 += 0.1; } break; case 'click': interactionBoost_1 += 0.15; break; } }); // Apply interaction boost with recency factor score *= (1 + interactionBoost_1 * recencyFactor); } // Boost score based on similar queries if (similarQueries.length > 0) { // Check if document was relevant for similar queries var relevantForSimilarQueries = userInteractions.some(function (i) { return i.documentUrl === doc.url && similarQueries.includes(i.query); }); if (relevantForSimilarQueries) { score *= 1.15; // 15% boost for documents relevant to similar queries } } } urlScores.set(doc.url, score); }); results = []; Array.from(urlScores.entries()).forEach(function (_a) { var url = _a[0], score = _a[1]; var doc = documents.find(function (d) { return d.url === url; }); // Only include documents with score > 0 if (doc && score > 0) { // Create snippet with highlighting var snippet = _this.createSnippet(doc.content, query, true); results.push({ url: doc.url, title: doc.title, snippet: _this.createSnippet(doc.content, query, false), highlightedSnippet: snippet, score: score, category: doc.category, paragraph: doc.content, highlightedContent: _this.highlightKeywords(doc.content, queryTerms), fullDocument: doc.content }); } }); // Sort by score (descending) results.sort(function (a, b) { return b.score - a.score; }); // Return limited results return [2 /*return*/, results.slice(0, limit)]; } }); }); }; /** * Search for documents and aggregate results for LLM consumption * This method aggregates multiple search results into a single document * optimized for large language models */ DocumentStorage.prototype.searchForLLM = function (query_1) { return __awaiter(this, arguments, void 0, function (query, limit, filters, userId) { var searchResults, resultsByDocument, documentContents, documentEntries, i, _a, url, results, topResult, fullDocumentContent, dbDoc, filePath, fileExt, pdfBuffer, pdfData, error_4, error_5, queryTerms, relevantContext, topDocUrl, topDocFullContent, dbDoc, filePath, fileExt, pdfBuffer, pdfData, error_6, error_7, sources, topDocument; var _b, _c; if (limit === void 0) { limit = 10; } return __generator(this, function (_d) { switch (_d.label) { case 0: return [4 /*yield*/, this.search(query, limit, filters, userId)]; case 1: searchResults = _d.sent(); logger.debug("Search returned ".concat(searchResults.length, " results")); if (searchResults.length === 0) { return [2 /*return*/, { content: "No results found for your query.", sources: [] }]; } resultsByDocument