@vjlanguage/mcp-vj-docs
Version:
MCP server for documentation crawling, indexing, and retrieval
1,002 lines • 140 kB
JavaScript
#!/usr/bin/env node
"use strict";
// Compiler helper: shallow-merge the own enumerable string-keyed properties of
// every source argument into the first argument and return that first argument.
var __assign = (this && this.__assign) || function () {
    // On first use, rebind to the native Object.assign when present, otherwise
    // to the manual fallback below; later calls go straight to that binding.
    __assign = Object.assign || function (target) {
        var argCount = arguments.length;
        for (var argIndex = 1; argIndex < argCount; argIndex++) {
            var source = arguments[argIndex];
            for (var key in source) {
                if (Object.prototype.hasOwnProperty.call(source, key)) {
                    target[key] = source[key];
                }
            }
        }
        return target;
    };
    return __assign.apply(this, arguments);
};
// Compiler helper that runs a generator function as an async function.
// `generator` is invoked with `thisArg` and `_arguments`; `P` is the promise
// constructor to use and falls back to the global Promise when falsy.
// Returns a promise that settles with the generator's final value or error.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wrap anything that is not already a P instance so `.then` can be chained on it.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with a fulfilled value; a synchronous throw rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Feed a rejection into the generator through its "throw" method.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Finished: resolve with the result. Otherwise wait for the yielded value, then resume.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// Instantiate the generator and take the first step.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
// Compiler helper that emulates a generator with a label-driven state machine.
// `body` is called repeatedly with the state object `_` and returns an
// instruction array `[opcode, operand]`. Opcodes annotated elsewhere in this
// file are 2 (return), 3 (break / jump to label) and 4 (yield); in this
// function 5 installs a delegate iterator, 6 is produced when `body` throws,
// and 7 pops the pending op and the innermost try entry.
// State object `_`: `label` is the next case to run, `sent()` returns the value
// passed to next() (or throws it when resumed via "throw"), `trys` is the stack
// of active try regions and `ops` holds instructions deferred by a region.
// NOTE(review): try entries appear to be [start, catch, finally, end] labels,
// judging by how t[0]..t[3] are compared below — confirm against the TypeScript source.
var __generator = (this && this.__generator) || function (thisArg, body) {
var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g = Object.create((typeof Iterator === "function" ? Iterator : Object).prototype);
// Expose next/throw/return (opcodes 0/1/2) and make the object iterable when Symbol exists.
return g.next = verb(0), g["throw"] = verb(1), g["return"] = verb(2), typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
function verb(n) { return function (v) { return step([n, v]); }; }
function step(op) {
// `f` is the re-entrancy flag: set while the body runs, cleared in the finally below.
if (f) throw new TypeError("Generator is already executing.");
while (g && (g = 0, op[0] && (_ = 0)), _) try {
// While a delegate iterator `y` is installed, forward the operation to it first.
if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
if (y = 0, t) op = [op[0] & 2, t.value];
switch (op[0]) {
case 0: case 1: t = op; break;
case 4: _.label++; return { value: op[1], done: false };
case 5: _.label++; y = op[1]; op = [0]; continue;
case 7: op = _.ops.pop(); _.trys.pop(); continue;
default:
// Opcodes 2, 3 and 6: route through the innermost try entry, if there is one.
if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
if (t[2]) _.ops.pop();
_.trys.pop(); continue;
}
op = body.call(thisArg, _);
} catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
// Loop ended: rethrow when the opcode has bit 1 or bit 4 set, otherwise report completion.
if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
}
};
// Compiler helper for object rest (`var { a, ...others } = s`): returns a new
// object holding the own enumerable properties of `s` (string and symbol keys)
// whose keys are not listed in `e`.
var __rest = (this && this.__rest) || function (s, e) {
    var remainder = {};
    var isExcluded = function (key) { return e.indexOf(key) >= 0; };
    for (var name in s) {
        if (Object.prototype.hasOwnProperty.call(s, name) && !isExcluded(name)) {
            remainder[name] = s[name];
        }
    }
    // Symbol keys are only considered for non-null sources on runtimes that support them.
    if (s == null || typeof Object.getOwnPropertySymbols !== "function") {
        return remainder;
    }
    var symbols = Object.getOwnPropertySymbols(s);
    for (var k = 0; k < symbols.length; k++) {
        var sym = symbols[k];
        if (!isExcluded(sym) && Object.prototype.propertyIsEnumerable.call(s, sym)) {
            remainder[sym] = s[sym];
        }
    }
    return remainder;
};
// Compiler helper for array spread: returns `to` concatenated with the elements
// of `from`. When `pack` is truthy (or only two arguments are given), holes in
// `from` are turned into explicit undefined entries first.
var __spreadArray = (this && this.__spreadArray) || function (to, from, pack) {
    var dense;
    if (pack || arguments.length === 2) {
        var total = from.length;
        for (var idx = 0; idx < total; idx++) {
            // Start copying at the first hole; from then on every index is written.
            if (dense || !(idx in from)) {
                if (!dense) {
                    dense = Array.prototype.slice.call(from, 0, idx);
                }
                dense[idx] = from[idx];
            }
        }
    }
    return to.concat(dense || Array.prototype.slice.call(from));
};
// Compiler helper for default imports: ES-module namespace objects pass through
// unchanged, anything else is wrapped so it can be read from `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
var index_js_1 = require("@modelcontextprotocol/sdk/server/index.js");
var stdio_js_1 = require("@modelcontextprotocol/sdk/server/stdio.js");
var types_js_1 = require("@modelcontextprotocol/sdk/types.js");
var path_1 = __importDefault(require("path"));
var fs_1 = __importDefault(require("fs"));
var lowdb_1 = __importDefault(require("lowdb"));
var FileSync_js_1 = __importDefault(require("lowdb/adapters/FileSync.js"));
var natural_1 = __importDefault(require("natural"));
var TfIdf = natural_1.default.TfIdf, PorterStemmer = natural_1.default.PorterStemmer, WordTokenizer = natural_1.default.WordTokenizer;
var firecrawl_js_1 = __importDefault(require("@mendable/firecrawl-js"));
var os_1 = __importDefault(require("os"));
var winston_1 = __importDefault(require("winston"));
var pdf_parse_1 = __importDefault(require("pdf-parse"));
var buffer_1 = require("buffer");
// Configure logger
// Environment variables read here:
//   VJDOC_LOG_DIR      directory for log files (default './logs', leading "~" expanded)
//   VJDOC_LOG_LEVEL    minimum level to log (default 'info')
//   VJDOC_LOG_TO_FILE  set to 'false' to disable the file transports
var logDir = expandTildePath(process.env.VJDOC_LOG_DIR || './logs');
// Ensure log directory exists. A failure is reported on stderr but is not
// fatal, so start-up continues with whatever transports still work.
if (!fs_1.default.existsSync(logDir)) {
    try {
        fs_1.default.mkdirSync(logDir, { recursive: true });
    }
    catch (error) {
        console.error("Error creating log directory: ".concat(error));
    }
}
var logLevel = process.env.VJDOC_LOG_LEVEL || 'info';
var logToFile = process.env.VJDOC_LOG_TO_FILE !== 'false';
// Create logger instance
var logger = winston_1.default.createLogger({
    level: logLevel,
    format: winston_1.default.format.combine(winston_1.default.format.timestamp(), winston_1.default.format.json()),
    defaultMeta: { service: 'mcp-vj-docs' },
    transports: [
        new winston_1.default.transports.Console({
            // This file imports the MCP stdio transport, which carries protocol
            // messages over stdout. Console logging on stdout would be mixed
            // into that stream, so every level is routed to stderr instead.
            stderrLevels: ['error', 'warn', 'info', 'http', 'verbose', 'debug', 'silly'],
            format: winston_1.default.format.combine(winston_1.default.format.colorize(), winston_1.default.format.printf(function (info) {
                var timestamp = info.timestamp, level = info.level, message = info.message, meta = __rest(info, ["timestamp", "level", "message"]);
                // Append remaining metadata as pretty-printed JSON only when there is some.
                return "".concat(timestamp, " ").concat(level, ": ").concat(message, " ").concat(Object.keys(meta).length ? JSON.stringify(meta, null, 2) : '');
            }))
        })
    ]
});
// Add file transport if enabled: errors only in error.log, everything in combined.log
if (logToFile) {
    logger.add(new winston_1.default.transports.File({
        filename: path_1.default.join(logDir, 'error.log'),
        level: 'error'
    }));
    logger.add(new winston_1.default.transports.File({
        filename: path_1.default.join(logDir, 'combined.log')
    }));
}
logger.info('Logger initialized', { logLevel: logLevel, logToFile: logToFile, logDir: logDir });
/**
 * Expand a leading "~" in a path to the current user's home directory.
 *
 * Only a bare "~" or a "~" immediately followed by a path separator ("/" on
 * every platform, plus the platform separator) is expanded. The "~username"
 * form is not resolved and is returned unchanged, as is any value that is
 * empty or not a string.
 *
 * @param filePath Path that may start with "~"
 * @returns The expanded path, or the input unchanged when nothing applies
 */
function expandTildePath(filePath) {
    if (!filePath || typeof filePath !== 'string') {
        return filePath;
    }
    var refersToHome = filePath === '~' ||
        filePath.startsWith('~/') ||
        filePath.startsWith('~' + path_1.default.sep);
    if (refersToHome) {
        return path_1.default.join(os_1.default.homedir(), filePath.substring(1));
    }
    return filePath;
}
// Define tools
// Descriptor for the "vjdoc_crawl" tool: name, description and the JSON schema
// of its arguments. Only `url` is required.
var CRAWL_TOOL = (function () {
    // Schema for a single scalar argument of the given JSON type.
    function scalar(type, description) {
        return { type: type, description: description };
    }
    // Schema for an argument that is a list of strings.
    function stringList(description) {
        return { type: "array", items: { type: "string" }, description: description };
    }
    var properties = {
        url: scalar("string", "URL to crawl"),
        maxDepth: scalar("number", "Maximum depth to crawl"),
        maxPages: scalar("number", "Maximum number of pages to crawl"),
        includePatterns: stringList("Patterns to include in crawl"),
        excludePatterns: stringList("Patterns to exclude from crawl"),
        defaultCategory: scalar("string", "Default category for documents if not detected automatically")
    };
    return {
        name: "vjdoc_crawl",
        description: "Crawl a website and index its content for search",
        inputSchema: {
            type: "object",
            properties: properties,
            required: ["url"]
        }
    };
})();
// Descriptor for the "vjdoc_search" tool. `query` is required; `limit`,
// `filters` and `userId` are optional.
var SEARCH_TOOL = (function () {
    // Sub-schema describing the optional `filters` argument.
    var filterProperties = {
        categories: {
            type: "array",
            description: "Filter by document categories (e.g., 'API Documentation', 'Tutorial')",
            items: { type: "string" }
        },
        dateFrom: {
            type: "number",
            description: "Filter documents created after this timestamp (in milliseconds)"
        },
        dateTo: {
            type: "number",
            description: "Filter documents created before this timestamp (in milliseconds)"
        },
        metadata: {
            type: "object",
            description: "Filter by metadata fields (key-value pairs)",
            additionalProperties: true
        }
    };
    var properties = {
        query: { type: "string", description: "Search query" },
        limit: { type: "number", description: "Maximum number of results to return" },
        filters: {
            type: "object",
            description: "Optional filters to narrow down search results",
            properties: filterProperties
        },
        userId: { type: "string", description: "Optional user ID for personalized results" }
    };
    return {
        name: "vjdoc_search",
        description: "Search indexed documents",
        inputSchema: {
            type: "object",
            properties: properties,
            required: ["query"]
        }
    };
})();
// Descriptor for the "vjdoc_add_corpus_file" tool. The schema lists no required
// fields; `content` and `filePath` are described as alternatives to each other.
var ADD_CORPUS_FILE_TOOL = (function () {
    // Every argument of this tool is a described string.
    function text(description) {
        return { type: "string", description: description };
    }
    return {
        name: "vjdoc_add_corpus_file",
        description: "Add a corpus file to the TF-IDF files directory",
        inputSchema: {
            type: "object",
            properties: {
                content: text("Content to add to the corpus file (alternative to filePath)"),
                filePath: text("Absolute path to a file to add to the corpus (alternative to content)"),
                filename: text("Optional filename for the corpus file (without extension)"),
                category: text("Optional category for the corpus file (e.g., 'Code Snippet', 'API Documentation', 'Error Solution', 'Technical Note')"),
                contentType: text("Optional content type for the corpus file (e.g., 'text', 'markdown', 'pdf-base64')")
            },
            required: []
        }
    };
})();
// Descriptor for the "vjdoc_get_docs_meta" tool; its single argument `query`
// is required.
var GET_DOCS_META_TOOL = (function () {
    var querySchema = {
        type: "string",
        description: "Natural language query or requirement"
    };
    var inputSchema = {
        type: "object",
        properties: { query: querySchema },
        required: ["query"]
    };
    return {
        name: "vjdoc_get_docs_meta",
        description: "Get metadata information about all documents and corpus files",
        inputSchema: inputSchema
    };
})();
// Descriptor for the "vjdoc_get_document" tool. The schema declares no
// `required` list; each argument is described as optional when the other is given.
var GET_DOCUMENT_TOOL = (function () {
    var properties = {};
    properties.url = {
        type: "string",
        description: "URL of the document to retrieve (optional if title is provided)"
    };
    properties.title = {
        type: "string",
        description: "Title of the document to retrieve (optional if url is provided)"
    };
    return {
        name: "vjdoc_get_document",
        description: "Get the full content of a specific document by URL or title",
        inputSchema: {
            type: "object",
            properties: properties
        }
    };
})();
// Type guard functions for parameter validation
/**
 * Type guard for crawl tool arguments: true only for a non-null object whose
 * `url` property is a string. The inspected shape is logged at debug level.
 */
function isCrawlArgs(args) {
    var isRecord = args !== null && typeof args === "object";
    var hasUrl = isRecord && "url" in args;
    logger.debug('Validating crawl args', {
        argsType: typeof args,
        args: args,
        isObject: typeof args === "object",
        isNull: args === null,
        hasUrlProp: hasUrl,
        urlType: hasUrl ? typeof args.url : "N/A"
    });
    return hasUrl && typeof args.url === "string";
}
/**
 * Type guard for search tool arguments: requires a string `query`; when
 * present, `limit` must be a number, `filters` must have typeof "object" and
 * `userId` must be a string. The inspected shape is logged at debug level.
 */
function isSearchArgs(args) {
    var isRecord = args !== null && typeof args === "object";
    var hasQuery = isRecord && "query" in args;
    logger.debug('Validating search args', {
        argsType: typeof args,
        args: args,
        isObject: typeof args === "object",
        isNull: args === null,
        hasQueryProp: hasQuery,
        queryType: hasQuery ? typeof args.query : "N/A"
    });
    if (!hasQuery || typeof args.query !== "string") {
        return false;
    }
    // Optional fields are only checked when the key is present on the object.
    var optionalTypes = { limit: "number", filters: "object", userId: "string" };
    return Object.keys(optionalTypes).every(function (field) {
        return !(field in args) || typeof args[field] === optionalTypes[field];
    });
}
/**
 * Type guard for add-corpus-file arguments: needs a non-blank string in
 * `content` or `filePath`, and `filename`, `category` and `contentType` must
 * each be either undefined or a string.
 */
function isAddCorpusFileArgs(args) {
    if (args === null || typeof args !== 'object') {
        return false;
    }
    var isNonBlankString = function (value) {
        return typeof value === 'string' && value.trim() !== '';
    };
    if (!isNonBlankString(args.content) && !isNonBlankString(args.filePath)) {
        return false;
    }
    return ['filename', 'category', 'contentType'].every(function (field) {
        return args[field] === undefined || typeof args[field] === 'string';
    });
}
/**
 * Type guard for get-docs-meta arguments: true only for a non-null object with
 * a string `query` property.
 */
function isGetDocsMetaArgs(args) {
    if (args === null || typeof args !== "object") {
        return false;
    }
    if (!("query" in args)) {
        return false;
    }
    return typeof args.query === "string";
}
/**
 * Type guard for get-document arguments: true for a non-null object carrying a
 * string `url` or a string `title` (either one is enough). The inspected shape
 * is logged at debug level.
 */
function isGetDocumentArgs(args) {
    var isRecord = args !== null && typeof args === "object";
    var hasUrl = isRecord && "url" in args;
    var hasTitle = isRecord && "title" in args;
    logger.debug('Validating get document args', {
        argsType: typeof args,
        args: args,
        isObject: typeof args === "object",
        isNull: args === null,
        hasUrlProp: hasUrl,
        urlType: hasUrl ? typeof args.url : "N/A",
        hasTitleProp: hasTitle,
        titleType: hasTitle ? typeof args.title : "N/A"
    });
    var urlIsString = hasUrl && typeof args.url === "string";
    var titleIsString = hasTitle && typeof args.title === "string";
    return urlIsString || titleIsString;
}
// Utility functions for document metadata extraction
/**
 * Extract keywords from document content by raw term frequency.
 *
 * The content is lower-cased and tokenized; tokens of one or two characters,
 * a small English stop-word list and purely numeric tokens are discarded.
 * The remaining words are ranked by occurrence count, ties keeping
 * first-seen order.
 *
 * @param content Document content
 * @param limit Maximum number of keywords to extract (default 10)
 * @returns Array of keywords, most frequent first; empty for empty content
 */
function extractKeywords(content, limit) {
    if (limit === void 0) { limit = 10; }
    if (!content)
        return [];
    var tokenizer = new WordTokenizer();
    var tokens = tokenizer.tokenize(content.toLowerCase()) || [];
    var stopWords = ['a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'of', 'is', 'are', 'was', 'were'];
    // Count in a Map rather than a plain object: a token such as "constructor"
    // would otherwise read an inherited Object.prototype member as its starting
    // count and end up with a non-numeric frequency that breaks the ranking.
    var wordFreq = new Map();
    tokens.forEach(function (token) {
        if (token.length <= 2 || stopWords.indexOf(token) !== -1 || /^\d+$/.test(token)) {
            return;
        }
        wordFreq.set(token, (wordFreq.get(token) || 0) + 1);
    });
    // Sort by word frequency and return top N
    return Array.from(wordFreq.entries())
        .sort(function (a, b) { return b[1] - a[1]; })
        .slice(0, limit)
        .map(function (entry) { return entry[0]; });
}
/**
 * Generate a summary of document content.
 *
 * The summary is the leading part of the trimmed content. Content that fits
 * within `maxLength` is returned whole, with no ellipsis. Longer content is
 * cut to `maxLength` characters, backed up to the last space so no word is
 * split, and suffixed with "...".
 *
 * @param content Document content
 * @param maxLength Maximum length of summary before the ellipsis (default 150)
 * @returns Summary text; empty string for empty content
 */
function generateSummary(content, maxLength) {
    if (maxLength === void 0) { maxLength = 150; }
    if (!content)
        return '';
    var trimmed = content.trim();
    // Nothing is cut off, so neither word-boundary trimming nor an ellipsis applies.
    if (trimmed.length <= maxLength) {
        return trimmed;
    }
    var summary = trimmed.substring(0, maxLength);
    // Back up to the previous space so the cut does not land inside a word.
    var lastSpaceIndex = summary.lastIndexOf(' ');
    if (lastSpaceIndex > 0) {
        return summary.substring(0, lastSpaceIndex) + '...';
    }
    return summary + '...';
}
// DocumentStorage class
var DocumentStorage = /** @class */ (function () {
/**
 * Create a storage instance. Nothing is opened yet: the database handle and
 * TF-IDF index stay null here. The constructor normalizes the configured
 * paths and creates any missing directories up front.
 *
 * @param options Storage options; `dbPath` is read unconditionally and
 *                `tfidfFilesDir` is optional. A leading "~" is expanded in both.
 * @throws Error when a required directory cannot be created
 */
function DocumentStorage(options) {
    this.db = null;
    this.tfidf = null;
    this.tokenizer = new WordTokenizer();
    // Paragraph records keyed by their index key, for lookup after a search hit
    this.paragraphMap = new Map();
    // Keep every caller-supplied option, overriding only the two path fields
    var expandedPaths = {
        dbPath: expandTildePath(options.dbPath),
        tfidfFilesDir: options.tfidfFilesDir ? expandTildePath(options.tfidfFilesDir) : undefined
    };
    this.options = __assign(__assign({}, options), expandedPaths);
    logger.info("DocumentStorage initialized with dbPath: ".concat(this.options.dbPath, ", tfidfFilesDir: ").concat(this.options.tfidfFilesDir || 'not set'));
    // The database file's parent directory must exist
    var dbDir = path_1.default.dirname(this.options.dbPath);
    logger.info("Ensuring database directory exists: ".concat(dbDir));
    var dbDirMissing = !fs_1.default.existsSync(dbDir);
    if (dbDirMissing) {
        try {
            fs_1.default.mkdirSync(dbDir, { recursive: true });
            logger.info("Created database directory: ".concat(dbDir));
        }
        catch (error) {
            logger.error("Error creating database directory", { error: error, dbDir: dbDir });
            throw new Error("Failed to create database directory: ".concat(error));
        }
    }
    // The corpus directory is optional; with none configured there is nothing left to do
    if (!this.options.tfidfFilesDir) {
        return;
    }
    logger.info("Ensuring TF-IDF files directory exists: ".concat(this.options.tfidfFilesDir));
    if (fs_1.default.existsSync(this.options.tfidfFilesDir)) {
        return;
    }
    try {
        fs_1.default.mkdirSync(this.options.tfidfFilesDir, { recursive: true });
        logger.info("Created TF-IDF files directory: ".concat(this.options.tfidfFilesDir));
    }
    catch (error) {
        logger.error("Error creating TF-IDF files directory", { error: error, dir: this.options.tfidfFilesDir });
        throw new Error("Failed to create TF-IDF files directory: ".concat(error));
    }
}
/**
 * Initialize the database and search index.
 *
 * Opens the lowdb file at `options.dbPath`, writes default empty collections,
 * creates a fresh TF-IDF index, loads corpus files when the configured
 * directory exists, and then indexes every stored document paragraph by
 * paragraph. Any failure is logged and rethrown as
 * "Failed to initialize database: ...".
 */
DocumentStorage.prototype.initialize = function () {
return __awaiter(this, void 0, void 0, function () {
var adapter, documents, error_1;
var _this = this;
return __generator(this, function (_a) {
switch (_a.label) {
case 0:
logger.info("Initializing database from: ".concat(this.options.dbPath));
_a.label = 1;
case 1:
// try region: body starts at label 1, catch handler is label 4, code after the try is label 5
_a.trys.push([1, 4, , 5]);
adapter = new FileSync_js_1.default(this.options.dbPath);
this.db = (0, lowdb_1.default)(adapter);
// Initialize the database with default values if empty
this.db.defaults({ documents: [], links: [], searchHistory: [], interactions: [], userPreferences: [] }).write();
// Initialize TF-IDF
this.tfidf = new TfIdf();
// Skip corpus loading unless a TF-IDF directory is configured and exists on disk
if (!(this.options.tfidfFilesDir && fs_1.default.existsSync(this.options.tfidfFilesDir))) return [3 /*break*/, 3];
return [4 /*yield*/, this.loadTfidfFiles()];
case 2:
_a.sent();
_a.label = 3;
case 3:
documents = this.db.get('documents').value();
if (documents.length > 0) {
logger.info("Indexing ".concat(documents.length, " existing documents"));
documents.forEach(function (doc, index) {
// Process document into paragraphs
var paragraphs = _this.processDocumentParagraphs(doc, index);
// Add each paragraph to the TF-IDF index
paragraphs.forEach(function (paragraph, paragraphId) {
// Key format: doc_<document index>_p<paragraph index>
var docKey = "doc_".concat(index, "_p").concat(paragraphId);
if (_this.tfidf) {
_this.tfidf.addDocument(_this.processTextForSearch(paragraph.content), docKey);
}
// Store paragraph information for later retrieval
_this.paragraphMap.set(docKey, paragraph);
});
});
logger.info("Indexed ".concat(this.paragraphMap.size, " paragraphs for search"));
}
else {
logger.info('No existing documents to index');
}
logger.info('Database and search index initialized successfully');
return [3 /*break*/, 5];
case 4:
// catch handler: log the failure and rethrow it wrapped in a new Error
error_1 = _a.sent();
logger.error('Error initializing database', { error: error_1 });
throw new Error("Failed to initialize database: ".concat(error_1));
case 5: return [2 /*return*/];
}
});
});
};
/**
 * Load corpus files from the TF-IDF files directory into the TF-IDF index.
 *
 * Only files ending in .txt, .md or .pdf are considered. PDFs are parsed and
 * their text converted to Markdown, falling back to the raw extracted text if
 * the conversion throws. Other files are read as UTF-8 and added as-is.
 * A failure on one file is logged and that file is skipped. A failure while
 * listing the directory is logged and ends the load without throwing.
 * Does nothing when no directory is configured or the index is not created yet.
 */
DocumentStorage.prototype.loadTfidfFiles = function () {
return __awaiter(this, void 0, void 0, function () {
var files, _i, files_1, file, pdfBuffer, pdfData, markdownText, pdfError_1, fileContent, error_2, error_3;
var _this = this;
return __generator(this, function (_a) {
switch (_a.label) {
case 0:
if (!this.options.tfidfFilesDir || !this.tfidf) {
return [2 /*return*/];
}
_a.label = 1;
case 1:
// Outer try region (catch handler at label 13) covers the directory listing and the whole loop
_a.trys.push([1, 13, , 14]);
files = fs_1.default.readdirSync(this.options.tfidfFilesDir)
.filter(function (file) { return ['.txt', '.md', '.pdf'].some(function (ext) { return file.endsWith(ext); }); })
.map(function (file) { return path_1.default.join(_this.options.tfidfFilesDir, file); });
if (files.length === 0) {
logger.info("No text files found in TF-IDF directory: ".concat(this.options.tfidfFilesDir));
return [2 /*return*/];
}
logger.info("Loading ".concat(files.length, " text files for TF-IDF calculation"));
_i = 0, files_1 = files;
_a.label = 2;
case 2:
// Loop head: one iteration per file, label 11 is the loop increment
if (!(_i < files_1.length)) return [3 /*break*/, 12];
file = files_1[_i];
_a.label = 3;
case 3:
// Per-file try region (catch handler at label 10) so one bad file does not stop the rest
_a.trys.push([3, 10, , 11]);
if (!file.endsWith('.pdf')) return [3 /*break*/, 8];
pdfBuffer = fs_1.default.readFileSync(file);
_a.label = 4;
case 4:
// PDF parsing try region (catch handler at label 6)
_a.trys.push([4, 6, , 7]);
return [4 /*yield*/, (0, pdf_parse_1.default)(pdfBuffer)];
case 5:
pdfData = _a.sent();
// Use our custom converter to generate markdown
try {
markdownText = this.convertPdfTextToMarkdown(pdfData.text);
this.tfidf.addDocument(markdownText, file);
logger.info("Added PDF (converted to Markdown) to TF-IDF index: ".concat(file));
}
catch (mdError) {
// Fallback to plain text
// NOTE(review): mdError itself is never logged, so a conversion failure is invisible in the logs
this.tfidf.addDocument(pdfData.text, file);
logger.info("Added PDF (plain text) to TF-IDF index: ".concat(file));
}
return [3 /*break*/, 7];
case 6:
pdfError_1 = _a.sent();
logger.error("Error parsing PDF file: ".concat(file), { error: pdfError_1 });
// Skip this file and continue with others
return [3 /*break*/, 11];
case 7: return [3 /*break*/, 9];
case 8:
// Non-PDF files (.txt / .md) are indexed as raw text, keyed by their path
fileContent = fs_1.default.readFileSync(file, 'utf8');
this.tfidf.addDocument(fileContent, file);
logger.info("Added TF-IDF file: ".concat(file));
_a.label = 9;
case 9: return [3 /*break*/, 11];
case 10:
error_2 = _a.sent();
logger.error("Error adding TF-IDF file: ".concat(file), { error: error_2 });
return [3 /*break*/, 11];
case 11:
_i++;
return [3 /*break*/, 2];
case 12: return [3 /*break*/, 14];
case 13:
// Outer catch: log the failure and return normally
error_3 = _a.sent();
logger.error("Error loading TF-IDF files", { error: error_3, dir: this.options.tfidfFilesDir });
return [3 /*break*/, 14];
case 14: return [2 /*return*/];
}
});
});
};
/**
 * Normalize text into index terms: lower-case it, tokenize it with the
 * instance tokenizer, drop tokens of one or two characters, and stem the rest.
 *
 * @param text Raw text
 * @returns Array of stemmed tokens; empty for empty input
 */
DocumentStorage.prototype.processTextForSearch = function (text) {
    if (!text) {
        return [];
    }
    var rawTokens = this.tokenizer.tokenize(text.toLowerCase()) || [];
    var stems = [];
    for (var i = 0; i < rawTokens.length; i++) {
        var candidate = rawTokens[i];
        // Very short tokens are skipped before stemming
        if (candidate.length > 2) {
            stems.push(PorterStemmer.stem(candidate));
        }
    }
    return stems;
};
/**
 * Break a document into paragraph records for indexing.
 * Paragraphs are separated by a blank line ("\n\n"); whitespace-only pieces
 * are dropped and the kept ones are numbered consecutively from 0.
 *
 * @param doc Document to process (its content, url and title are read)
 * @param docIndex Index of the document
 * @returns Array of paragraph records; empty when the document has no content
 */
DocumentStorage.prototype.processDocumentParagraphs = function (doc, docIndex) {
    if (!doc.content) {
        return [];
    }
    var records = [];
    doc.content.split('\n\n').forEach(function (piece) {
        if (piece.trim().length === 0) {
            return;
        }
        records.push({
            docId: docIndex,
            // Position among the kept paragraphs, not among the raw split pieces
            paragraphId: records.length,
            content: piece,
            url: doc.url,
            title: doc.title
        });
    });
    return records;
};
/**
 * Store documents in the database.
 *
 * Appends `documents` (and optional `links`, default empty) to what is already
 * stored, persists both collections, then rebuilds the TF-IDF index and the
 * paragraph map from scratch over all stored documents. Initializes the
 * database first when it has not been opened yet.
 *
 * NOTE(review): new entries are appended without any de-duplication by URL.
 * NOTE(review): the rebuilt index only contains database documents; anything
 * loadTfidfFiles added to the previous index is not re-added here — confirm
 * this is intended.
 */
DocumentStorage.prototype.storeDocuments = function (documents_1) {
return __awaiter(this, arguments, void 0, function (documents, links) {
var existingDocs, existingLinks, updatedDocs, updatedLinks;
var _this = this;
if (links === void 0) { links = []; }
return __generator(this, function (_a) {
switch (_a.label) {
case 0:
// Lazily initialize when the database handle is missing
if (!!this.db) return [3 /*break*/, 2];
logger.info('Database not initialized, initializing now');
return [4 /*yield*/, this.initialize()];
case 1:
_a.sent();
_a.label = 2;
case 2:
if (!this.db) {
logger.error('Failed to initialize database');
throw new Error('Failed to initialize database');
}
existingDocs = this.db.get('documents').value();
existingLinks = this.db.get('links').value();
// Build new arrays: existing entries first, new ones appended
updatedDocs = __spreadArray(__spreadArray([], existingDocs, true), documents, true);
updatedLinks = __spreadArray(__spreadArray([], existingLinks, true), links, true);
// Update database
this.db.set('documents', updatedDocs).write();
this.db.set('links', updatedLinks).write();
// Reset TF-IDF index
this.tfidf = new TfIdf();
this.paragraphMap.clear();
// Index all documents at paragraph level
updatedDocs.forEach(function (doc, docIndex) {
// Process document into paragraphs
var paragraphs = _this.processDocumentParagraphs(doc, docIndex);
// Add each paragraph to the TF-IDF index
paragraphs.forEach(function (paragraph, paragraphId) {
var docKey = "doc_".concat(docIndex, "_p").concat(paragraphId);
if (_this.tfidf) {
// Add document to TF-IDF index
// Note: TfIdf.addDocument accepts either a string or string[] as the first parameter
// and a document identifier (string) as the second parameter
_this.tfidf.addDocument(_this.processTextForSearch(paragraph.content), docKey);
}
// Store paragraph information for later retrieval
_this.paragraphMap.set(docKey, paragraph);
});
});
logger.info("Stored ".concat(documents.length, " documents and ").concat(links.length, " links. Total: ").concat(updatedDocs.length, " documents, ").concat(updatedLinks.length, " links"));
logger.info("Indexed ".concat(this.paragraphMap.size, " paragraphs for search"));
return [2 /*return*/];
}
});
});
};
/**
 * Add multiple documents to the storage.
 *
 * Initializes the database when needed, then awaits `this.addDocument` for
 * each entry one after another, in array order.
 * NOTE(review): addDocument is defined outside this view; its per-document
 * behavior is not documented here.
 */
DocumentStorage.prototype.addDocuments = function (documents) {
return __awaiter(this, void 0, void 0, function () {
var _i, documents_1, doc;
return __generator(this, function (_a) {
switch (_a.label) {
case 0:
// Lazily initialize when the database handle is missing
if (!!this.db) return [3 /*break*/, 2];
logger.info('Database not initialized, initializing now');
return [4 /*yield*/, this.initialize()];
case 1:
_a.sent();
_a.label = 2;
case 2:
if (!this.db) {
logger.error('Failed to initialize database');
throw new Error('Failed to initialize database');
}
_i = 0, documents_1 = documents;
_a.label = 3;
case 3:
// Loop head: each addDocument call is awaited before the next one starts
if (!(_i < documents_1.length)) return [3 /*break*/, 6];
doc = documents_1[_i];
return [4 /*yield*/, this.addDocument(doc)];
case 4:
_a.sent();
_a.label = 5;
case 5:
_i++;
return [3 /*break*/, 3];
case 6:
logger.info("Added ".concat(documents.length, " documents to storage"));
return [2 /*return*/];
}
});
});
};
/**
 * Clear all documents.
 *
 * Empties the persisted 'documents' collection and replaces the TF-IDF index
 * and paragraph map with empty ones. Initializes the database first when it
 * has not been opened yet.
 * NOTE(review): only 'documents' is reset; the other collections (links,
 * searchHistory, interactions, userPreferences) are left untouched — confirm
 * that stale links are acceptable.
 */
DocumentStorage.prototype.clearDocuments = function () {
return __awaiter(this, void 0, void 0, function () {
return __generator(this, function (_a) {
switch (_a.label) {
case 0:
// Lazily initialize when the database handle is missing
if (!!this.db) return [3 /*break*/, 2];
logger.info('Database not initialized, initializing now');
return [4 /*yield*/, this.initialize()];
case 1:
_a.sent();
_a.label = 2;
case 2:
if (!this.db) {
logger.error('Failed to initialize database');
throw new Error('Failed to initialize database');
}
this.db.set('documents', []).write();
this.tfidf = new TfIdf();
this.paragraphMap.clear();
logger.info('Cleared all documents');
return [2 /*return*/];
}
});
});
};
/**
 * Highlight keywords in text using Markdown or HTML.
 *
 * Matching is case-insensitive and limited to whole words, so "cat" is not
 * highlighted inside "category". Longer keywords are tried first so a short
 * keyword cannot pre-empt a longer one that contains it. Empty or non-string
 * keywords are ignored: an empty keyword would otherwise produce a pattern
 * that matches at every word boundary and inject empty highlight markers
 * throughout the text.
 *
 * @param text Text to highlight keywords in
 * @param keywords Keywords to highlight (not modified)
 * @param format Format to use for highlighting ('markdown' or 'html'); any
 *               value other than 'markdown' produces HTML spans
 * @returns Text with highlighted keywords, or the input unchanged when there
 *          is no text or no usable keyword
 */
DocumentStorage.prototype.highlightKeywords = function (text, keywords, format) {
    if (format === void 0) { format = 'markdown'; }
    if (!text || !keywords)
        return text;
    // filter() returns a new array, so the caller's array is never reordered
    var usableKeywords = keywords.filter(function (keyword) {
        return typeof keyword === 'string' && keyword.length > 0;
    });
    if (usableKeywords.length === 0)
        return text;
    // Sort keywords by length (descending) to avoid highlighting parts of longer keywords
    usableKeywords.sort(function (a, b) { return b.length - a.length; });
    var keywordPatterns = usableKeywords.map(function (keyword) {
        // Escape special regex characters so keywords are matched literally
        var escapedKeyword = keyword.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        // Match whole words only
        return "\\b".concat(escapedKeyword, "\\b");
    });
    var combinedPattern = new RegExp(keywordPatterns.join('|'), 'gi');
    // The matched text is re-emitted verbatim, so the original casing is kept
    var wrapMatch = format === 'markdown'
        ? function (match) { return "**".concat(match, "**"); }
        : function (match) { return "<span class=\"highlight\">".concat(match, "</span>"); };
    return text.replace(combinedPattern, wrapMatch);
};
/**
 * Search for documents matching a query.
 *
 * Merges database documents with corpus documents (keyed by URL), applies the
 * optional filters, scores each remaining document, optionally boosts the
 * score from the user's preferences and interaction history, and returns the
 * best-scoring documents.
 *
 * @param query Search query text
 * @param limit Maximum number of results (default 10)
 * @param filters Optional filters: categories, dateFrom, dateTo, metadata
 * @param userId Optional user id; when given, the query is recorded and
 *               personalization is applied
 * @returns Results sorted by descending score, only those with score > 0
 * @throws Error when the database or TF-IDF index is not initialized
 *
 * NOTE(review): recordSearchQuery, getCorpusDocuments, getUserPreferences,
 * getUserSearchHistory, calculateDocumentScore and createSnippet are defined
 * outside this view; their exact behavior is not documented here.
 */
DocumentStorage.prototype.search = function (query_1) {
return __awaiter(this, arguments, void 0, function (query, limit, filters, userId) {
var dbDocuments, corpusDocuments, documentMap, documents, userPreferences, userInteractions, urlScores, queryTerms, similarQueries, userHistory, results;
var _this = this;
if (limit === void 0) { limit = 10; }
return __generator(this, function (_a) {
switch (_a.label) {
case 0:
// Unlike the write methods, search does not lazily initialize; it fails fast instead
if (!this.db || !this.tfidf) {
logger.error('Failed to search: database or TF-IDF not initialized');
throw new Error('Failed to search: database or TF-IDF not initialized');
}
// Record the query only for identified users
if (!userId) return [3 /*break*/, 2];
return [4 /*yield*/, this.recordSearchQuery(userId, query, filters)];
case 1:
_a.sent();
_a.label = 2;
case 2:
dbDocuments = this.db.get('documents').value();
logger.debug("Retrieved ".concat(dbDocuments.length, " documents from database"));
return [4 /*yield*/, this.getCorpusDocuments()];
case 3:
corpusDocuments = _a.sent();
logger.debug("Retrieved ".concat(corpusDocuments.length, " documents from corpus"));
// De-duplicate by URL: later insertions replace earlier ones
documentMap = new Map();
// Add database documents to map
dbDocuments.forEach(function (doc) {
documentMap.set(doc.url, doc);
});
// Add corpus documents to map (will overwrite db documents with same URL if any)
corpusDocuments.forEach(function (doc) {
documentMap.set(doc.url, doc);
});
documents = Array.from(documentMap.values());
logger.debug("Total unique documents after merging: ".concat(documents.length));
if (documents.length === 0) {
logger.info('No documents in database or corpus directory to search');
return [2 /*return*/, []];
}
userPreferences = null;
userInteractions = [];
if (!userId) return [3 /*break*/, 5];
return [4 /*yield*/, this.getUserPreferences(userId)];
case 4:
userPreferences = _a.sent();
// Get recent interactions for this user
// Sorted by timestamp and reversed, so the newest interaction comes first
userInteractions = this.db.get('interactions')
.filter({ userId: userId })
.sortBy('timestamp')
.reverse()
.take(50) // Consider last 50 interactions
.value();
_a.label = 5;
case 5:
urlScores = new Map();
queryTerms = this.tokenizer.tokenize(query.toLowerCase()) || [];
similarQueries = [];
if (!userId) return [3 /*break*/, 7];
return [4 /*yield*/, this.getUserSearchHistory(userId, 20)];
case 6:
userHistory = _a.sent();
// Find similar past queries using simple term overlap
similarQueries = userHistory
.filter(function (entry) {
var entryTerms = _this.tokenizer.tokenize(entry.query.toLowerCase()) || [];
// Calculate term overlap
var overlap = queryTerms.filter(function (term) { return entryTerms.includes(term); }).length;
// Similar when more than 30% of the longer term list is shared
return overlap > 0 && overlap / Math.max(queryTerms.length, entryTerms.length) > 0.3;
})
.map(function (entry) { return entry.query; });
_a.label = 7;
case 7:
// Calculate TF-IDF scores for documents
documents.forEach(function (doc) {
// Skip documents that don't match filters
if (filters) {
// Filter by categories if specified
if (filters.categories && filters.categories.length > 0) {
if (!doc.category || !filters.categories.includes(doc.category)) {
return;
}
}
// Filter by date range if specified
// NOTE(review): documents without a timestamp pass both date checks
if (filters.dateFrom && doc.timestamp && doc.timestamp < filters.dateFrom) {
return;
}
if (filters.dateTo && doc.timestamp && doc.timestamp > filters.dateTo) {
return;
}
// Filter by metadata if specified
// Every requested key must match with strict equality
if (filters.metadata) {
for (var _i = 0, _a = Object.entries(filters.metadata); _i < _a.length; _i++) {
var _b = _a[_i], key = _b[0], value = _b[1];
if (!doc.metadata || doc.metadata[key] !== value) {
return;
}
}
}
}
// Calculate base relevance score
var score = _this.calculateDocumentScore(doc, query);
// Apply personalization if userId is provided
if (userId && userPreferences) {
// Boost score based on user's preferred categories
if (userPreferences.preferredCategories && doc.category &&
userPreferences.preferredCategories.includes(doc.category)) {
score *= 1.2; // 20% boost for preferred categories
}
// Boost score based on past interactions
var docInteractions = userInteractions.filter(function (i) { return i.documentUrl === doc.url; });
if (docInteractions.length > 0) {
// Calculate recency factor (more recent = higher boost)
// Index 0 is the newest because userInteractions is ordered newest-first
var mostRecent = docInteractions[0].timestamp;
var now = Date.now();
var daysSinceInteraction = (now - mostRecent) / (1000 * 60 * 60 * 24);
// Clamped to the range [0.1, 1]
var recencyFactor = Math.max(0.1, Math.min(1, 1 - (daysSinceInteraction / 30))); // Decay over 30 days
// Calculate interaction boost
var interactionBoost_1 = 0;
docInteractions.forEach(function (interaction) {
switch (interaction.interactionType) {
case 'bookmark':
interactionBoost_1 += 0.3; // Strong signal
break;
case 'view':
// Longer views get higher boost
if (interaction.durationMs) {
interactionBoost_1 += Math.min(0.2, interaction.durationMs / 60000 * 0.1); // Up to 0.2 for 2+ minute views
}
else {
interactionBoost_1 += 0.1;
}
break;
case 'click':
interactionBoost_1 += 0.15;
break;
}
});
// Apply interaction boost with recency factor
score *= (1 + interactionBoost_1 * recencyFactor);
}
// Boost score based on similar queries
if (similarQueries.length > 0) {
// Check if document was relevant for similar queries
var relevantForSimilarQueries = userInteractions.some(function (i) {
return i.documentUrl === doc.url && similarQueries.includes(i.query);
});
if (relevantForSimilarQueries) {
score *= 1.15; // 15% boost for documents relevant to similar queries
}
}
}
urlScores.set(doc.url, score);
});
results = [];
Array.from(urlScores.entries()).forEach(function (_a) {
var url = _a[0], score = _a[1];
// NOTE(review): linear scan per scored URL; documentMap above already maps url -> doc
var doc = documents.find(function (d) { return d.url === url; });
// Only include documents with score > 0
if (doc && score > 0) {
// Create snippet with highlighting
var snippet = _this.createSnippet(doc.content, query, true);
// `paragraph` and `fullDocument` both carry the complete document content
results.push({
url: doc.url,
title: doc.title,
snippet: _this.createSnippet(doc.content, query, false),
highlightedSnippet: snippet,
score: score,
category: doc.category,
paragraph: doc.content,
highlightedContent: _this.highlightKeywords(doc.content, queryTerms),
fullDocument: doc.content
});
}
});
// Sort by score (descending)
results.sort(function (a, b) { return b.score - a.score; });
// Return limited results
return [2 /*return*/, results.slice(0, limit)];
}
});
});
};
/**
* Search for documents and aggregate results for LLM consumption
* This method aggregates multiple search results into a single document
* optimized for large language models
*/
DocumentStorage.prototype.searchForLLM = function (query_1) {
return __awaiter(this, arguments, void 0, function (query, limit, filters, userId) {
var searchResults, resultsByDocument, documentContents, documentEntries, i, _a, url, results, topResult, fullDocumentContent, dbDoc, filePath, fileExt, pdfBuffer, pdfData, error_4, error_5, queryTerms, relevantContext, topDocUrl, topDocFullContent, dbDoc, filePath, fileExt, pdfBuffer, pdfData, error_6, error_7, sources, topDocument;
var _b, _c;
if (limit === void 0) { limit = 10; }
return __generator(this, function (_d) {
switch (_d.label) {
case 0: return [4 /*yield*/, this.search(query, limit, filters, userId)];
case 1:
searchResults = _d.sent();
logger.debug("Search returned ".concat(searchResults.length, " results"));
if (searchResults.length === 0) {
return [2 /*return*/, {
content: "No results found for your query.",
sources: []
}];
}
resultsByDocument