@vjlanguage/mcp-vj-docs
Version:
MCP server for documentation crawling, indexing, and retrieval
1,009 lines • 176 kB
JavaScript
#!/usr/bin/env node
"use strict";
// Compiler-emitted helper: shallow-copies own enumerable properties from each
// source argument onto the first argument and returns it. On first use it
// rebinds itself to the resolved implementation (native Object.assign when
// available, otherwise the manual fallback below).
var __assign = (this && this.__assign) || function () {
    var impl = Object.assign;
    if (!impl) {
        impl = function (target) {
            var count = arguments.length;
            for (var idx = 1; idx < count; idx++) {
                var source = arguments[idx];
                for (var key in source) {
                    if (Object.prototype.hasOwnProperty.call(source, key)) {
                        target[key] = source[key];
                    }
                }
            }
            return target;
        };
    }
    __assign = impl;
    return impl.apply(this, arguments);
};
// Compiler-emitted helper that runs a generator function as an async function.
// thisArg/_arguments are applied to `generator`; P is the promise constructor
// (defaults to the global Promise when falsy). Returns a P that resolves with
// the generator's final value or rejects with any error thrown while stepping.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wrap a yielded value in a P unless it already is an instance of P.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with a fulfilled value; a synchronous throw rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Throw a rejection reason into the generator so its own try/catch can handle it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Either finish with the final value, or wait on the yielded value and continue.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// Instantiate the generator and take the first step.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
// Compiler-emitted helper that emulates a generator with a switch-based state
// machine. `body` is called repeatedly with the state object `_` and returns an
// instruction array `[opcode, operand]` which `step` interprets.
//
// State object `_`:
//   label - the state the body's switch dispatches on
//   sent  - returns the value last passed in, or throws it when the incoming op has bit 1 set
//   trys  - stack of active try regions; each entry is indexed as t[0]..t[3] below
//   ops   - stack of ops parked while a region's t[2] label runs
//
// Opcodes handled below: 0/1 (resume with value / with thrown value), 2 (return),
// 3 (jump to label), 4 (yield value), 5 (delegate to inner iterator), 6 (throw),
// 7 (pop a parked op and its try region).
var __generator = (this && this.__generator) || function (thisArg, body) {
var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g = Object.create((typeof Iterator === "function" ? Iterator : Object).prototype);
// Expose next/throw/return (ops 0/1/2) and make the object its own iterator where Symbol exists.
return g.next = verb(0), g["throw"] = verb(1), g["return"] = verb(2), typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
function verb(n) { return function (v) { return step([n, v]); }; }
function step(op) {
// `f` is the re-entrancy flag: set while the body is running.
if (f) throw new TypeError("Generator is already executing.");
// Loop until the body yields (op 4) or the machine is finished (`_` set to 0).
while (g && (g = 0, op[0] && (_ = 0)), _) try {
// While delegating (`y` set), forward the op to the inner iterator; return its result if it is not done.
if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
// Inner iterator finished: continue with its final value.
if (y = 0, t) op = [op[0] & 2, t.value];
switch (op[0]) {
// Resume: remember the op so `_.sent()` can return or throw its operand.
case 0: case 1: t = op; break;
// Yield: advance the label and hand the value to the caller.
case 4: _.label++; return { value: op[1], done: false };
// Delegate: start forwarding to the iterator in op[1].
case 5: _.label++; y = op[1]; op = [0]; continue;
// Resume the op that was parked on `_.ops` and drop the finished try region.
case 7: op = _.ops.pop(); _.trys.pop(); continue;
default:
// No active try region and the op is a throw (6) or return (2): the machine is done.
if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
// Jump (3) with no region, or to a label strictly between t[0] and t[3]: just set the label.
if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
// Throw (6) before the region's t[1] label: enter t[1] and keep the op for `_.sent()`.
if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
// Otherwise, if before the region's t[2] label: park the op and run from t[2].
if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
if (t[2]) _.ops.pop();
_.trys.pop(); continue;
}
op = body.call(thisArg, _);
// A throw from the body becomes a throw op (6); the flag and scratch `t` are always reset.
} catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
// Ops with bit 0 or bit 2 set (1, 5, 6, ...) surface as a thrown operand; otherwise finish.
if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
}
};
// Compiler-emitted helper for object rest destructuring: returns a new object
// holding the own enumerable string- and symbol-keyed properties of `s` whose
// keys are not listed in `e`.
var __rest = (this && this.__rest) || function (s, e) {
    var picked = {};
    for (var key in s) {
        if (!Object.prototype.hasOwnProperty.call(s, key)) {
            continue;
        }
        if (e.indexOf(key) >= 0) {
            continue;
        }
        picked[key] = s[key];
    }
    if (s != null && typeof Object.getOwnPropertySymbols === "function") {
        var symbols = Object.getOwnPropertySymbols(s);
        for (var idx = 0; idx < symbols.length; idx++) {
            var sym = symbols[idx];
            if (e.indexOf(sym) < 0 && Object.prototype.propertyIsEnumerable.call(s, sym)) {
                picked[sym] = s[sym];
            }
        }
    }
    return picked;
};
// Compiler-emitted helper for array spread: returns `to` concatenated with the
// elements of `from`. When `pack` is truthy (or only two arguments are given),
// holes in `from` are materialised as explicit undefined entries.
var __spreadArray = (this && this.__spreadArray) || function (to, from, pack) {
    var dense;
    if (pack || arguments.length === 2) {
        var total = from.length;
        for (var idx = 0; idx < total; idx++) {
            // Start copying lazily, only once the first hole is found.
            if (dense || !(idx in from)) {
                if (!dense) {
                    dense = Array.prototype.slice.call(from, 0, idx);
                }
                dense[idx] = from[idx];
            }
        }
    }
    return to.concat(dense || Array.prototype.slice.call(from));
};
// Compiler-emitted interop helper: modules flagged with `__esModule` are
// returned untouched; anything else is wrapped so it is reachable as `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
var express_1 = __importDefault(require("express"));
var index_js_1 = require("@modelcontextprotocol/sdk/server/index.js");
var stdio_js_1 = require("@modelcontextprotocol/sdk/server/stdio.js");
var sse_js_1 = require("@modelcontextprotocol/sdk/server/sse.js");
var streamableHttp_js_1 = require("@modelcontextprotocol/sdk/server/streamableHttp.js");
var types_js_1 = require("@modelcontextprotocol/sdk/types.js");
var path_1 = __importDefault(require("path"));
var fs_1 = __importDefault(require("fs"));
var lowdb_1 = __importDefault(require("lowdb"));
var FileSync_js_1 = __importDefault(require("lowdb/adapters/FileSync.js"));
var natural_1 = __importDefault(require("natural"));
var TfIdf = natural_1.default.TfIdf, PorterStemmer = natural_1.default.PorterStemmer, WordTokenizer = natural_1.default.WordTokenizer;
var firecrawl_js_1 = __importDefault(require("@mendable/firecrawl-js"));
var os_1 = __importDefault(require("os"));
var winston_1 = __importDefault(require("winston"));
var pdf_parse_1 = __importDefault(require("pdf-parse"));
var buffer_1 = require("buffer");
var date_fns_1 = require("date-fns");
// ---------------------------------------------------------------------------
// Process-level configuration and logger bootstrap (runs at module load).
// ---------------------------------------------------------------------------

// Directory for log files; a leading "~" is expanded by expandTildePath.
var logDir = expandTildePath(process.env.VJDOC_LOG_DIR || './logs');
if (!fs_1.default.existsSync(logDir)) {
    try {
        fs_1.default.mkdirSync(logDir, { recursive: true });
    }
    catch (error) {
        // Best effort: the winston logger does not exist yet, so report on stderr and continue.
        console.error("Error creating log directory: ".concat(error));
    }
}

// Logging switches. File logging is on unless VJDOC_LOG_TO_FILE is exactly 'false'.
var logLevel = process.env.VJDOC_LOG_LEVEL || 'info';
var logToFile = process.env.VJDOC_LOG_TO_FILE !== 'false';

// Transport switches: stdio is on unless explicitly disabled; the HTTP-based
// transports are off unless explicitly enabled.
var enableStdioTransport = process.env.ENABLE_STDIO_TRANSPORT !== 'false';
var enableStreamableHttp = process.env.ENABLE_STREAMABLE_HTTP === 'true';
var enableLegacySse = process.env.ENABLE_LEGACY_SSE === 'true';
var streamableHttpPort = parseInt(process.env.STREAMABLE_HTTP_PORT || '3000', 10);
var legacySsePort = parseInt(process.env.LEGACY_SSE_PORT || '3001', 10);

// NOTE: the full process environment is deliberately NOT printed here.
// process.env commonly carries credentials (API keys, tokens), and writing it
// to stdout also injects non-protocol output into the stream used when the
// stdio transport is enabled (the default above).

// Stamps every log record with a local "yyyy-MM-dd HH:mm:ss" timestamp.
var timestampFormat = winston_1.default.format(function (info) {
    info.timestamp = (0, date_fns_1.format)(new Date(), 'yyyy-MM-dd HH:mm:ss');
    return info;
})();

// Application logger: JSON records by default, with a colourised
// "<timestamp> <level>: <message> <meta-json>" line format on the console.
var logger = winston_1.default.createLogger({
    level: logLevel,
    format: winston_1.default.format.combine(timestampFormat, winston_1.default.format.json()),
    defaultMeta: { service: 'mcp-vj-docs' },
    transports: [
        new winston_1.default.transports.Console({
            format: winston_1.default.format.combine(winston_1.default.format.colorize(), winston_1.default.format.printf(function (info) {
                var timestamp = info.timestamp, level = info.level, message = info.message, meta = __rest(info, ["timestamp", "level", "message"]);
                return "".concat(timestamp, " ").concat(level, ": ").concat(message, " ").concat(Object.keys(meta).length ? JSON.stringify(meta, null, 2) : '');
            }))
        })
    ]
});

// Optional file transports: errors only in error.log, everything in combined.log.
if (logToFile) {
    logger.add(new winston_1.default.transports.File({
        filename: path_1.default.join(logDir, 'error.log'),
        level: 'error'
    }));
    logger.add(new winston_1.default.transports.File({
        filename: path_1.default.join(logDir, 'combined.log')
    }));
}
logger.info('Logger initialized', { logLevel: logLevel, logToFile: logToFile, logDir: logDir });
/**
 * Expand a leading "~" in a path to the current user's home directory.
 *
 * Only the bare "~" and the "~/..." forms are expanded. "~user/..." paths and
 * everything else are returned unchanged, as are empty or non-string inputs.
 *
 * @param {*} filePath - Path to expand.
 * @returns {*} The expanded path, or the input itself when nothing applies.
 */
function expandTildePath(filePath) {
    if (!filePath || typeof filePath !== 'string') {
        return filePath;
    }
    if (filePath === '~' || filePath.startsWith('~/')) {
        // Drop the "~" and let path.join normalise the remainder onto the home directory.
        return path_1.default.join(os_1.default.homedir(), filePath.substring(1));
    }
    // "~user" expansion is not supported; such paths pass through untouched.
    return filePath;
}
// Descriptor for the "vjdoc_crawl" tool: name, human-readable description and a
// JSON-Schema-style description of its arguments. Only `url` is required.
var CRAWL_TOOL = (function () {
    // Scalar argument of the given JSON type.
    function field(type, description) {
        return { type: type, description: description };
    }
    // Argument holding a list of pattern strings.
    function patternList(description) {
        return { type: "array", items: { type: "string" }, description: description };
    }
    return {
        name: "vjdoc_crawl",
        description: "Crawl a website and index its content for search",
        inputSchema: {
            type: "object",
            properties: {
                url: field("string", "URL to crawl"),
                maxDepth: field("number", "Maximum depth to crawl"),
                maxPages: field("number", "Maximum number of pages to crawl"),
                includePatterns: patternList("Patterns to include in crawl"),
                excludePatterns: patternList("Patterns to exclude from crawl"),
                defaultCategory: field("string", "Default category for documents if not detected automatically")
            },
            required: ["url"]
        }
    };
})();
// Descriptor for the "vjdoc_search" tool. `query` is required; `limit`,
// `filters` (categories, date range in milliseconds, metadata) and `userId`
// are optional.
var SEARCH_TOOL = (function () {
    // Scalar argument of the given JSON type.
    function field(type, description) {
        return { type: type, description: description };
    }
    // Sub-schema describing the optional `filters` object.
    var filterProperties = {
        categories: {
            type: "array",
            description: "Filter by document categories (e.g., 'API Documentation', 'Tutorial')",
            items: { type: "string" }
        },
        dateFrom: field("number", "Filter documents created after this timestamp (in milliseconds)"),
        dateTo: field("number", "Filter documents created before this timestamp (in milliseconds)"),
        metadata: {
            type: "object",
            description: "Filter by metadata fields (key-value pairs)",
            additionalProperties: true
        }
    };
    return {
        name: "vjdoc_search",
        description: "Search indexed documents",
        inputSchema: {
            type: "object",
            properties: {
                query: field("string", "Search query"),
                limit: field("number", "Maximum number of results to return"),
                filters: {
                    type: "object",
                    description: "Optional filters to narrow down search results",
                    properties: filterProperties
                },
                userId: field("string", "Optional user ID for personalized results")
            },
            required: ["query"]
        }
    };
})();
// Descriptor for the "vjdoc_add_corpus_file" tool. Every argument is an
// optional string at schema level (`required` is empty).
var ADD_CORPUS_FILE_TOOL = (function () {
    // [argument name, description] pairs, in schema order.
    var stringFields = [
        ["content", "Content to add to the corpus file (alternative to filePath)"],
        ["filePath", "Absolute path to a file to add to the corpus (alternative to content)"],
        ["filename", "Optional filename for the corpus file (without extension)"],
        ["category", "Optional category for the corpus file (e.g., 'Code Snippet', 'API Documentation', 'Error Solution', 'Technical Note')"],
        ["contentType", "Optional content type for the corpus file (e.g., 'text', 'markdown', 'pdf-base64')"]
    ];
    var properties = {};
    stringFields.forEach(function (pair) {
        properties[pair[0]] = { type: "string", description: pair[1] };
    });
    return {
        name: "vjdoc_add_corpus_file",
        description: "Add a corpus file to the TF-IDF files directory",
        inputSchema: {
            type: "object",
            properties: properties,
            required: []
        }
    };
})();
// Descriptor for the "vjdoc_get_docs_meta" tool; takes one required string
// argument, `query`.
var GET_DOCS_META_TOOL = (function () {
    var queryField = {
        type: "string",
        description: "Natural language query or requirement"
    };
    var schema = {
        type: "object",
        properties: { query: queryField },
        required: ["query"]
    };
    return {
        name: "vjdoc_get_docs_meta",
        description: "Get metadata information about all documents and corpus files",
        inputSchema: schema
    };
})();
// Descriptor for the "vjdoc_get_document" tool. The schema lists `url` and
// `title` as strings and declares no `required` list; the descriptions state
// that either one may be supplied.
var GET_DOCUMENT_TOOL = (function () {
    function stringField(description) {
        return { type: "string", description: description };
    }
    return {
        name: "vjdoc_get_document",
        description: "Get the full content of a specific document by URL or title",
        inputSchema: {
            type: "object",
            properties: {
                url: stringField("URL of the document to retrieve (optional if title is provided)"),
                title: stringField("Title of the document to retrieve (optional if url is provided)")
            }
        }
    };
})();
/**
 * Check that `args` is a non-null object carrying a string `url` property.
 * The inspected shape is written to the debug log before validation.
 *
 * @param {*} args - Candidate tool arguments.
 * @returns {boolean} True when `args.url` exists and is a string.
 */
function isCrawlArgs(args) {
    var isNonNullObject = args !== null && typeof args === "object";
    var hasUrl = isNonNullObject && "url" in args;
    logger.debug('Validating crawl args', {
        argsType: typeof args,
        args: args,
        isObject: typeof args === "object",
        isNull: args === null,
        hasUrlProp: hasUrl,
        urlType: hasUrl ? typeof args.url : "N/A"
    });
    if (!hasUrl) {
        return false;
    }
    return typeof args.url === "string";
}
/**
 * Check that `args` is a non-null object with a string `query` and, when
 * present, a number `limit`, an object-typed `filters` and a string `userId`.
 * The inspected shape is written to the debug log before validation.
 *
 * @param {*} args - Candidate tool arguments.
 * @returns {boolean} True when every present field has the expected type.
 */
function isSearchArgs(args) {
    var isNonNullObject = args !== null && typeof args === "object";
    var hasQuery = isNonNullObject && "query" in args;
    logger.debug('Validating search args', {
        argsType: typeof args,
        args: args,
        isObject: typeof args === "object",
        isNull: args === null,
        hasQueryProp: hasQuery,
        queryType: hasQuery ? typeof args.query : "N/A"
    });
    if (!hasQuery || typeof args.query !== "string") {
        return false;
    }
    if ("limit" in args && typeof args.limit !== "number") {
        return false;
    }
    // typeof-based check: any value whose typeof is "object" is accepted here.
    if ("filters" in args && typeof args.filters !== "object") {
        return false;
    }
    if ("userId" in args && typeof args.userId !== "string") {
        return false;
    }
    return true;
}
/**
 * Check that `args` is a non-null object providing a non-blank string
 * `content` or a non-blank string `filePath`, and that `filename`, `category`
 * and `contentType` are each either undefined or a string.
 *
 * @param {*} args - Candidate tool arguments.
 * @returns {boolean} True when the arguments are acceptable.
 */
function isAddCorpusFileArgs(args) {
    if (args === null || typeof args !== 'object') {
        return false;
    }
    function isNonBlankString(value) {
        return typeof value === 'string' && value.trim() !== '';
    }
    function isOptionalString(value) {
        return value === undefined || typeof value === 'string';
    }
    if (!isNonBlankString(args.content) && !isNonBlankString(args.filePath)) {
        return false;
    }
    return isOptionalString(args.filename) &&
        isOptionalString(args.category) &&
        isOptionalString(args.contentType);
}
/**
 * Check that `args` is a non-null object carrying a string `query` property.
 *
 * @param {*} args - Candidate tool arguments.
 * @returns {boolean} True when `args.query` exists and is a string.
 */
function isGetDocsMetaArgs(args) {
    if (args === null || typeof args !== "object") {
        return false;
    }
    if (!("query" in args)) {
        return false;
    }
    return typeof args.query === "string";
}
/**
 * Check that `args` is a non-null object with a string `url` or a string
 * `title`. The inspected shape is written to the debug log before validation.
 *
 * @param {*} args - Candidate tool arguments.
 * @returns {boolean} True when at least one of the two identifiers is a string.
 */
function isGetDocumentArgs(args) {
    var isNonNullObject = args !== null && typeof args === "object";
    var hasUrl = isNonNullObject && "url" in args;
    var hasTitle = isNonNullObject && "title" in args;
    logger.debug('Validating get document args', {
        argsType: typeof args,
        args: args,
        isObject: typeof args === "object",
        isNull: args === null,
        hasUrlProp: hasUrl,
        urlType: hasUrl ? typeof args.url : "N/A",
        hasTitleProp: hasTitle,
        titleType: hasTitle ? typeof args.title : "N/A"
    });
    if (!isNonNullObject) {
        return false;
    }
    if (hasUrl && typeof args.url === "string") {
        return true;
    }
    return hasTitle && typeof args.title === "string";
}
/**
 * Return the most frequent words of `content`, most frequent first.
 *
 * The text is lower-cased and tokenized with WordTokenizer; tokens of two
 * characters or fewer, a small English stop-word list and purely numeric
 * tokens are ignored. Ties keep first-occurrence order.
 *
 * @param {string} content - Text to analyse.
 * @param {number} [limit=10] - Maximum number of keywords returned.
 * @returns {string[]} Up to `limit` keywords; empty when `content` is falsy.
 */
function extractKeywords(content, limit) {
    if (limit === void 0) { limit = 10; }
    if (!content)
        return [];
    var tokenizer = new WordTokenizer();
    var tokens = tokenizer.tokenize(content.toLowerCase()) || [];
    var stopWords = new Set(['a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'of', 'is', 'are', 'was', 'were']);
    // A Map (not a plain object) holds the counts so that tokens which are also
    // Object.prototype member names, e.g. "constructor", start from 0 instead
    // of picking up the inherited value and corrupting the ranking.
    var wordFreq = new Map();
    tokens.forEach(function (token) {
        if (token.length <= 2 || stopWords.has(token) || /^\d+$/.test(token)) {
            return;
        }
        wordFreq.set(token, (wordFreq.get(token) || 0) + 1);
    });
    return Array.from(wordFreq.entries())
        .sort(function (a, b) { return b[1] - a[1]; })
        .slice(0, limit)
        .map(function (entry) { return entry[0]; });
}
/**
 * Produce a short plain-text summary of `content`.
 *
 * The text is trimmed first. If it fits within `maxLength` it is returned
 * as-is; otherwise it is cut to `maxLength` characters, backed up to the last
 * space when there is one, and "..." is appended.
 *
 * @param {string} content - Text to summarise.
 * @param {number} [maxLength=150] - Maximum number of characters kept before the ellipsis.
 * @returns {string} The summary; '' when `content` is falsy.
 */
function generateSummary(content, maxLength) {
    if (maxLength === void 0) { maxLength = 150; }
    if (!content)
        return '';
    var trimmed = content.trim();
    // Decide on truncation against the trimmed text, so surrounding whitespace
    // or an exact-length fit never produces a spurious ellipsis.
    if (trimmed.length <= maxLength) {
        return trimmed;
    }
    var summary = trimmed.substring(0, maxLength);
    var lastSpaceIndex = summary.lastIndexOf(' ');
    if (lastSpaceIndex > 0) {
        // Avoid ending in the middle of a word.
        return summary.substring(0, lastSpaceIndex) + '...';
    }
    return summary + '...';
}
var DocumentStorage = (function () {
/**
 * Create a storage instance. Nothing is opened here: `db` and `tfidf` stay
 * null. The constructor normalises the configured paths through
 * expandTildePath and makes sure the database directory and, when configured,
 * the TF-IDF files directory exist.
 *
 * @param {Object} options - Storage options; `dbPath` is read, `tfidfFilesDir` is optional.
 * @throws {Error} When a required directory cannot be created.
 */
function DocumentStorage(options) {
    this.db = null;
    this.tfidf = null;
    this.tokenizer = new WordTokenizer();
    this.paragraphMap = new Map();

    // Copy the caller's options, then overwrite the two path fields with their expanded forms.
    var resolved = __assign({}, options);
    resolved.dbPath = expandTildePath(options.dbPath);
    resolved.tfidfFilesDir = options.tfidfFilesDir ? expandTildePath(options.tfidfFilesDir) : undefined;
    this.options = resolved;

    var tfidfDir = resolved.tfidfFilesDir;
    logger.info("DocumentStorage initialized with dbPath: " + resolved.dbPath + ", tfidfFilesDir: " + (tfidfDir || 'not set'));

    var dbDir = path_1.default.dirname(resolved.dbPath);
    logger.info("Ensuring database directory exists: " + dbDir);
    if (!fs_1.default.existsSync(dbDir)) {
        try {
            fs_1.default.mkdirSync(dbDir, { recursive: true });
            logger.info("Created database directory: " + dbDir);
        }
        catch (error) {
            logger.error("Error creating database directory", { error: error, dbDir: dbDir });
            throw new Error("Failed to create database directory: ".concat(error));
        }
    }

    if (!tfidfDir) {
        return;
    }
    logger.info("Ensuring TF-IDF files directory exists: " + tfidfDir);
    if (fs_1.default.existsSync(tfidfDir)) {
        return;
    }
    try {
        fs_1.default.mkdirSync(tfidfDir, { recursive: true });
        logger.info("Created TF-IDF files directory: " + tfidfDir);
    }
    catch (error) {
        logger.error("Error creating TF-IDF files directory", { error: error, dir: tfidfDir });
        throw new Error("Failed to create TF-IDF files directory: ".concat(error));
    }
}
// Open the lowdb database at options.dbPath, seed its default collections,
// create a fresh TF-IDF index, load corpus files when a TF-IDF directory is
// configured and exists, and index every stored document paragraph by paragraph.
// Returns a promise; any failure is logged and rethrown as
// "Failed to initialize database: <error>".
DocumentStorage.prototype.initialize = function () {
return __awaiter(this, void 0, void 0, function () {
var adapter, documents, error_1;
var _this = this;
return __generator(this, function (_a) {
switch (_a.label) {
case 0:
logger.info("Initializing database from: ".concat(this.options.dbPath));
_a.label = 1;
case 1:
// try region: body starts at label 1, catch handler at label 4, continues at label 5.
_a.trys.push([1, 4, , 5]);
adapter = new FileSync_js_1.default(this.options.dbPath);
this.db = (0, lowdb_1.default)(adapter);
// Make sure every collection exists before anything reads it.
this.db.defaults({ documents: [], links: [], searchHistory: [], interactions: [], userPreferences: [] }).write();
this.tfidf = new TfIdf();
// Skip corpus loading (jump to label 3) unless the TF-IDF directory is configured and present.
if (!(this.options.tfidfFilesDir && fs_1.default.existsSync(this.options.tfidfFilesDir))) return [3, 3];
return [4, this.loadTfidfFiles()];
case 2:
_a.sent();
_a.label = 3;
case 3:
documents = this.db.get('documents').value();
if (documents.length > 0) {
logger.info("Indexing ".concat(documents.length, " existing documents"));
documents.forEach(function (doc, index) {
var paragraphs = _this.processDocumentParagraphs(doc, index);
paragraphs.forEach(function (paragraph, paragraphId) {
// Key format: doc_<document index>_p<paragraph index>.
var docKey = "doc_".concat(index, "_p").concat(paragraphId);
if (_this.tfidf) {
_this.tfidf.addDocument(_this.processTextForSearch(paragraph.content), docKey);
}
_this.paragraphMap.set(docKey, paragraph);
});
});
logger.info("Indexed ".concat(this.paragraphMap.size, " paragraphs for search"));
}
else {
logger.info('No existing documents to index');
}
logger.info('Database and search index initialized successfully');
return [3, 5];
case 4:
// catch handler for the region opened at label 1.
error_1 = _a.sent();
logger.error('Error initializing database', { error: error_1 });
throw new Error("Failed to initialize database: ".concat(error_1));
case 5: return [2];
}
});
});
};
/**
 * Collect, depth-first, the paths of all regular files under `dir` whose name
 * ends with .txt, .md or .pdf (compared case-insensitively).
 *
 * @param {string} dir - Directory to walk.
 * @returns {string[]} Matching file paths in traversal order.
 */
DocumentStorage.prototype.scanFilesRecursive = function (dir) {
    var supportedExts = ['.txt', '.md', '.pdf'];
    var found = [];
    function hasSupportedExt(name) {
        return supportedExts.some(function (ext) { return name.toLowerCase().endsWith(ext); });
    }
    function visit(currentDir) {
        var entries = fs_1.default.readdirSync(currentDir, { withFileTypes: true });
        entries.forEach(function (entry) {
            var fullPath = path_1.default.join(currentDir, entry.name);
            if (entry.isDirectory()) {
                visit(fullPath);
                return;
            }
            if (entry.isFile() && hasSupportedExt(entry.name)) {
                found.push(fullPath);
            }
        });
    }
    visit(dir);
    return found;
};
// Load every supported file found under options.tfidfFilesDir into the TF-IDF
// index, keyed by file path. PDFs are parsed with pdf-parse and converted to
// Markdown when possible (falling back to the raw extracted text); other files
// are read as UTF-8. Failures are logged per file and never abort the load.
// Does nothing when no directory is configured or no index exists.
// Returns a promise that resolves once all files have been processed.
DocumentStorage.prototype.loadTfidfFiles = function () {
    return __awaiter(this, void 0, void 0, function () {
        var files, _i, files_1, file, pdfBuffer, pdfData, markdownText, pdfError_1, fileContent, error_2, error_3;
        return __generator(this, function (_a) {
            switch (_a.label) {
                case 0:
                    if (!this.options.tfidfFilesDir || !this.tfidf) {
                        return [2];
                    }
                    _a.label = 1;
                case 1:
                    // Outer try: scan/loop errors are handled at label 13.
                    _a.trys.push([1, 13, , 14]);
                    files = this.scanFilesRecursive(this.options.tfidfFilesDir);
                    if (files.length === 0) {
                        logger.info("No text files found in TF-IDF directory: ".concat(this.options.tfidfFilesDir));
                        return [2];
                    }
                    logger.info("Loading ".concat(files.length, " text files for TF-IDF calculation"));
                    _i = 0, files_1 = files;
                    _a.label = 2;
                case 2:
                    // Loop head: one iteration per discovered file.
                    if (!(_i < files_1.length)) return [3, 12];
                    file = files_1[_i];
                    _a.label = 3;
                case 3:
                    // Per-file try: errors are handled at label 10.
                    _a.trys.push([3, 10, , 11]);
                    // scanFilesRecursive matches extensions case-insensitively, so the PDF
                    // test must too; otherwise "X.PDF" would be read as UTF-8 text.
                    if (!file.toLowerCase().endsWith('.pdf')) return [3, 8];
                    pdfBuffer = fs_1.default.readFileSync(file);
                    _a.label = 4;
                case 4:
                    // PDF-parse try: errors are handled at label 6.
                    _a.trys.push([4, 6, , 7]);
                    return [4, (0, pdf_parse_1.default)(pdfBuffer)];
                case 5:
                    pdfData = _a.sent();
                    try {
                        markdownText = this.convertPdfTextToMarkdown(pdfData.text);
                        this.tfidf.addDocument(markdownText, file);
                        logger.info("Added PDF (converted to Markdown) to TF-IDF index: ".concat(file));
                    }
                    catch (mdError) {
                        // Deliberate fallback: index the raw extracted text instead.
                        this.tfidf.addDocument(pdfData.text, file);
                        logger.info("Added PDF (plain text) to TF-IDF index: ".concat(file));
                    }
                    return [3, 7];
                case 6:
                    pdfError_1 = _a.sent();
                    logger.error("Error parsing PDF file: ".concat(file), { error: pdfError_1 });
                    // Skip this file and continue with the next one.
                    return [3, 11];
                case 7: return [3, 9];
                case 8:
                    fileContent = fs_1.default.readFileSync(file, 'utf8');
                    this.tfidf.addDocument(fileContent, file);
                    logger.info("Added TF-IDF file: ".concat(file));
                    _a.label = 9;
                case 9: return [3, 11];
                case 10:
                    error_2 = _a.sent();
                    logger.error("Error adding TF-IDF file: ".concat(file), { error: error_2 });
                    return [3, 11];
                case 11:
                    _i++;
                    return [3, 2];
                case 12: return [3, 14];
                case 13:
                    error_3 = _a.sent();
                    logger.error("Error loading TF-IDF files", { error: error_3, dir: this.options.tfidfFilesDir });
                    return [3, 14];
                case 14: return [2];
            }
        });
    });
};
/**
 * Turn raw text into index/search tokens.
 *
 * Text without characters in U+4E00..U+9FA5: lower-case, tokenize, drop
 * tokens of two characters or fewer, then Porter-stem each token.
 *
 * Text with such characters: every run of them contributes all of its 1-, 2-
 * and 3-character substrings (in that order) plus the whole run when it is at
 * most 10 characters long; the remaining text is lower-cased and tokenized,
 * keeping only tokens longer than two characters. These tokens are not stemmed.
 *
 * @param {string} text - Text to tokenize.
 * @returns {string[]} Tokens; empty when `text` is falsy.
 */
DocumentStorage.prototype.processTextForSearch = function (text) {
    if (!text) {
        return [];
    }
    var hanChar = /[\u4e00-\u9fa5]/;
    if (!hanChar.test(text)) {
        var words = this.tokenizer.tokenize(text.toLowerCase()) || [];
        var stems = [];
        for (var w = 0; w < words.length; w++) {
            if (words[w].length > 2) {
                stems.push(PorterStemmer.stem(words[w]));
            }
        }
        return stems;
    }
    var hanTokens = [];
    var runs = text.match(/[\u4e00-\u9fa5]+/g) || [];
    runs.forEach(function (run) {
        // n-grams of size 1, 2, 3 - all of one size before the next.
        for (var size = 1; size <= 3; size++) {
            for (var start = 0; start + size <= run.length; start++) {
                hanTokens.push(run.substring(start, start + size));
            }
        }
        if (run.length <= 10) {
            hanTokens.push(run);
        }
    });
    var remainder = text.replace(/[\u4e00-\u9fa5]+/g, ' ');
    var otherTokens = this.tokenizer.tokenize(remainder.toLowerCase()) || [];
    return hanTokens.concat(otherTokens).filter(function (token) {
        return /[\u4e00-\u9fa5]/.test(token) || token.length > 2;
    });
};
/**
 * Split a document's content on blank lines ("\n\n") into paragraph records.
 * Whitespace-only chunks are dropped; paragraph ids count the kept chunks from 0.
 *
 * @param {Object} doc - Document; `content`, `url` and `title` are read.
 * @param {number} docIndex - Stored on each record as `docId`.
 * @returns {Object[]} Records of { docId, paragraphId, content, url, title }.
 */
DocumentStorage.prototype.processDocumentParagraphs = function (doc, docIndex) {
    var records = [];
    if (!doc.content) {
        return records;
    }
    var chunks = doc.content.split('\n\n');
    for (var idx = 0; idx < chunks.length; idx++) {
        var chunk = chunks[idx];
        if (chunk.trim().length === 0) {
            continue;
        }
        records.push({
            docId: docIndex,
            paragraphId: records.length,
            content: chunk,
            url: doc.url,
            title: doc.title
        });
    }
    return records;
};
// Append `documents` (and optional `links`, default []) to the stored
// collections, persist both, then rebuild the paragraph index from scratch over
// ALL stored documents. Initializes the database first when needed.
// Returns a promise; throws 'Failed to initialize database' if no database is
// available after initialization.
// NOTE(review): incoming documents are appended as-is; no de-duplication by URL
// happens in this method.
DocumentStorage.prototype.storeDocuments = function (documents_1) {
return __awaiter(this, arguments, void 0, function (documents, links) {
var existingDocs, existingLinks, updatedDocs, updatedLinks;
var _this = this;
if (links === void 0) { links = []; }
return __generator(this, function (_a) {
switch (_a.label) {
case 0:
// Lazily initialize when the database has not been opened yet.
if (!!this.db) return [3, 2];
logger.info('Database not initialized, initializing now');
return [4, this.initialize()];
case 1:
_a.sent();
_a.label = 2;
case 2:
if (!this.db) {
logger.error('Failed to initialize database');
throw new Error('Failed to initialize database');
}
existingDocs = this.db.get('documents').value();
existingLinks = this.db.get('links').value();
updatedDocs = __spreadArray(__spreadArray([], existingDocs, true), documents, true);
updatedLinks = __spreadArray(__spreadArray([], existingLinks, true), links, true);
this.db.set('documents', updatedDocs).write();
this.db.set('links', updatedLinks).write();
// Start from an empty index and re-add every paragraph of every document.
// NOTE(review): this replaces the TfIdf instance, so entries added to the old
// instance by other code paths (e.g. loadTfidfFiles) are not carried over
// here - confirm whether they are re-added elsewhere.
this.tfidf = new TfIdf();
this.paragraphMap.clear();
updatedDocs.forEach(function (doc, docIndex) {
var paragraphs = _this.processDocumentParagraphs(doc, docIndex);
paragraphs.forEach(function (paragraph, paragraphId) {
// Key format: doc_<document index>_p<paragraph index>.
var docKey = "doc_".concat(docIndex, "_p").concat(paragraphId);
if (_this.tfidf) {
_this.tfidf.addDocument(_this.processTextForSearch(paragraph.content), docKey);
}
_this.paragraphMap.set(docKey, paragraph);
});
});
logger.info("Stored ".concat(documents.length, " documents and ").concat(links.length, " links. Total: ").concat(updatedDocs.length, " documents, ").concat(updatedLinks.length, " links"));
logger.info("Indexed ".concat(this.paragraphMap.size, " paragraphs for search"));
return [2];
}
});
});
};
// Add each document in `documents` by awaiting this.addDocument(doc) one at a
// time, in order (addDocument is defined outside this section). Initializes the
// database first when needed. Returns a promise; throws 'Failed to initialize
// database' if no database is available after initialization. An error from
// addDocument propagates and stops the remaining additions.
DocumentStorage.prototype.addDocuments = function (documents) {
return __awaiter(this, void 0, void 0, function () {
var _i, documents_1, doc;
return __generator(this, function (_a) {
switch (_a.label) {
case 0:
// Lazily initialize when the database has not been opened yet.
if (!!this.db) return [3, 2];
logger.info('Database not initialized, initializing now');
return [4, this.initialize()];
case 1:
_a.sent();
_a.label = 2;
case 2:
if (!this.db) {
logger.error('Failed to initialize database');
throw new Error('Failed to initialize database');
}
_i = 0, documents_1 = documents;
_a.label = 3;
case 3:
// Loop head: labels 3-5 form the sequential for-loop over the documents.
if (!(_i < documents_1.length)) return [3, 6];
doc = documents_1[_i];
return [4, this.addDocument(doc)];
case 4:
_a.sent();
_a.label = 5;
case 5:
_i++;
return [3, 3];
case 6:
logger.info("Added ".concat(documents.length, " documents to storage"));
return [2];
}
});
});
};
// Remove all stored documents: persists an empty 'documents' collection,
// replaces the TF-IDF index with a new empty one and clears the paragraph map.
// Initializes the database first when needed. Returns a promise; throws
// 'Failed to initialize database' if no database is available afterwards.
// NOTE(review): only 'documents' is reset here; the 'links' collection and the
// other collections are left untouched - confirm that is intended.
DocumentStorage.prototype.clearDocuments = function () {
return __awaiter(this, void 0, void 0, function () {
return __generator(this, function (_a) {
switch (_a.label) {
case 0:
// Lazily initialize when the database has not been opened yet.
if (!!this.db) return [3, 2];
logger.info('Database not initialized, initializing now');
return [4, this.initialize()];
case 1:
_a.sent();
_a.label = 2;
case 2:
if (!this.db) {
logger.error('Failed to initialize database');
throw new Error('Failed to initialize database');
}
this.db.set('documents', []).write();
this.tfidf = new TfIdf();
this.paragraphMap.clear();
logger.info('Cleared all documents');
return [2];
}
});
});
};
/**
 * Wrap every case-insensitive occurrence of the given keywords in `text`.
 *
 * Matches are wrapped as **match** for the 'markdown' format and as
 * <span class="highlight">match</span> for any other format. Longer keywords
 * take precedence over shorter ones. A word boundary is required only on the
 * sides where the keyword itself starts or ends with a word character, so
 * keywords such as "c++" or CJK terms can match while "cat" still does not
 * match inside "category". Empty and non-string keywords are ignored.
 *
 * @param {string} text - Text to decorate; returned unchanged when falsy.
 * @param {string[]} keywords - Keywords to highlight.
 * @param {string} [format='markdown'] - 'markdown', or anything else for HTML spans.
 * @returns {string} The decorated text.
 */
DocumentStorage.prototype.highlightKeywords = function (text, keywords, format) {
    if (format === void 0) { format = 'markdown'; }
    if (!text || !keywords || keywords.length === 0)
        return text;
    // An empty keyword would compile to a pattern matching the empty string
    // at every boundary, so drop unusable entries up front.
    var usableKeywords = Array.prototype.filter.call(keywords, function (keyword) {
        return typeof keyword === 'string' && keyword.length > 0;
    });
    if (usableKeywords.length === 0)
        return text;
    // Longest first, so the alternation prefers the longest keyword at a position.
    usableKeywords.sort(function (a, b) { return b.length - a.length; });
    var keywordPatterns = usableKeywords.map(function (keyword) {
        var escapedKeyword = keyword.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        // \b only holds between a word and a non-word character; anchoring a
        // non-word edge with it would make the keyword unmatchable.
        var leading = /^\w/.test(keyword) ? '\\b' : '';
        var trailing = /\w$/.test(keyword) ? '\\b' : '';
        return leading + escapedKeyword + trailing;
    });
    var combinedPattern = new RegExp(keywordPatterns.join('|'), 'gi');
    if (format === 'markdown') {
        return text.replace(combinedPattern, function (match) { return "**".concat(match, "**"); });
    }
    return text.replace(combinedPattern, function (match) {
        return "<span class=\"highlight\">".concat(match, "</span>");
    });
};
// Search database documents and corpus documents for `query`.
//
// Parameters (inner function): query - search text; limit - maximum number of
// results (default 10); filters - optional { categories, dateFrom, dateTo,
// metadata }; userId - optional, enables query recording and personalisation.
//
// Returns a promise of result objects sorted by descending score and cut to
// `limit`. Only documents with a score > 0 are returned. Throws when the
// database or the TF-IDF index has not been initialized.
DocumentStorage.prototype.search = function (query_1) {
return __awaiter(this, arguments, void 0, function (query, limit, filters, userId) {
var dbDocuments, corpusDocuments, documentMap, documents, userPreferences, userInteractions, urlScores, queryTerms, similarQueries, userHistory, results;
var _this = this;
if (limit === void 0) { limit = 10; }
return __generator(this, function (_a) {
switch (_a.label) {
case 0:
if (!this.db || !this.tfidf) {
logger.error('Failed to search: database or TF-IDF not initialized');
throw new Error('Failed to search: database or TF-IDF not initialized');
}
// Record the query only for identified users.
if (!userId) return [3, 2];
return [4, this.recordSearchQuery(userId, query, filters)];
case 1:
_a.sent();
_a.label = 2;
case 2:
dbDocuments = this.db.get('documents').value();
logger.debug("Retrieved ".concat(dbDocuments.length, " documents from database"));
return [4, this.getCorpusDocuments()];
case 3:
corpusDocuments = _a.sent();
logger.debug("Retrieved ".concat(corpusDocuments.length, " documents from corpus"));
// Merge both sources keyed by URL; corpus entries are inserted last, so they
// replace database entries that share the same URL.
documentMap = new Map();
dbDocuments.forEach(function (doc) {
documentMap.set(doc.url, doc);
});
corpusDocuments.forEach(function (doc) {
documentMap.set(doc.url, doc);
});
documents = Array.from(documentMap.values());
logger.debug("Total unique documents after merging: ".concat(documents.length));
if (documents.length === 0) {
logger.info('No documents in database or corpus directory to search');
return [2, []];
}
userPreferences = null;
userInteractions = [];
if (!userId) return [3, 5];
return [4, this.getUserPreferences(userId)];
case 4:
userPreferences = _a.sent();
// The user's 50 most recent interactions, newest first.
userInteractions = this.db.get('interactions')
.filter({ userId: userId })
.sortBy('timestamp')
.reverse()
.take(50)
.value();
_a.label = 5;
case 5:
urlScores = new Map();
queryTerms = this.tokenizer.tokenize(query.toLowerCase()) || [];
similarQueries = [];
if (!userId) return [3, 7];
return [4, this.getUserSearchHistory(userId, 20)];
case 6:
userHistory = _a.sent();
// A past query counts as similar when it shares at least one term with the
// current one and the shared-term ratio (over the longer term list) exceeds 0.3.
similarQueries = userHistory
.filter(function (entry) {
var entryTerms = _this.tokenizer.tokenize(entry.query.toLowerCase()) || [];
var overlap = queryTerms.filter(function (term) { return entryTerms.includes(term); }).length;
return overlap > 0 && overlap / Math.max(queryTerms.length, entryTerms.length) > 0.3;
})
.map(function (entry) { return entry.query; });
_a.label = 7;
case 7:
// Score every document that passes the filters.
documents.forEach(function (doc) {
if (filters) {
// Category filter: the document must have one of the requested categories.
if (filters.categories && filters.categories.length > 0) {
if (!doc.category || !filters.categories.includes(doc.category)) {
return;
}
}
// Date filters only exclude documents that have a truthy timestamp;
// documents without one pass both checks.
if (filters.dateFrom && doc.timestamp && doc.timestamp < filters.dateFrom) {
return;
}
if (filters.dateTo && doc.timestamp && doc.timestamp > filters.dateTo) {
return;
}
// Metadata filter: every requested key must match with strict equality.
if (filters.metadata) {
for (var _i = 0, _a = Object.entries(filters.metadata); _i < _a.length; _i++) {
var _b = _a[_i], key = _b[0], value = _b[1];
if (!doc.metadata || doc.metadata[key] !== value) {
return;
}
}
}
}
var score = _this.calculateDocumentScore(doc, query);
// Personalisation: multiplicative boosts, applied only for identified users with preferences.
if (userId && userPreferences) {
// x1.2 for documents in one of the user's preferred categories.
if (userPreferences.preferredCategories && doc.category &&
userPreferences.preferredCategories.includes(doc.category)) {
score *= 1.2;
}
var docInteractions = userInteractions.filter(function (i) { return i.documentUrl === doc.url; });
if (docInteractions.length > 0) {
// userInteractions is newest-first, so index 0 is the latest interaction.
var mostRecent = docInteractions[0].timestamp;
var now = Date.now();
var daysSinceInteraction = (now - mostRecent) / (1000 * 60 * 60 * 24);
// Linear decay over 30 days, clamped to the range [0.1, 1].
var recencyFactor = Math.max(0.1, Math.min(1, 1 - (daysSinceInteraction / 30)));
var interactionBoost_1 = 0;
docInteractions.forEach(function (interaction) {
switch (interaction.interactionType) {
case 'bookmark':
interactionBoost_1 += 0.3;
break;
case 'view':
// With a duration: 0.1 per minute viewed, capped at 0.2; otherwise a flat 0.1.
if (interaction.durationMs) {
interactionBoost_1 += Math.min(0.2, interaction.durationMs / 60000 * 0.1);
}
else {
interactionBoost_1 += 0.1;
}
break;
case 'click':
interactionBoost_1 += 0.15;
break;
}
});
score *= (1 + interactionBoost_1 * recencyFactor);
}
// x1.15 when the user interacted with this document after a similar past query.
if (similarQueries.length > 0) {
var relevantForSimilarQueries = userInteractions.some(function (i) {
return i.documentUrl === doc.url && similarQueries.includes(i.query);
});
if (relevantForSimilarQueries) {
score *= 1.15;
}
}
}
urlScores.set(doc.url, score);
});
results = [];
// Build result objects for documents with a positive score.
// NOTE(review): documents.find is a linear lookup per scored URL; documentMap
// above already maps URL to document.
Array.from(urlScores.entries()).forEach(function (_a) {
var url = _a[0], score = _a[1];
var doc = documents.find(function (d) { return d.url === url; });
if (doc && score > 0) {
var snippet = _this.createSnippet(doc.content, query, true);
results.push({
url: doc.url,
title: doc.title,
snippet: _this.createSnippet(doc.content, query, false),
highlightedSnippet: snippet,
score: score,
category: doc.category,
// `paragraph` and `fullDocument` both carry the whole document content.
paragraph: doc.content,
highlightedContent: _this.highlightKeywords(doc.content, queryTerms),
fullDocument: doc.content
});
}
});
results.sort(function (a, b) { return b.score - a.score; });
return [2, results.slice(0, limit)];
}
});
});
};
DocumentStorage.prototype.searchForLLM = function (query_1) {
return __awaiter(this, arguments, void 0, function (query, limit, filters, userId) {
var searchResults, resultsByDocument, documentContents, documentEntries, i, _a, url, results, topResult, fullDocumentContent, dbDoc, filePath, fileExt, pdfBuffer, pdfData, error_4, error_5, queryTerms, relevantContext, topDocUrl, topDocFullContent, dbDoc, filePath, fileExt, pdfBuffer, pdfData, error_6, error_7, sources, topDocument;
var _b, _c;
if (limit === void 0) { limit = 10; }
return __generator(this, function (_d) {
switch (_d.label) {
case 0: return [4, this.search(query, limit, filters, userId)];
case 1:
searchResults = _d.sent();
logger.debug("Search returned ".concat(searchResults.length, " results"));
if (searchResults.length === 0) {
return [2, {
content: "No results found for your query.",
sources: []
}];
}
resultsByDocument = new Map();
searchResults.forEach(function (result) {
var results = resultsByDocument.get(result.url) || [];
results.push(result);
resultsByDocument.set(result.url, results);
});
documentContents = [];
documentEntries = Array.from(resultsByDocument.entries());
i = 0;
_d.label = 2;
case 2:
if (!(i < documentEntries.length)) return [3, 13];
_a = documentEntries[i], url = _a[0], results = _a[1];
topResult = results.reduce(function (prev, current) {
return (current.score > prev.score) ? current : prev;
});
fullDocumentContent = void 0;
dbDoc = (_b = this.db) === null || _b === void 0 ? void 0 : _b.get('documents').find({ url: url }).value();
if (!(dbDoc && dbDoc.content)) return [3, 3];
fullDocumentContent = dbDoc.content;
return [3, 11];
case 3:
if (!url.startsWith('file://')) return [3, 11];
_d.label = 4;
case 4:
_d.trys.push([4, 10, , 11]);
filePath = url.replace('file://', '');
if (!fs_1.default.existsSync(filePath)) return [3, 9];
fileExt = path_1.default.extname(filePath).toLowerCase();
if (!(fileExt === '.txt' || fileExt === '.md')) return [3, 5];
fullDocumentContent = fs_1.default.readFileSync(filePath, 'utf8');
return [3, 9];
case 5:
if (!(fileExt === '.pdf')) return [3, 9];
_d.label = 6;
case 6:
_d.trys.push([6, 8, , 9]);
pdfBuffer = fs_1.default.readFileSync(filePath);
return [4, (0, pdf_parse_1.default)(pdfBuffer)];
case 7:
pdfData = _d.sent();
fullDocumentContent = pdfData.text;
return [3, 9];
case 8:
error_4 = _d.sent();
logger.error("Error reading PDF file for fullDocument: ".concat(filePath), { error: error_4 });
return [3, 9];
case 9: return [3, 11];
case 10:
error_5 = _d.sent();
logger.error("Error reading file for fullDocument: ".concat(url), { error: error_5 });
return [3, 11];
case 11:
if (!fullDocumentContent) {
fullDocumentContent = topResult.snippet;
}
queryTerms = this.tokenizer.tokenize(query.toLowerCase()) || [];
relevantContext = fullDocumentContent
? this.extractRelevantContext(fullDocumentContent, queryTerms, query, 500)
: topResult.snippet;
documentContents.push({
url: url,
title: topResult.title,
paragraph: relevantContext,
highlightedParagraph: this.highlightKeywords(relevantContext, queryTerms),
score: topResult.score,
category: topResult.category,
fullDocument: undefined
});
_d.label = 12;
case 12:
i++;
return [3, 2];
case 13:
documentContents.sort(function (a, b) { return b.score - a.score; });
if