UNPKG

@aj-archipelago/cortex

Version:

Cortex is a GraphQL API for AI. It provides a simple, extensible interface for using AI services from OpenAI, Azure and others.

1,218 lines (1,083 loc) 109 kB
import logger from "./logger.js"; import stream from 'stream'; import os from 'os'; import http from 'http'; import https from 'https'; import { URL } from 'url'; import { v4 as uuidv4 } from 'uuid'; import { promisify } from 'util'; import { axios } from './requestExecutor.js'; import { config } from '../config.js'; import fs from 'fs'; import path from 'path'; import FormData from 'form-data'; import xxhash from 'xxhash-wasm'; import mime from 'mime-types'; import mimeDb from 'mime-db'; import { encrypt, decrypt } from './crypto.js'; const pipeline = promisify(stream.pipeline); const MEDIA_API_URL = config.get('whisperMediaApiUrl'); /** * Check if a URL is a YouTube URL * Validates URL structure to ensure it's a valid YouTube video URL * @param {string} url - URL to check * @returns {boolean} True if URL is a valid YouTube video URL */ export function isYoutubeUrl(url) { if (!url || typeof url !== 'string') return false; try { const urlObj = new URL(url); // Check for standard youtube.com domains if ( urlObj.hostname === "youtube.com" || urlObj.hostname === "www.youtube.com" ) { // For standard watch URLs, verify they have a video ID if (urlObj.pathname === "/watch") { return !!urlObj.searchParams.get("v"); } // For embed URLs, verify they have a video ID in the path if (urlObj.pathname.startsWith("/embed/")) { return urlObj.pathname.length > 7; // '/embed/' is 7 chars } // For shorts URLs, verify they have a video ID in the path if (urlObj.pathname.startsWith("/shorts/")) { return urlObj.pathname.length > 8; // '/shorts/' is 8 chars } return false; } // Check for shortened youtu.be domain if (urlObj.hostname === "youtu.be") { // Verify there's a video ID in the path return urlObj.pathname.length > 1; // '/' is 1 char } return false; } catch (err) { return false; } } // Cache xxhash instance for reuse let xxhashInstance = null; let xxhashInitPromise = null; /** * Get or initialize xxhash instance (reused for performance) * Thread-safe initialization to prevent 
race conditions in high-volume scenarios * @returns {Promise<Object>} xxhash instance */ async function getXXHashInstance() { // If already initialized, return immediately if (xxhashInstance) { return xxhashInstance; } // If initialization is in progress, wait for it if (xxhashInitPromise) { return await xxhashInitPromise; } // Start initialization (only one will execute) xxhashInitPromise = (async () => { try { const instance = await xxhash(); xxhashInstance = instance; return instance; } finally { // Clear the promise so we can retry if initialization fails xxhashInitPromise = null; } })(); return await xxhashInitPromise; } /** * Compute xxhash64 hash of a file (super fast hash for file deduplication) * Uses xxhash64 to match the hash format used in labeeb and cortex file handler * @param {string} filePath - Path to the file * @returns {Promise<string>} xxhash64 hash in hex format */ async function computeFileHash(filePath) { const hasher = await getXXHashInstance(); return new Promise((resolve, reject) => { // Create a new xxh64 instance for this file to avoid concurrency issues const xxh64 = hasher.create64(); const stream = fs.createReadStream(filePath); stream.on('data', (data) => xxh64.update(data)); stream.on('end', () => resolve(xxh64.digest().toString(16))); stream.on('error', (error) => reject(error)); }); } /** * Compute xxhash64 hash of a buffer * @param {Buffer} buffer - Buffer to hash * @returns {Promise<string>} xxhash64 hash in hex format */ async function computeBufferHash(buffer) { const hasher = await getXXHashInstance(); const xxh64 = hasher.create64(); xxh64.update(buffer); return xxh64.digest().toString(16); } /** * Fetch/load a file from URL via file handler * Downloads file from URL, processes it, and returns the result * @param {string} fileUrl - URL of file to fetch * @param {string} requestId - Request ID for tracking * @param {string|null} contextId - Optional context ID for scoped file storage * @param {boolean} save - Whether to save 
the file (default: false) * @returns {Promise<Object>} Response data with file information */ async function fetchFileFromUrl(fileUrl, requestId, contextId = null, save = false) { const fileHandlerUrl = MEDIA_API_URL; if (!fileHandlerUrl || fileHandlerUrl === 'null') { throw new Error('File handler URL is not configured'); } const url = buildFileHandlerUrl(fileHandlerUrl, { fetch: fileUrl, requestId, ...(contextId ? { contextId } : {}), ...(save ? { save: true } : {}) }); const response = await axios.get(url, { timeout: 60000 }); if (!response.data?.url && !Array.isArray(response.data)) { throw new Error("File handler did not return valid data"); } return response.data; } /** * Build a file handler URL with query parameters * Handles separator detection (? vs &) and parameter encoding * @param {string} baseUrl - Base file handler URL * @param {Object} params - Query parameters as key-value pairs (null/undefined values are skipped) * @returns {string} Complete URL with query parameters */ function buildFileHandlerUrl(baseUrl, params = {}) { if (!baseUrl) { throw new Error('baseUrl is required'); } const separator = baseUrl.includes('?') ? 
'&' : '?'; const queryParams = []; Object.entries(params).forEach(([key, value]) => { if (value != null && value !== '') { queryParams.push(`${encodeURIComponent(key)}=${encodeURIComponent(value)}`); } }); if (queryParams.length === 0) { return baseUrl; } return `${baseUrl}${separator}${queryParams.join('&')}`; } async function deleteTempPath(path) { try { if (!path) { logger.warn('Temporary path is not defined.'); return; } if (!fs.existsSync(path)) { logger.warn(`Temporary path ${path} does not exist.`); return; } const stats = fs.statSync(path); if (stats.isFile()) { fs.unlinkSync(path); logger.info(`Temporary file ${path} deleted successfully.`); } else if (stats.isDirectory()) { fs.rmSync(path, { recursive: true }); logger.info(`Temporary folder ${path} and its contents deleted successfully.`); } } catch (err) { logger.error(`Error occurred while deleting the temporary path: ${err}`); } } function generateUniqueFilename(extension) { return `${uuidv4()}.${extension}`; } const downloadFile = async (fileUrl) => { const urlObj = new URL(fileUrl); const pathname = urlObj.pathname; const fileExtension = path.extname(pathname).slice(1) || 'bin'; const uniqueFilename = generateUniqueFilename(fileExtension); const tempDir = os.tmpdir(); const localFilePath = `${tempDir}/${uniqueFilename}`; // eslint-disable-next-line no-async-promise-executor return new Promise(async (resolve, reject) => { try { const parsedUrl = new URL(fileUrl); const protocol = parsedUrl.protocol === 'https:' ? 
https : http; const response = await new Promise((resolve, reject) => { protocol.get(parsedUrl, (res) => { if (res.statusCode === 200) { resolve(res); } else { reject(new Error(`HTTP request failed with status code ${res.statusCode}`)); } }).on('error', reject); }); await pipeline(response, fs.createWriteStream(localFilePath)); logger.info(`Downloaded file to ${localFilePath}`); resolve(localFilePath); } catch (error) { fs.unlink(localFilePath, () => { reject(error); }); //throw error; } }); }; /** * Get media chunks from file handler (for chunked media files) * @param {string} file - File URL or URI * @param {string} requestId - Request ID for tracking * @param {string|null} contextId - Optional context ID for scoped file storage * @returns {Promise<Array>} Array of chunk URLs */ async function getMediaChunks(file, requestId, contextId = null) { try { if (MEDIA_API_URL) { const url = buildFileHandlerUrl(MEDIA_API_URL, { uri: file, requestId, ...(contextId ? { contextId } : {}) }); const res = await axios.get(url, { timeout: 600000 }); return res.data; } else { logger.info(`No API_URL set, returning file as chunk`); return [file]; } } catch (err) { logger.error(`Error getting media chunks list from api: ${err}`); throw err; } } /** * Mark a request as completed for cleanup in file handler * @param {string} requestId - Request ID to mark as completed * @param {string|null} contextId - Optional context ID for scoped file storage * @returns {Promise<Object|null>} Response data or null */ async function markCompletedForCleanUp(requestId, contextId = null) { try { if (MEDIA_API_URL) { const url = buildFileHandlerUrl(MEDIA_API_URL, { requestId, ...(contextId ? 
{ contextId } : {}) }); const res = await axios.delete(url, { timeout: 15000 }); logger.info(`Marked request ${requestId} as completed: ${JSON.stringify(res.data)}`); return res.data; } } catch (err) { logger.error(`Error marking request ${requestId} as completed: ${err}`); } return null; } /** * Delete a file from cloud storage by hash * @param {string} hash - File hash to delete * @param {pathwayResolver} pathwayResolver - Optional pathway resolver for logging * @param {string|null} contextId - Optional but strongly recommended context id for scoped hashes * @returns {Promise<boolean>} True if file was deleted, false if not found or error */ async function deleteFileByHash(hash, pathwayResolver = null, contextId = null) { if (!hash || typeof hash !== 'string') { logger.warn('deleteFileByHash: hash is required and must be a string'); return false; } const fileHandlerUrl = MEDIA_API_URL; if (!fileHandlerUrl) { logger.warn('deleteFileByHash: WHISPER_MEDIA_API_URL is not set, cannot delete file'); return false; } try { const deleteUrl = buildFileHandlerUrl(fileHandlerUrl, { hash, ...(contextId ? 
{ contextId } : {}) }); const response = await axios.delete(deleteUrl, { validateStatus: (status) => status >= 200 && status < 500, // Accept 200-499 as valid responses timeout: 30000 }); if (response.status === 200) { logger.info(`Successfully deleted file with hash ${hash}`); return true; } else if (response.status === 404) { logger.info(`File with hash ${hash} not found (may have already been deleted)`); return false; // Not an error - file doesn't exist } else { logger.warn(`Unexpected status ${response.status} when deleting file with hash ${hash}`); return false; } } catch (error) { // If it's a 404, that's fine - file doesn't exist if (error?.response?.status === 404) { logger.info(`File with hash ${hash} not found during deletion (may have already been deleted)`); return false; } // Log other errors but don't throw - deletion failure shouldn't block modification const errorMsg = error?.message || String(error); logger.warn(`Error deleting file with hash ${hash}: ${errorMsg}`); return false; } } // Helper function to extract file metadata from a content object // Returns normalized format with url and gcs (for file collection storage) // Note: displayFilename is not extracted from messages - it's set by CFH on upload, // or by sys_update_file_metadata.js, or by file collection tools function extractFileMetadataFromContent(contentObj) { const files = []; if (contentObj.type === 'image_url' && contentObj.image_url?.url) { files.push({ url: contentObj.image_url.url, gcs: contentObj.gcs || null, hash: contentObj.hash || null, type: 'image_url' }); } else if (contentObj.type === 'file' && contentObj.url) { files.push({ url: contentObj.url, gcs: contentObj.gcs || null, hash: contentObj.hash || null, type: 'file' }); } else if (contentObj.url && (contentObj.type === 'image_url' || !contentObj.type)) { // Handle direct URL objects files.push({ url: contentObj.url, gcs: contentObj.gcs || null, hash: contentObj.hash || null, type: contentObj.type || 'file' }); } return 
files; } // Cache for file collections during a request lifecycle // Stores raw parsed file data (all files from Redis) to support flexible filtering // Structure: { rawFiles: Array<parsed file data>, timestamp: number } const fileCollectionCache = new Map(); const CACHE_TTL = 5000; // 5 seconds // Singleton Redis client for file collection operations let redisClientSingleton = null; // Helper to get Redis client for direct hash map access async function getRedisClient() { if (redisClientSingleton) { return redisClientSingleton; } try { const { config } = await import('../config.js'); const connectionString = config.get('storageConnectionString'); if (!connectionString) { return null; } // Import Redis and create client const Redis = (await import('ioredis')).default; redisClientSingleton = new Redis(connectionString, { maxRetriesPerRequest: null, enableReadyCheck: true, lazyConnect: false, connectTimeout: 10000, }); // Handle errors redisClientSingleton.on('error', async (error) => { const logger = (await import('./logger.js')).default; logger.error(`Redis client error in fileUtils: ${error}`); }); return redisClientSingleton; } catch (e) { return null; } } /** * Get cache key for file collection */ function getCollectionCacheKey(contextId, contextKey) { // Cache key for file collection (legacy format maintained for cache compatibility) return `${contextId}-fileCollection-${contextKey || 'default'}`; } /** * Invalidate file collection cache for a given context * @param {string} contextId - Context ID for the file collection * @param {string} contextKey - Optional context key for encryption */ export function invalidateFileCollectionCache(contextId, contextKey = null) { const cacheKey = getCollectionCacheKey(contextId, contextKey); fileCollectionCache.delete(cacheKey); } /** * Extract files from chat history * @param {Array} chatHistory - Chat history to scan * @returns {Array} Array of file metadata objects */ function extractFilesFromChatHistory(chatHistory) { if 
(!chatHistory || !Array.isArray(chatHistory)) { return []; } const extractedFiles = []; for (const message of chatHistory) { if (!message || !message.content) { continue; } // Handle array content if (Array.isArray(message.content)) { for (const content of message.content) { try { const contentObj = typeof content === 'string' ? JSON.parse(content) : content; extractedFiles.push(...extractFileMetadataFromContent(contentObj)); } catch (e) { // Not JSON or couldn't be parsed, continue continue; } } } // Handle string content else if (typeof message.content === 'string') { try { const contentObj = JSON.parse(message.content); extractedFiles.push(...extractFileMetadataFromContent(contentObj)); } catch (e) { // Not JSON or couldn't be parsed, continue continue; } } // Handle object content else if (typeof message.content === 'object') { extractedFiles.push(...extractFileMetadataFromContent(message.content)); } } return extractedFiles; } /** * Check if a file should be included in the collection based on inCollection metadata * Supports both boolean (backward compat) and array format * @param {boolean|Array<string>|undefined} inCollection - inCollection metadata value * @param {string|null} chatId - Optional chat ID to filter by (if null, only global files are included) * @returns {boolean} True if file should be included */ function isFileInCollection(inCollection, chatId = null) { // If not set, file is not in collection // Treat empty array [] the same as undefined (not in collection) // Note: false is not a valid value (normalizes to undefined), but handle it defensively if (inCollection === undefined || inCollection === null || inCollection === false) { return false; } // Empty array means not in collection (same as undefined) if (Array.isArray(inCollection) && inCollection.length === 0) { return false; } // Backward compatibility: boolean true means global if (inCollection === true) { return true; } // Array format: check if it includes '*' (global) or the specific 
chatId if (Array.isArray(inCollection) && inCollection.length > 0) { // If no chatId specified, only include global files if (chatId === null) { return inCollection.includes('*'); } // Include if global or matches specific chatId return inCollection.includes('*') || inCollection.includes(chatId); } // Unknown format, exclude return false; } /** * Load file collection from memory system or cache * @param {string} contextId - Context ID for the file collection * @param {string} contextKey - Optional context key for encryption * @param {boolean} useCache - Whether to check cache first (default: true) * @param {string|null} chatId - Optional chat ID to filter files by (if provided, only includes files with '*' or this chatId in inCollection) * @returns {Promise<Array>} File collection array */ /** * Write file data to Redis with encryption of sensitive fields * Follows the same pattern as setvWithDoubleEncryption - skips encryption for empty values * @param {Object} redisClient - Redis client * @param {string} contextMapKey - Redis hash map key * @param {string} hash - File hash (key in hash map) * @param {Object} fileData - File data object * @param {string} contextKey - Optional context key for encryption */ async function writeFileDataToRedis(redisClient, contextMapKey, hash, fileData, contextKey = null) { const dataToStore = { ...fileData }; // Encrypt sensitive fields if contextKey is provided (same pattern as memory encryption) if (contextKey && contextKey.trim() !== '') { // Encrypt tags (array of strings) - skip if empty (consistent with memory encryption) if (dataToStore.tags && Array.isArray(dataToStore.tags) && dataToStore.tags.length > 0) { try { const tagsJson = JSON.stringify(dataToStore.tags); const encrypted = encrypt(tagsJson, contextKey); if (encrypted !== null) { dataToStore.tags = encrypted; } // If encryption fails, continue with unencrypted (same pattern as memory) } catch (error) { logger.warn(`Failed to encrypt tags: ${error.message}`); } } // 
Encrypt notes (string) - skip if empty (consistent with memory encryption) if (dataToStore.notes && typeof dataToStore.notes === 'string' && dataToStore.notes.trim() !== '') { try { const encrypted = encrypt(dataToStore.notes, contextKey); if (encrypted !== null) { dataToStore.notes = encrypted; } // If encryption fails, continue with unencrypted (same pattern as memory) } catch (error) { logger.warn(`Failed to encrypt notes: ${error.message}`); } } } await redisClient.hset(contextMapKey, hash, JSON.stringify(dataToStore)); } /** * Read file data from Redis with decryption of sensitive fields * Follows the same pattern as getvWithDoubleDecryption - tries decrypt, falls back to original * @param {string} dataStr - JSON string from Redis * @param {string} contextKey - Optional context key for decryption * @returns {Object|null} Parsed and decrypted file data, or null if invalid */ function readFileDataFromRedis(dataStr, contextKey = null) { if (!dataStr) return null; try { const fileData = JSON.parse(dataStr); // Decrypt sensitive fields if contextKey is provided (same pattern as memory decryption) if (contextKey && contextKey.trim() !== '') { // Decrypt tags (array of strings) if (fileData.tags !== undefined && fileData.tags !== null) { // If already an array, it's unencrypted legacy data - keep as-is if (!Array.isArray(fileData.tags) && typeof fileData.tags === 'string') { // Try to decrypt (encrypted strings have ':' separator from IV) if (fileData.tags.includes(':')) { try { const decrypted = decrypt(fileData.tags, contextKey); if (decrypted !== null) { // Try to parse as JSON array, fallback to array with single string try { fileData.tags = JSON.parse(decrypted); } catch (e) { fileData.tags = [decrypted]; } } // If decryption returns null, keep original (might be unencrypted legacy data) } catch (error) { // Decryption failed, keep as-is (unencrypted legacy data) } } else { // No ':' means not encrypted - try parsing as JSON, fallback to array try { 
fileData.tags = JSON.parse(fileData.tags); } catch (e) { fileData.tags = [fileData.tags]; } } } } else { fileData.tags = []; } // Decrypt notes (string) if (fileData.notes !== undefined && fileData.notes !== null) { if (typeof fileData.notes === 'string' && fileData.notes.includes(':')) { // Try to decrypt try { const decrypted = decrypt(fileData.notes, contextKey); if (decrypted !== null) { fileData.notes = decrypted; } // If decryption returns null, keep original (might be unencrypted legacy data) } catch (error) { // Decryption failed, keep as-is (unencrypted legacy data) } } // If not encrypted (no ':'), keep as-is (legacy unencrypted data) } else { fileData.notes = ''; } } return fileData; } catch (e) { return null; } } /** * Parse raw Redis hash map data into file objects (without filtering) * @param {Object} allFiles - Redis HGETALL result {hash: fileDataStr} * @param {string} contextKey - Optional context key for decryption * @returns {Array} Array of parsed file data objects (includes inCollection metadata) */ function parseRawFileData(allFiles, contextKey = null) { return Object.entries(allFiles).map(([hash, fileDataStr]) => { const decryptedData = readFileDataFromRedis(fileDataStr, contextKey); if (!decryptedData) { return null; } // Use converted URL, GCS, and mimeType if converted block exists // This ensures we use the converted file (the actual processable content) as the primary values // Keep displayFilename as the original (e.g., "foo.docx" even if URL is "foo.md") const url = decryptedData.converted?.url || decryptedData.url; const gcs = decryptedData.converted?.gcs || decryptedData.gcs || null; const mimeType = decryptedData.converted?.mimeType || decryptedData.mimeType || null; // Return parsed file data with hash and inCollection preserved for filtering return { id: decryptedData.id || `${Date.now()}-${Math.random().toString(36).substring(2, 9)}`, url: url, // Use converted URL if available gcs: gcs, // Use converted GCS if available 
displayFilename: decryptedData.displayFilename || decryptedData.filename || null, mimeType: mimeType, // Use converted mimeType if available tags: decryptedData.tags || [], notes: decryptedData.notes || '', hash: hash, permanent: decryptedData.permanent || false, addedDate: decryptedData.addedDate || decryptedData.timestamp || new Date().toISOString(), lastAccessed: decryptedData.lastAccessed || decryptedData.timestamp || new Date().toISOString(), // Mark as converted if converted block exists (for edit prevention) ...(decryptedData.converted && { _isConverted: true }), // Preserve inCollection for filtering inCollection: decryptedData.inCollection }; }).filter(Boolean); } /** * Sort files by lastAccessed (most recent first) * @param {Array} files - Array of file objects * @returns {Array} Sorted array */ function sortFilesByLastAccessed(files) { return files.sort((a, b) => { const aDate = new Date(a.lastAccessed || a.addedDate || 0); const bDate = new Date(b.lastAccessed || b.addedDate || 0); return bDate - aDate; }); } /** * Load file collection from one or more contexts (unified function) * * @param {Array|Object|string} agentContext - Context(s) to load from: * - Array of {contextId, contextKey, default} objects (compound context) * - Single {contextId, contextKey, default} object * - String contextId (contextKey will be null) * @param {Object} options - Load options * @param {Array<string>|string|null} options.chatIds - Chat IDs to filter by: * - If null/undefined/empty array: return ALL files regardless of inCollection * - If provided: filter to files where inCollection includes '*' or any of these chatIds * @param {boolean} options.useCache - Whether to use cache (default: true) * @returns {Promise<Array>} File collection (deduplicated across contexts, sorted by lastAccessed) */ async function loadFileCollection(agentContext, options = {}) { // Normalize agentContext to array format let contexts = []; if (typeof agentContext === 'string') { // Single 
contextId string contexts = [{ contextId: agentContext, contextKey: null, default: true }]; } else if (Array.isArray(agentContext)) { contexts = agentContext.filter(ctx => ctx && ctx.contextId); } else if (agentContext && typeof agentContext === 'object' && agentContext.contextId) { // Single context object contexts = [agentContext]; } if (contexts.length === 0) { return []; } // Normalize options const useCache = options.useCache !== false; // default true let chatIds = options.chatIds; // Normalize chatIds to array or null if (chatIds === undefined || chatIds === null) { chatIds = null; // No filtering } else if (typeof chatIds === 'string') { chatIds = chatIds.trim() ? [chatIds] : null; } else if (Array.isArray(chatIds)) { chatIds = chatIds.filter(id => id && typeof id === 'string' && id.trim()); if (chatIds.length === 0) chatIds = null; } else { chatIds = null; } // Load files from all contexts let allFiles = []; for (const ctx of contexts) { const contextId = ctx.contextId; const contextKey = ctx.contextKey || null; const cacheKey = getCollectionCacheKey(contextId, contextKey); let rawFiles = []; // Check cache first if (useCache && fileCollectionCache.has(cacheKey)) { const cached = fileCollectionCache.get(cacheKey); if (Date.now() - cached.timestamp < CACHE_TTL) { rawFiles = cached.rawFiles; } } // Load from Redis if not cached if (rawFiles.length === 0) { try { const redisClient = await getRedisClient(); if (redisClient) { const contextMapKey = `FileStoreMap:ctx:${contextId}`; const filesData = await redisClient.hgetall(contextMapKey); rawFiles = parseRawFileData(filesData, contextKey); // Update cache if (useCache) { fileCollectionCache.set(cacheKey, { rawFiles: rawFiles, timestamp: Date.now() }); } } } catch (e) { // Collection doesn't exist yet or error reading rawFiles = []; } } // Tag files with their source context allFiles.push(...rawFiles.map(f => ({ ...f, _contextId: contextId }))); } // Deduplicate by hash/url/gcs (keep first occurrence - primary 
context wins) const seenHashes = new Set(); const seenUrls = new Set(); const seenGcs = new Set(); const deduped = []; for (const file of allFiles) { const isDupe = (file.hash && seenHashes.has(file.hash)) || (file.url && seenUrls.has(file.url)) || (file.gcs && seenGcs.has(file.gcs)); if (!isDupe) { if (file.hash) seenHashes.add(file.hash); if (file.url) seenUrls.add(file.url); if (file.gcs) seenGcs.add(file.gcs); deduped.push(file); } } // Apply filtering let filtered; if (chatIds !== null) { // Filter to files that are in collection for any of the provided chatIds filtered = deduped.filter(file => { // Check if file is accessible for any of the chatIds for (const chatId of chatIds) { if (isFileInCollection(file.inCollection, chatId)) { return true; } } return false; }); } else { // If chatIds is null, return ALL files // Valid inCollection is an array with strings (or boolean true for backward compat) // Files with inCollection === undefined/null/[] are included (Labeeb uploads, not in collection - treat the same) // Note: false is not a valid value (normalizes to undefined), so we shouldn't see it in data filtered = deduped.filter(file => { const ic = file.inCollection; // Include undefined/null/empty array (Labeeb uploads, not in collection - treat the same) if (ic === undefined || ic === null || (Array.isArray(ic) && ic.length === 0)) return true; // Include boolean true (backward compat - global) if (ic === true) return true; // Include valid arrays (non-empty arrays with strings) if (Array.isArray(ic) && ic.length > 0) return true; // Exclude everything else (invalid formats - defensive programming) return false; }); } return sortFilesByLastAccessed(filtered); } /** * Normalize inCollection value to array format * @param {boolean|Array<string>|undefined} inCollection - inCollection value to normalize * @returns {Array<string>|undefined} Normalized array, or undefined if false/null/undefined/empty array */ function normalizeInCollection(inCollection) { // If 
explicitly false, return undefined (not in collection, same as undefined) if (inCollection === false) { return undefined; } // If null or undefined, return undefined (file not in collection) if (inCollection === null || inCollection === undefined) { return undefined; } // If empty array, return undefined (not in collection, same as undefined) if (Array.isArray(inCollection) && inCollection.length === 0) { return undefined; } // Boolean true means global if (inCollection === true) { return ['*']; } // Already an array, return as-is if (Array.isArray(inCollection)) { return inCollection; } // Unknown format, default to global return ['*']; } /** * Get the appropriate inCollection value based on chatId * Centralized function to ensure consistent behavior across all file operations * @param {string|null|undefined} chatId - Optional chat ID * @returns {Array<string>} Array with chatId if provided, otherwise ['*'] for global */ function getInCollectionValue(chatId = null) { if (chatId && typeof chatId === 'string' && chatId.trim() !== '') { return [chatId]; } return ['*']; } /** * Add a chatId to an existing inCollection array (reference counting) * If the chatId is already present, returns the array unchanged. * * IMPORTANT: inCollection is either ['*'] (global) OR [chatId, ...] (chat-scoped), never mixed. * If inCollection contains '*' (global), it stays global - no chatIds are added. * * @param {Array<string>|undefined} existingInCollection - Current inCollection value * @param {string|null} chatId - Chat ID to add * @returns {Array<string>} Updated inCollection array */ function addChatIdToInCollection(existingInCollection, chatId) { // Normalize existing to array const existing = Array.isArray(existingInCollection) ? 
/**
 * Remove a chatId from an inCollection array (reference counting).
 *
 * Global files (an array containing '*') are not reference-counted and are
 * returned unchanged; only chat-scoped arrays have the chatId filtered out.
 * Callers that want to drop a global file from the collection must delete it
 * outright rather than rely on this helper.
 *
 * @param {Array<string>|undefined} existingInCollection - Current inCollection value (non-arrays are treated as [])
 * @param {string|null} chatId - Chat ID to remove
 * @returns {Array<string>} Updated inCollection array (may be empty for chat-scoped files)
 */
function removeChatIdFromInCollection(existingInCollection, chatId) {
    const existing = Array.isArray(existingInCollection) ? existingInCollection : [];

    // Without a usable chatId there is nothing to remove.
    const hasChatId = typeof chatId === 'string' && chatId.trim() !== '';
    if (!hasChatId) {
        return existing;
    }

    // Global membership is not scoped to chats, so it is left as-is.
    if (existing.includes('*')) {
        return existing;
    }

    return existing.filter((id) => id !== chatId);
}

/**
 * Update file metadata in the Redis hash map for a context.
 *
 * Only Cortex-managed fields are written (displayFilename, id, tags, notes,
 * mimeType, addedDate, lastAccessed, permanent, inCollection). Every other
 * field already stored for the hash (the CFH-managed url, gcs, hash,
 * filename, ...) is carried over untouched. Entries are never created here:
 * if the hash is not present in the context map the call returns false.
 *
 * @param {string} contextId - Context ID
 * @param {string} hash - File hash
 * @param {Object} metadata - Metadata fields to update; fields left undefined are not touched
 * @param {string} contextKey - Optional context key for encryption
 * @param {string|null} chatId - Optional chat ID, used as the default for inCollection
 *   when neither metadata nor the stored record defines one
 * @returns {Promise<boolean>} True if the record was rewritten; false on missing
 *   arguments, missing Redis client, missing record, or any error (logged as a warning)
 */
async function updateFileMetadata(contextId, hash, metadata, contextKey = null, chatId = null) {
    if (!contextId || !hash) {
        return false;
    }

    try {
        const redisClient = await getRedisClient();
        if (!redisClient) {
            return false;
        }

        const contextMapKey = `FileStoreMap:ctx:${contextId}`;

        // The record must already exist - this function never creates entries.
        const existingDataStr = await redisClient.hget(contextMapKey, hash);
        if (!existingDataStr) {
            return false;
        }

        const existingData = readFileDataFromRedis(existingDataStr, contextKey) || {};

        // inCollection precedence: explicit metadata value, then the stored value,
        // then a default derived from chatId. The normalized result may be
        // undefined, in which case the key is removed below.
        let inCollection;
        if (metadata.inCollection !== undefined) {
            inCollection = normalizeInCollection(metadata.inCollection);
        } else if (existingData.inCollection !== undefined) {
            inCollection = normalizeInCollection(existingData.inCollection);
        } else {
            inCollection = getInCollectionValue(chatId);
        }

        // Start from the stored record so all CFH data is preserved.
        const fileData = { ...existingData, inCollection };

        // Overlay only the Cortex-managed fields the caller actually supplied.
        const cortexFields = [
            'displayFilename',
            'id',
            'tags',
            'notes',
            'mimeType',
            'addedDate',
            'lastAccessed',
            'permanent',
        ];
        for (const field of cortexFields) {
            if (metadata[field] !== undefined) {
                fileData[field] = metadata[field];
            }
        }

        // An undefined inCollection means "not in any collection": drop the key.
        if (fileData.inCollection === undefined) {
            delete fileData.inCollection;
        }

        // Write back to the hash map - encryption happens in the helper.
        await writeFileDataToRedis(redisClient, contextMapKey, hash, fileData, contextKey);

        // Invalidate cache (contextKey is passed so the matching cache key is cleared).
        invalidateFileCollectionCache(contextId, contextKey);

        return true;
    } catch (e) {
        // Uses the module-level logger; a dynamic import here was redundant and
        // could itself throw from inside the error handler.
        logger.warn(`Failed to update file metadata: ${e?.message || String(e)}`);
        return false;
    }
}

/**
 * Save a file collection to the Redis hash map for a context.
 *
 * Each entry is written only when it is new or when one of id, tags, notes,
 * mimeType or permanent differs from the stored record. Entries missing from
 * `collection` are NOT removed from the hash map. The in-memory collection
 * cache entry for this context is dropped after a successful pass.
 *
 * @param {string} contextId - Context ID for the file collection
 * @param {string} contextKey - Optional context key, forwarded to the read/write helpers and the cache key
 * @param {Array} collection - File collection array
 * @param {string|null} chatId - Optional chat ID used to derive/extend inCollection for written entries
 * @returns {Promise<boolean>} True if successful; false if no Redis client is
 *   available or an error occurred (logged as a warning)
 */
async function saveFileCollection(contextId, contextKey, collection, chatId = null) {
    const cacheKey = getCollectionCacheKey(contextId, contextKey);

    try {
        const redisClient = await getRedisClient();
        if (!redisClient) {
            return false;
        }

        const contextMapKey = `FileStoreMap:ctx:${contextId}`;

        // Snapshot of the current state, used to detect changes.
        // Guard against a client that yields null for a missing key.
        const currentFiles = (await redisClient.hgetall(contextMapKey)) || {};

        for (const file of collection) {
            // Derive a hash from the URL for files that were added without one.
            let fileHash = file.hash;
            if (!fileHash && file.url) {
                fileHash = await computeBufferHash(Buffer.from(file.url));
            }
            if (!fileHash) {
                continue;
            }

            // Decode the stored record once; it serves both the change check
            // and the merge below.
            const currentDataStr = currentFiles[fileHash];
            const currentData = readFileDataFromRedis(currentDataStr, contextKey);

            // NOTE(review): only id, tags, notes, mimeType and permanent are
            // compared. A change limited to displayFilename, or a new chatId for
            // an otherwise identical record, does not trigger a write - confirm
            // this is intended.
            const isUnchanged = Boolean(currentDataStr) && Boolean(currentData) &&
                currentData.id === file.id &&
                JSON.stringify(currentData.tags || []) === JSON.stringify(file.tags || []) &&
                currentData.notes === (file.notes || '') &&
                currentData.mimeType === (file.mimeType || null) &&
                currentData.permanent === (file.permanent || false);
            if (isUnchanged) {
                continue;
            }

            const existingData = currentData || {};

            // Merge the stored (CFH) data with Cortex metadata. The stored
            // filename is preserved via the spread; only displayFilename is
            // Cortex-managed.
            const fileData = {
                ...existingData,
                id: file.id,
                url: file.url || existingData.url,
                gcs: file.gcs || existingData.gcs || null,
                displayFilename: file.displayFilename !== undefined
                    ? file.displayFilename
                    : (existingData.displayFilename || null),
                tags: file.tags || [],
                notes: file.notes || '',
                mimeType: file.mimeType || existingData.mimeType || null,
                addedDate: file.addedDate || existingData.timestamp || new Date().toISOString(),
                lastAccessed: file.lastAccessed || new Date().toISOString(),
                permanent: file.permanent !== undefined
                    ? file.permanent
                    : (existingData.permanent || false),
                // Reference counting: extend an existing membership list with
                // chatId, otherwise derive the initial value from chatId.
                inCollection: existingData.inCollection
                    ? addChatIdToInCollection(existingData.inCollection, chatId)
                    : getInCollectionValue(chatId),
            };

            // Write back to the hash map - encryption happens in the helper.
            await writeFileDataToRedis(redisClient, contextMapKey, fileHash, fileData, contextKey);
        }

        // Files dropped from the collection are intentionally left in the hash
        // map: CFH manages file lifecycle and they may still exist in storage.

        // Invalidate cache (repopulated on the next load with fresh Redis data).
        fileCollectionCache.delete(cacheKey);

        return true;
    } catch (e) {
        // Uses the module-level logger; a dynamic import here was redundant and
        // could itself throw from inside the error handler.
        logger.warn(`Failed to save file collection: ${e?.message || String(e)}`);
        return false;
    }
}
fileUrl is provided, this can be null * @param {string} gcs - Optional Google Cloud Storage URL * @param {string} filename - Filename or title for the file * @param {Array<string>} tags - Optional array of tags * @param {string} notes - Optional notes or description * @param {string} hash - Optional file hash * @param {string} fileUrl - Optional: URL of file to upload (if not already in cloud storage) * @param {pathwayResolver} pathwayResolver - Optional pathway resolver for logging * @param {boolean} permanent - If true, file is stored with permanent retention * @param {string|null} chatId - Optional chat ID, used for inCollection value (chat-scoped if provided, global if not) * @returns {Promise<Object>} File entry object with id */ async function addFileToCollection(contextId, contextKey, url, gcs, filename, tags = [], notes = '', hash = null, fileUrl = null, pathwayResolver = null, permanent = false, chatId = null) { if (!contextId || !filename) { throw new Error("contextId and filename are required"); } // If permanent=true, set retention=permanent to keep file forever const desiredRetention = permanent ? 'permanent' : 'temporary'; // YouTube URLs should not be added to the file collection (they are never uploaded to CFH) // They can be used directly in analyzer tools without being in the collection if (fileUrl && isYoutubeUrl(fileUrl)) { throw new Error("YouTube URLs cannot be added to the file collection. Use the YouTube URL directly with analyzer tools instead."); } if (url && isYoutubeUrl(url)) { throw new Error("YouTube URLs cannot be added to the file collection. 
Use the YouTube URL directly with analyzer tools instead."); } // If fileUrl is provided and url is not already a cloud URL, upload the file first let finalUrl = url; let finalGcs = gcs; let finalHash = hash; if (fileUrl && (!url || (!url.includes('blob.core.windows.net') && !url.includes('storage.googleapis.com')))) { // Upload the file from the URL // uploadFileToCloud will download it, compute hash, check if it exists, and upload if needed // It uploads the local file stream, not the URL, to avoid triggering remoteFile fetch const uploadResult = await uploadFileToCloud(fileUrl, null, filename, pathwayResolver, contextId); finalUrl = uploadResult.url; finalGcs = uploadResult.gcs; finalHash = uploadResult.hash || hash; } // If the caller asked for permanence/privacy and we have a hash, update retention (best-effort) if (finalHash && desiredRetention === 'permanent') { try { await setRetentionForHash(finalHash, desiredRetention, contextId, pathwayResolver); } catch (e) { const msg = `Failed to set retention=${desiredRetention} for hash ${finalHash}: ${e?.message || String(e)}`; if (pathwayResolver?.logWarning) pathwayResolver.logWarning(msg); else logger.warn(msg); } } if (!finalUrl) { throw new Error("url or fileUrl is required"); } // Determine MIME type from URL (the actual stored content, which may be converted) // E.g., if user uploaded foo.docx but it was converted to foo.md, MIME type should be text/markdown const mimeType = determineMimeTypeFromUrl(finalUrl, finalGcs, null); // IMPORTANT: Keep the original user-provided filename as displayFilename // Do NOT "correct" the extension based on MIME type // The user's original filename (e.g., "foo.docx") should be preserved even if the // stored content is a converted format (e.g., "foo.md") // This allows users to recognize their files by original name while tools // use the actual URL to determine content type for operation