@aj-archipelago/cortex
Version:
Cortex is a GraphQL API for AI. It provides a simple, extensible interface for using AI services from OpenAI, Azure and others.
1,218 lines (1,083 loc) • 109 kB
JavaScript
import logger from "./logger.js";
import stream from 'stream';
import os from 'os';
import http from 'http';
import https from 'https';
import { URL } from 'url';
import { v4 as uuidv4 } from 'uuid';
import { promisify } from 'util';
import { axios } from './requestExecutor.js';
import { config } from '../config.js';
import fs from 'fs';
import path from 'path';
import FormData from 'form-data';
import xxhash from 'xxhash-wasm';
import mime from 'mime-types';
import mimeDb from 'mime-db';
import { encrypt, decrypt } from './crypto.js';
// Promise-returning form of stream.pipeline, so stream copies can be awaited.
const pipeline = promisify(stream.pipeline);
// Base URL of the file handler service, read from the 'whisperMediaApiUrl' config key.
// The helpers below treat a falsy value as "file handler not configured".
const MEDIA_API_URL = config.get('whisperMediaApiUrl');
/**
 * Decide whether a string is a YouTube video URL.
 *
 * Accepted shapes:
 *  - youtube.com / www.youtube.com with path `/watch` and a non-empty `v` query parameter
 *  - youtube.com / www.youtube.com with a non-empty id after `/embed/` or `/shorts/`
 *  - youtu.be with a non-empty path
 *
 * @param {string} url - Candidate URL
 * @returns {boolean} True when the URL matches one of the shapes above; false for
 *   non-strings, empty strings, unparsable URLs and every other host/path
 */
export function isYoutubeUrl(url) {
  if (typeof url !== 'string' || url === '') {
    return false;
  }

  let parsed;
  try {
    parsed = new URL(url);
  } catch (err) {
    // Not a parsable absolute URL
    return false;
  }

  const { hostname, pathname, searchParams } = parsed;

  // Short links: anything after the leading '/' counts as the video id
  if (hostname === 'youtu.be') {
    return pathname.length > 1;
  }

  if (hostname !== 'youtube.com' && hostname !== 'www.youtube.com') {
    return false;
  }

  if (pathname === '/watch') {
    return Boolean(searchParams.get('v'));
  }

  // Path-based forms need at least one character after the prefix
  for (const prefix of ['/embed/', '/shorts/']) {
    if (pathname.startsWith(prefix)) {
      return pathname.length > prefix.length;
    }
  }

  return false;
}
// Module-level memo: the resolved xxhash API object, and the in-flight initialization (if any)
let xxhashInstance = null;
let xxhashInitPromise = null;
/**
 * Return the shared xxhash instance, initializing it on first use.
 * Callers that arrive while initialization is running await the same promise,
 * so `xxhash()` is invoked once per initialization attempt.
 * @returns {Promise<Object>} The object resolved by `xxhash()`
 */
async function getXXHashInstance() {
  if (!xxhashInstance && !xxhashInitPromise) {
    const initialize = async () => {
      try {
        xxhashInstance = await xxhash();
        return xxhashInstance;
      } finally {
        // Drop the in-flight marker whether we succeeded or failed,
        // so a failed attempt can be retried by a later call
        xxhashInitPromise = null;
      }
    };
    xxhashInitPromise = initialize();
  }
  return xxhashInstance ? xxhashInstance : await xxhashInitPromise;
}
/**
 * Stream a file from disk through a 64-bit xxhash hasher.
 * A fresh hasher is created per call so concurrent hashes do not share state.
 * @param {string} filePath - Path of the file to hash
 * @returns {Promise<string>} The digest rendered with `toString(16)`; rejects with the
 *   read stream's error if the file cannot be read
 */
async function computeFileHash(filePath) {
  const api = await getXXHashInstance();
  const fileHasher = api.create64();
  const fileStream = fs.createReadStream(filePath);
  return new Promise((resolve, reject) => {
    fileStream
      .on('data', (chunk) => {
        fileHasher.update(chunk);
      })
      .on('end', () => {
        resolve(fileHasher.digest().toString(16));
      })
      .on('error', (error) => {
        reject(error);
      });
  });
}
/**
 * Hash an in-memory buffer with a 64-bit xxhash hasher.
 * @param {Buffer} buffer - Bytes to hash
 * @returns {Promise<string>} The digest rendered with `toString(16)`
 */
async function computeBufferHash(buffer) {
  const api = await getXXHashInstance();
  // One hasher per call; nothing is shared between invocations
  const bufferHasher = api.create64();
  bufferHasher.update(buffer);
  const digest = bufferHasher.digest();
  return digest.toString(16);
}
/**
 * Ask the file handler to fetch a remote file.
 * Issues a GET to the file handler with `fetch`, `requestId` and, when given,
 * `contextId` / `save=true` as query parameters (60 second timeout).
 * @param {string} fileUrl - URL of the file to fetch
 * @param {string} requestId - Request ID for tracking
 * @param {string|null} contextId - Optional context ID; sent only when truthy
 * @param {boolean} save - When truthy, `save=true` is added to the request (default: false)
 * @returns {Promise<Object|Array>} The handler's response body
 * @throws {Error} If the file handler URL is unset (or the literal string 'null'), or
 *   the response body neither has a `url` nor is an array
 */
async function fetchFileFromUrl(fileUrl, requestId, contextId = null, save = false) {
  const handlerBase = MEDIA_API_URL;
  if (!handlerBase || handlerBase === 'null') {
    throw new Error('File handler URL is not configured');
  }

  // Optional parameters are only added when set, keeping the query string minimal
  const query = { fetch: fileUrl, requestId };
  if (contextId) {
    query.contextId = contextId;
  }
  if (save) {
    query.save = true;
  }

  const requestUrl = buildFileHandlerUrl(handlerBase, query);
  const response = await axios.get(requestUrl, { timeout: 60000 });

  const looksValid = Boolean(response.data?.url) || Array.isArray(response.data);
  if (!looksValid) {
    throw new Error("File handler did not return valid data");
  }
  return response.data;
}
/**
 * Append query parameters to a file handler URL.
 * Uses '&' when the base URL already contains a '?', otherwise '?'.
 * Keys and values are passed through encodeURIComponent; entries whose value is
 * null, undefined or the empty string are left out.
 * @param {string} baseUrl - Base file handler URL
 * @param {Object} params - Query parameters as key-value pairs
 * @returns {string} The base URL unchanged when no parameter survives filtering,
 *   otherwise the URL with the encoded query appended
 * @throws {Error} If baseUrl is falsy
 */
function buildFileHandlerUrl(baseUrl, params = {}) {
  if (!baseUrl) {
    throw new Error('baseUrl is required');
  }
  const joiner = baseUrl.includes('?') ? '&' : '?';

  const encodedPairs = Object.entries(params)
    .filter(([, value]) => value != null && value !== '')
    .map(([key, value]) => `${encodeURIComponent(key)}=${encodeURIComponent(value)}`);

  return encodedPairs.length === 0
    ? baseUrl
    : `${baseUrl}${joiner}${encodedPairs.join('&')}`;
}
/**
 * Best-effort removal of a temporary file or directory.
 * Never throws: a missing argument or missing path is logged as a warning, and any
 * filesystem error is logged as an error.
 * @param {string} path - File or directory to remove (directories are removed recursively)
 * @returns {Promise<void>}
 */
async function deleteTempPath(path) {
  try {
    if (!path) {
      logger.warn('Temporary path is not defined.');
      return;
    }

    const present = fs.existsSync(path);
    if (!present) {
      logger.warn(`Temporary path ${path} does not exist.`);
      return;
    }

    const entry = fs.statSync(path);
    if (entry.isDirectory()) {
      fs.rmSync(path, { recursive: true });
      logger.info(`Temporary folder ${path} and its contents deleted successfully.`);
    } else if (entry.isFile()) {
      fs.unlinkSync(path);
      logger.info(`Temporary file ${path} deleted successfully.`);
    }
  } catch (err) {
    logger.error(`Error occurred while deleting the temporary path: ${err}`);
  }
}
/**
 * Build a filename of the form `<uuid>.<extension>` using a freshly generated v4 UUID.
 * @param {string} extension - Extension to append (without the leading dot)
 * @returns {string} The generated filename
 */
function generateUniqueFilename(extension) {
  const uniqueId = uuidv4();
  return ''.concat(uniqueId, '.', extension);
}
/**
 * Download a file over HTTP(S) into the OS temp directory.
 *
 * The local filename is a generated unique name carrying the extension of the URL's
 * path ('bin' when the path has none). Redirects are not followed: any status other
 * than 200 is treated as a failure.
 *
 * @param {string} fileUrl - Absolute http: or https: URL to download
 * @returns {Promise<string>} Path of the downloaded temp file
 * @throws {TypeError} If fileUrl is not a valid URL
 * @throws {Error} If the request fails, the status is not 200, or writing the file fails
 */
const downloadFile = async (fileUrl) => {
  const parsedUrl = new URL(fileUrl);
  const fileExtension = path.extname(parsedUrl.pathname).slice(1) || 'bin';
  const localFilePath = path.join(os.tmpdir(), generateUniqueFilename(fileExtension));
  const protocol = parsedUrl.protocol === 'https:' ? https : http;

  try {
    // Adapt the callback-style request into a promise for the response stream.
    // A synchronous throw from get() (e.g. unsupported protocol) rejects this promise.
    const response = await new Promise((resolve, reject) => {
      protocol
        .get(parsedUrl, (res) => {
          if (res.statusCode === 200) {
            resolve(res);
            return;
          }
          // Drain the unwanted body so the socket is released instead of left hanging
          res.resume();
          reject(new Error(`HTTP request failed with status code ${res.statusCode}`));
        })
        .on('error', reject);
    });

    await pipeline(response, fs.createWriteStream(localFilePath));
    logger.info(`Downloaded file to ${localFilePath}`);
    return localFilePath;
  } catch (error) {
    // Remove any partial download. Cleanup failures (typically "file was never
    // created") are deliberately ignored so the original error is what surfaces.
    await fs.promises.unlink(localFilePath).catch(() => {});
    throw error;
  }
};
/**
 * Request the chunk list for a media file from the file handler.
 * When no file handler URL is configured, the file itself is returned as the only chunk.
 * @param {string} file - File URL or URI, sent as the `uri` query parameter
 * @param {string} requestId - Request ID for tracking
 * @param {string|null} contextId - Optional context ID; sent only when truthy
 * @returns {Promise<Array>} The handler's response body, or `[file]` when unconfigured
 * @throws Re-throws any request error after logging it
 */
async function getMediaChunks(file, requestId, contextId = null) {
  try {
    if (!MEDIA_API_URL) {
      logger.info(`No API_URL set, returning file as chunk`);
      return [file];
    }

    const query = { uri: file, requestId };
    if (contextId) {
      query.contextId = contextId;
    }

    // Chunking can be slow for large media, hence the 10 minute timeout
    const response = await axios.get(buildFileHandlerUrl(MEDIA_API_URL, query), { timeout: 600000 });
    return response.data;
  } catch (err) {
    logger.error(`Error getting media chunks list from api: ${err}`);
    throw err;
  }
}
/**
 * Tell the file handler that a request is finished by issuing a DELETE for it.
 * Failures are logged and swallowed; this never throws.
 * @param {string} requestId - Request ID to mark as completed
 * @param {string|null} contextId - Optional context ID; sent only when truthy
 * @returns {Promise<Object|null>} The handler's response body, or null when no file
 *   handler URL is configured or the call failed
 */
async function markCompletedForCleanUp(requestId, contextId = null) {
  try {
    if (!MEDIA_API_URL) {
      return null;
    }

    const query = { requestId };
    if (contextId) {
      query.contextId = contextId;
    }

    const response = await axios.delete(buildFileHandlerUrl(MEDIA_API_URL, query), { timeout: 15000 });
    logger.info(`Marked request ${requestId} as completed: ${JSON.stringify(response.data)}`);
    return response.data;
  } catch (err) {
    logger.error(`Error marking request ${requestId} as completed: ${err}`);
    return null;
  }
}
/**
 * Ask the file handler to delete a file identified by its hash.
 * Never throws: every failure path is logged and reported as `false`.
 * @param {string} hash - File hash to delete
 * @param {pathwayResolver} pathwayResolver - Accepted for signature compatibility; not used here
 * @param {string|null} contextId - Optional context ID; sent only when truthy
 * @returns {Promise<boolean>} True only when the handler answers 200; false for 404,
 *   any other status, request errors, a missing/non-string hash, or an unconfigured handler
 */
async function deleteFileByHash(hash, pathwayResolver = null, contextId = null) {
  if (typeof hash !== 'string' || hash === '') {
    logger.warn('deleteFileByHash: hash is required and must be a string');
    return false;
  }
  if (!MEDIA_API_URL) {
    logger.warn('deleteFileByHash: WHISPER_MEDIA_API_URL is not set, cannot delete file');
    return false;
  }

  try {
    const query = { hash };
    if (contextId) {
      query.contextId = contextId;
    }

    const response = await axios.delete(buildFileHandlerUrl(MEDIA_API_URL, query), {
      // Let 4xx answers through as ordinary responses so they can be inspected below
      validateStatus: (status) => status >= 200 && status < 500,
      timeout: 30000
    });

    switch (response.status) {
      case 200:
        logger.info(`Successfully deleted file with hash ${hash}`);
        return true;
      case 404:
        // Nothing to delete; not treated as an error
        logger.info(`File with hash ${hash} not found (may have already been deleted)`);
        return false;
      default:
        logger.warn(`Unexpected status ${response.status} when deleting file with hash ${hash}`);
        return false;
    }
  } catch (error) {
    const notFound = error?.response?.status === 404;
    if (notFound) {
      logger.info(`File with hash ${hash} not found during deletion (may have already been deleted)`);
    } else {
      // Deletion is best-effort: report the failure without propagating it
      const errorMsg = error?.message || String(error);
      logger.warn(`Error deleting file with hash ${hash}: ${errorMsg}`);
    }
    return false;
  }
}
/**
 * Extract file metadata from a single message content object.
 *
 * Recognized shapes:
 *  - `{ type: 'image_url', image_url: { url } }`  -> type 'image_url', url taken from image_url.url
 *  - `{ type: 'file', url }`                      -> type 'file'
 *  - `{ url }` with type 'image_url' or no type   -> type is the given type, or 'file' when absent
 *
 * displayFilename is intentionally not extracted here; it is set elsewhere
 * (on upload or by metadata/file collection tooling).
 *
 * @param {Object|null|undefined} contentObj - Parsed content item
 * @returns {Array<{url: string, gcs: (string|null), hash: (string|null), type: string}>}
 *   A single-element array when a file reference is found, otherwise an empty array
 *   (including for null, undefined and non-object input)
 */
function extractFileMetadataFromContent(contentObj) {
  // Parsed JSON can legitimately be null or a primitive; none of those carry a file
  if (!contentObj || typeof contentObj !== 'object') {
    return [];
  }

  const { type, url, image_url: imageUrl, gcs, hash } = contentObj;

  let fileUrl = null;
  let fileType = null;
  if (type === 'image_url' && imageUrl?.url) {
    fileUrl = imageUrl.url;
    fileType = 'image_url';
  } else if (url && (type === 'file' || type === 'image_url' || !type)) {
    // Direct URL objects: typed entries keep their type, untyped ones are plain files
    fileUrl = url;
    fileType = type || 'file';
  }

  if (!fileUrl) {
    return [];
  }
  return [{
    url: fileUrl,
    gcs: gcs || null,
    hash: hash || null,
    type: fileType
  }];
}
// How long a cached collection is considered fresh, in milliseconds (5 seconds)
const CACHE_TTL = 5 * 1000;
// Short-lived in-memory cache of parsed file collections.
// Key: collection cache key; value: { rawFiles: Array<parsed file data>, timestamp: number }.
// The raw (unfiltered) files are stored so different filters can reuse one entry.
const fileCollectionCache = new Map();
// Singleton Redis client for file collection operations
let redisClientSingleton = null;
// In-flight client creation, shared by concurrent callers so only one client is ever built
let redisClientInitPromise = null;
/**
 * Get the shared Redis client used for direct hash map access, creating it on first use.
 *
 * Concurrent callers that arrive while the client is being created await the same
 * initialization instead of each opening a connection of their own.
 *
 * @returns {Promise<Object|null>} The ioredis client, or null when no
 *   'storageConnectionString' is configured or the client could not be created
 *   (the failure is logged and a later call will retry)
 */
async function getRedisClient() {
  if (redisClientSingleton) {
    return redisClientSingleton;
  }
  if (redisClientInitPromise) {
    return redisClientInitPromise;
  }

  let connectionString;
  try {
    connectionString = config.get('storageConnectionString');
  } catch (e) {
    logger.error(`Failed to read Redis connection string in fileUtils: ${e?.message || e}`);
    return null;
  }
  if (!connectionString) {
    return null;
  }

  redisClientInitPromise = (async () => {
    try {
      // ioredis is loaded lazily so it is only required when Redis is actually configured
      const Redis = (await import('ioredis')).default;
      const client = new Redis(connectionString, {
        maxRetriesPerRequest: null,
        enableReadyCheck: true,
        lazyConnect: false,
        connectTimeout: 10000,
      });
      // Without an 'error' listener, connection errors would surface as unhandled events
      client.on('error', (error) => {
        logger.error(`Redis client error in fileUtils: ${error}`);
      });
      redisClientSingleton = client;
      return client;
    } catch (e) {
      logger.error(`Failed to initialize Redis client in fileUtils: ${e?.message || e}`);
      return null;
    } finally {
      // Always clear the in-flight marker so a failed attempt can be retried
      redisClientInitPromise = null;
    }
  })();
  return redisClientInitPromise;
}
/**
 * Build the in-memory cache key for a context's file collection.
 * The `<contextId>-fileCollection-<contextKey|default>` layout is a legacy format
 * kept so existing cache entries keep matching.
 * @param {string} contextId - Context ID for the file collection
 * @param {string|null} contextKey - Optional context key; 'default' is used when falsy
 * @returns {string} Cache key
 */
function getCollectionCacheKey(contextId, contextKey) {
  const keySuffix = contextKey || 'default';
  return `${contextId}-fileCollection-${keySuffix}`;
}
/**
 * Drop the cached file collection for a context, if one is cached.
 * The contextKey must match the one used when the entry was cached, because it is
 * part of the cache key.
 * @param {string} contextId - Context ID for the file collection
 * @param {string} contextKey - Optional context key for encryption
 */
export function invalidateFileCollectionCache(contextId, contextKey = null) {
  fileCollectionCache.delete(getCollectionCacheKey(contextId, contextKey));
}
/**
 * Collect file metadata referenced anywhere in a chat history.
 *
 * Each message's `content` may be an array of items (objects or JSON strings),
 * a single JSON string, or a single object. Strings that are not valid JSON are
 * skipped silently, since plain text messages are expected here.
 *
 * @param {Array} chatHistory - Chat history to scan
 * @returns {Array} File metadata objects in encounter order; empty when chatHistory
 *   is not an array
 */
function extractFilesFromChatHistory(chatHistory) {
  if (!Array.isArray(chatHistory)) {
    return [];
  }

  const found = [];

  // Parse (when given a string) and extract, ignoring items that fail either step
  const collectLenient = (item) => {
    try {
      const parsed = typeof item === 'string' ? JSON.parse(item) : item;
      found.push(...extractFileMetadataFromContent(parsed));
    } catch (e) {
      // Not JSON or couldn't be parsed, skip this item
    }
  };

  for (const message of chatHistory) {
    const content = message?.content;
    if (!content) {
      continue;
    }

    if (Array.isArray(content)) {
      for (const item of content) {
        collectLenient(item);
      }
    } else if (typeof content === 'string') {
      collectLenient(content);
    } else if (typeof content === 'object') {
      found.push(...extractFileMetadataFromContent(content));
    }
  }

  return found;
}
/**
 * Decide whether a file belongs to the collection visible from a given chat.
 *
 * - `true` (legacy boolean form) means global: always included.
 * - A non-empty array is included when it contains '*' (global), or, when a chatId
 *   is given, when it contains that chatId.
 * - Everything else (undefined, null, false, empty array, unknown formats) is excluded.
 *
 * @param {boolean|Array<string>|undefined} inCollection - inCollection metadata value
 * @param {string|null} chatId - Optional chat ID; when null only global files match
 * @returns {boolean} True if the file should be included
 */
function isFileInCollection(inCollection, chatId = null) {
  // Backward compatibility: boolean true means global
  if (inCollection === true) {
    return true;
  }
  // Anything that is not a non-empty array carries no membership
  if (!Array.isArray(inCollection) || inCollection.length === 0) {
    return false;
  }
  if (inCollection.includes('*')) {
    return true;
  }
  return chatId !== null && inCollection.includes(chatId);
}
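/*
 * NOTE(review): orphaned doc comment - it is not attached to any function.
 * It documents a positional (contextId, contextKey, useCache, chatId) signature, which
 * does not match loadFileCollection(agentContext, options) defined later in this file.
 * Confirm whether it is stale and remove it if so. Original text follows.
 *
 * Load file collection from memory system or cache
 * @param {string} contextId - Context ID for the file collection
 * @param {string} contextKey - Optional context key for encryption
 * @param {boolean} useCache - Whether to check cache first (default: true)
 * @param {string|null} chatId - Optional chat ID to filter files by (if provided, only includes files with '*' or this chatId in inCollection)
 * @returns {Promise<Array>} File collection array
 */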
/**
 * Store one file record in a Redis hash map, encrypting `tags` and `notes` first
 * when a context key is supplied.
 *
 * Encryption is best-effort: empty values are not encrypted, and if encrypting a
 * field fails (throws or yields null) that field is stored unencrypted. The input
 * object is not modified.
 *
 * @param {Object} redisClient - Redis client (must provide `hset`)
 * @param {string} contextMapKey - Redis hash map key
 * @param {string} hash - File hash (field name within the hash map)
 * @param {Object} fileData - File data object
 * @param {string} contextKey - Optional context key for encryption
 * @returns {Promise<void>}
 */
async function writeFileDataToRedis(redisClient, contextMapKey, hash, fileData, contextKey = null) {
  const payload = { ...fileData };
  const shouldEncrypt = contextKey && contextKey.trim() !== '';

  if (shouldEncrypt) {
    const { tags, notes } = payload;

    // tags: array of strings, serialized to JSON before encryption
    if (Array.isArray(tags) && tags.length > 0) {
      try {
        const cipherText = encrypt(JSON.stringify(tags), contextKey);
        if (cipherText !== null) {
          payload.tags = cipherText;
        }
      } catch (error) {
        logger.warn(`Failed to encrypt tags: ${error.message}`);
      }
    }

    // notes: free-form string
    if (typeof notes === 'string' && notes.trim() !== '') {
      try {
        const cipherText = encrypt(notes, contextKey);
        if (cipherText !== null) {
          payload.notes = cipherText;
        }
      } catch (error) {
        logger.warn(`Failed to encrypt notes: ${error.message}`);
      }
    }
  }

  await redisClient.hset(contextMapKey, hash, JSON.stringify(payload));
}
/**
 * Parse one file record read from Redis and, when a context key is supplied,
 * decrypt its `tags` and `notes` fields.
 *
 * Decryption is tolerant of legacy unencrypted data: a string containing ':' is
 * treated as possibly encrypted; if decrypting it fails or yields null, the stored
 * value is kept as-is. With a context key, missing tags become `[]` and missing
 * notes become `''`. Without one, the parsed record is returned untouched.
 *
 * @param {string} dataStr - JSON string from Redis
 * @param {string} contextKey - Optional context key for decryption
 * @returns {Object|null} Parsed (and decrypted) file data, or null when the input is
 *   empty, is not valid JSON, or cannot be processed
 */
function readFileDataFromRedis(dataStr, contextKey = null) {
  if (!dataStr) return null;

  // Interpret text as a JSON tag list; text that is not JSON becomes a single tag
  const toTagList = (text) => {
    try {
      return JSON.parse(text);
    } catch (e) {
      return [text];
    }
  };

  try {
    const record = JSON.parse(dataStr);
    const shouldDecrypt = contextKey && contextKey.trim() !== '';
    if (!shouldDecrypt) {
      return record;
    }

    // --- tags ---
    const storedTags = record.tags;
    if (storedTags == null) {
      record.tags = [];
    } else if (typeof storedTags === 'string') {
      if (storedTags.includes(':')) {
        // ':' suggests an encrypted value
        try {
          const plainTags = decrypt(storedTags, contextKey);
          if (plainTags !== null) {
            record.tags = toTagList(plainTags);
          }
          // A null result leaves the stored value alone (may be unencrypted legacy data)
        } catch (error) {
          // Decryption failed, keep as-is (unencrypted legacy data)
        }
      } else {
        // No ':' means not encrypted
        record.tags = toTagList(storedTags);
      }
    }
    // Non-string tags (e.g. an array) are unencrypted legacy data and stay untouched

    // --- notes ---
    const storedNotes = record.notes;
    if (storedNotes == null) {
      record.notes = '';
    } else if (typeof storedNotes === 'string' && storedNotes.includes(':')) {
      try {
        const plainNotes = decrypt(storedNotes, contextKey);
        if (plainNotes !== null) {
          record.notes = plainNotes;
        }
      } catch (error) {
        // Decryption failed, keep as-is (unencrypted legacy data)
      }
    }

    return record;
  } catch (e) {
    return null;
  }
}
/**
 * Turn a raw Redis hash map (hash -> JSON string) into normalized file objects.
 * No collection filtering happens here; `inCollection` is carried through so callers
 * can filter later. Entries that cannot be read are dropped.
 *
 * When a record has a `converted` block, its url/gcs/mimeType take precedence over
 * the top-level ones, while displayFilename stays the original name.
 *
 * @param {Object} allFiles - Redis HGETALL result {hash: fileDataStr}
 * @param {string} contextKey - Optional context key for decryption
 * @returns {Array} Array of parsed file data objects (includes inCollection metadata)
 */
function parseRawFileData(allFiles, contextKey = null) {
  const parsedFiles = [];

  for (const [hash, serialized] of Object.entries(allFiles)) {
    const record = readFileDataFromRedis(serialized, contextKey);
    if (!record) {
      continue;
    }

    const { converted } = record;
    parsedFiles.push({
      // Records without an id get a generated one (timestamp plus random suffix)
      id: record.id || `${Date.now()}-${Math.random().toString(36).substring(2, 9)}`,
      url: converted?.url || record.url,
      gcs: converted?.gcs || record.gcs || null,
      displayFilename: record.displayFilename || record.filename || null,
      mimeType: converted?.mimeType || record.mimeType || null,
      tags: record.tags || [],
      notes: record.notes || '',
      hash: hash,
      permanent: record.permanent || false,
      addedDate: record.addedDate || record.timestamp || new Date().toISOString(),
      lastAccessed: record.lastAccessed || record.timestamp || new Date().toISOString(),
      // Flag converted files (used for edit prevention)
      ...(converted && { _isConverted: true }),
      inCollection: record.inCollection
    });
  }

  return parsedFiles;
}
/**
 * Order files from most to least recently accessed.
 * Each file is dated by `lastAccessed`, falling back to `addedDate`, then to the epoch.
 * The array is sorted in place and the same array is returned.
 * @param {Array} files - Array of file objects
 * @returns {Array} The input array, sorted
 */
function sortFilesByLastAccessed(files) {
  const accessDate = (file) => new Date(file.lastAccessed || file.addedDate || 0);
  files.sort((left, right) => accessDate(right) - accessDate(left));
  return files;
}
/**
 * Load the file collection for one or more contexts.
 *
 * Steps: normalize the context argument, read each context's files (from the
 * short-lived cache when fresh and non-empty, otherwise from Redis), drop duplicates
 * across contexts, filter by chat, and sort by last access.
 *
 * @param {Array|Object|string} agentContext - Context(s) to load from:
 *   - Array of {contextId, contextKey, default} objects (entries without contextId are ignored)
 *   - Single {contextId, contextKey, default} object
 *   - String contextId (contextKey will be null)
 * @param {Object} options - Load options
 * @param {Array<string>|string|null} options.chatIds - Chat IDs to filter by:
 *   - null/undefined/blank/empty: no chat filtering; files are returned regardless of
 *     chat membership (only unrecognized inCollection formats are dropped)
 *   - otherwise: only files whose inCollection matches '*' or one of these chatIds
 * @param {boolean} options.useCache - Whether to use the cache (default: true)
 * @returns {Promise<Array>} Files tagged with `_contextId`, deduplicated by hash/url/gcs
 *   (first context wins) and sorted by lastAccessed; empty when no usable context is given
 */
async function loadFileCollection(agentContext, options = {}) {
  // --- Normalize agentContext to a list of context objects ---
  let contexts = [];
  if (typeof agentContext === 'string') {
    contexts = [{ contextId: agentContext, contextKey: null, default: true }];
  } else if (Array.isArray(agentContext)) {
    contexts = agentContext.filter((ctx) => ctx && ctx.contextId);
  } else if (agentContext && typeof agentContext === 'object' && agentContext.contextId) {
    contexts = [agentContext];
  }
  if (contexts.length === 0) {
    return [];
  }

  // --- Normalize options ---
  const useCache = options.useCache !== false;
  const requestedChatIds = options.chatIds;
  let chatIds = null; // null means "no chat filtering"
  if (typeof requestedChatIds === 'string') {
    if (requestedChatIds.trim()) {
      chatIds = [requestedChatIds];
    }
  } else if (Array.isArray(requestedChatIds)) {
    const usable = requestedChatIds.filter((id) => id && typeof id === 'string' && id.trim());
    if (usable.length > 0) {
      chatIds = usable;
    }
  }

  // Read one context's raw files: fresh cache entry first, Redis otherwise.
  // An empty cached list is not trusted and triggers a Redis read.
  const readContextFiles = async (contextId, contextKey) => {
    const cacheKey = getCollectionCacheKey(contextId, contextKey);
    let rawFiles = [];

    if (useCache) {
      const cached = fileCollectionCache.get(cacheKey);
      if (cached && Date.now() - cached.timestamp < CACHE_TTL) {
        rawFiles = cached.rawFiles;
      }
    }
    if (rawFiles.length > 0) {
      return rawFiles;
    }

    try {
      const redisClient = await getRedisClient();
      if (!redisClient) {
        return rawFiles;
      }
      const filesData = await redisClient.hgetall(`FileStoreMap:ctx:${contextId}`);
      rawFiles = parseRawFileData(filesData, contextKey);
      if (useCache) {
        fileCollectionCache.set(cacheKey, { rawFiles, timestamp: Date.now() });
      }
      return rawFiles;
    } catch (e) {
      // Collection doesn't exist yet or error reading
      return [];
    }
  };

  // --- Load contexts one after another so earlier contexts keep priority ---
  const allFiles = [];
  for (const ctx of contexts) {
    const rawFiles = await readContextFiles(ctx.contextId, ctx.contextKey || null);
    for (const file of rawFiles) {
      allFiles.push({ ...file, _contextId: ctx.contextId });
    }
  }

  // --- Deduplicate: a file is a duplicate if its hash, url or gcs was already seen ---
  const identityFields = ['hash', 'url', 'gcs'];
  const seen = { hash: new Set(), url: new Set(), gcs: new Set() };
  const deduped = allFiles.filter((file) => {
    const duplicate = identityFields.some((field) => file[field] && seen[field].has(file[field]));
    if (duplicate) {
      return false;
    }
    for (const field of identityFields) {
      if (file[field]) {
        seen[field].add(file[field]);
      }
    }
    return true;
  });

  // --- Filter ---
  let filtered;
  if (chatIds !== null) {
    // Keep files visible from at least one of the requested chats
    filtered = deduped.filter((file) => chatIds.some((chatId) => isFileInCollection(file.inCollection, chatId)));
  } else {
    // No chat filter: keep unset values (undefined/null), boolean true and any array
    // (empty or not); drop other, unrecognized inCollection formats
    filtered = deduped.filter(({ inCollection }) => (
      inCollection == null || inCollection === true || Array.isArray(inCollection)
    ));
  }

  return sortFilesByLastAccessed(filtered);
}
/**
 * Bring an inCollection value into its canonical form.
 *
 * - false, null, undefined and `[]` all mean "not in a collection" -> undefined
 * - a non-empty array is returned as-is (same reference)
 * - `true` (legacy boolean form) and any unrecognized value -> ['*'] (global)
 *
 * @param {boolean|Array<string>|undefined} inCollection - inCollection value to normalize
 * @returns {Array<string>|undefined} Normalized array, or undefined when not in a collection
 */
function normalizeInCollection(inCollection) {
  if (inCollection == null || inCollection === false) {
    return undefined;
  }
  if (Array.isArray(inCollection)) {
    return inCollection.length === 0 ? undefined : inCollection;
  }
  // Boolean true, and any unknown format, is treated as global
  return ['*'];
}
/**
 * Produce the inCollection value a file should get for a given chat.
 * Single place for this decision so all file operations agree.
 * @param {string|null|undefined} chatId - Optional chat ID
 * @returns {Array<string>} `[chatId]` for a non-blank string chatId, otherwise ['*'] (global)
 */
function getInCollectionValue(chatId = null) {
  const isUsableChatId = typeof chatId === 'string' && chatId.trim() !== '';
  return isUsableChatId ? [chatId] : ['*'];
}
/**
 * Add a chatId to an existing inCollection value (reference counting).
 * If the chatId is already present, the array is returned unchanged.
 *
 * IMPORTANT: inCollection is either ['*'] (global) OR [chatId, ...] (chat-scoped), never mixed.
 * A global value stays global - no chatIds are added to it. The legacy boolean `true`
 * also means global and is returned in its array form ['*'].
 *
 * @param {Array<string>|boolean|undefined} existingInCollection - Current inCollection value;
 *   anything other than an array or `true` is treated as "no membership yet"
 * @param {string|null} chatId - Chat ID to add
 * @returns {Array<string>} Updated inCollection array; ['*'] when there is neither an
 *   existing membership nor a usable chatId
 */
function addChatIdToInCollection(existingInCollection, chatId) {
  // Legacy boolean form: true means global. Without this, a global file would be
  // silently narrowed to a single chat.
  if (existingInCollection === true) {
    return ['*'];
  }

  const existing = Array.isArray(existingInCollection) ? existingInCollection : [];

  // If already global, stay global
  if (existing.includes('*')) {
    return existing;
  }

  const hasChatId = typeof chatId === 'string' && chatId.trim() !== '';
  if (!hasChatId) {
    // Nothing to add: keep what is there, or default to global
    return existing.length > 0 ? existing : ['*'];
  }

  return existing.includes(chatId) ? existing : [...existing, chatId];
}
/**
 * Remove a chatId from an inCollection value (reference counting).
 *
 * IMPORTANT: Global files are not reference-counted - they are returned unchanged.
 * That covers both ['*'] and the legacy boolean `true`, which is returned in its
 * array form ['*']. Only chat-scoped files have chatIds removed; removing a global
 * file from the collection is a full delete handled by the caller.
 *
 * @param {Array<string>|boolean|undefined} existingInCollection - Current inCollection value;
 *   anything other than an array or `true` is treated as empty
 * @param {string|null} chatId - Chat ID to remove
 * @returns {Array<string>} Updated inCollection array (may be empty for chat-scoped files)
 */
function removeChatIdFromInCollection(existingInCollection, chatId) {
  // Legacy boolean form: true means global. Without this, a global file would come
  // back as [] and look like it has no remaining references.
  if (existingInCollection === true) {
    return ['*'];
  }

  const existing = Array.isArray(existingInCollection) ? existingInCollection : [];

  // No usable chatId: nothing can be removed
  const hasChatId = typeof chatId === 'string' && chatId.trim() !== '';
  if (!hasChatId) {
    return existing;
  }

  // Global files aren't scoped to chats, so there is no per-chat reference to drop
  if (existing.includes('*')) {
    return existing;
  }

  return existing.filter((id) => id !== chatId);
}
/**
* Update file metadata in Redis hash map (direct atomic operation)
* @param {string} contextId - Context ID
* @param {string} hash - File hash
* @param {Object} metadata - Metadata to update (displayFilename, id, tags, notes, mimeType, addedDate, lastAccessed, permanent, inCollection)
* @param {string} contextKey - Optional context key for encryption
* @param {string|null} chatId - Optional chat ID, used as default for inCollection if not provided in metadata and not already set
* Note: Does NOT update CFH core fields (url, gcs, hash, filename) - those are managed by CFH
* @returns {Promise<boolean>} True if successful
*/
async function updateFileMetadata(contextId, hash, metadata, contextKey = null, chatId = null) {
  if (!contextId || !hash) {
    return false;
  }

  // Fields callers are allowed to overwrite. Everything else on the stored
  // entry (url, gcs, hash, filename, ...) is carried over untouched.
  const updatableFields = [
    'displayFilename',
    'id',
    'tags',
    'notes',
    'mimeType',
    'addedDate',
    'lastAccessed',
    'permanent',
  ];

  try {
    const redisClient = await getRedisClient();
    if (!redisClient) {
      return false;
    }

    const contextMapKey = `FileStoreMap:ctx:${contextId}`;

    // The entry must already exist in this context; this function never
    // creates new hash-map entries.
    const existingDataStr = await redisClient.hget(contextMapKey, hash);
    if (!existingDataStr) {
      return false;
    }

    const existingData = readFileDataFromRedis(existingDataStr, contextKey) || {};

    // Start from the stored entry so every field not listed above is preserved.
    const fileData = { ...existingData };

    // inCollection precedence:
    //   1. an explicit value in `metadata` (normalized),
    //   2. the value already stored (normalized),
    //   3. a default derived from `chatId`.
    let inCollection;
    if (metadata.inCollection !== undefined) {
      inCollection = normalizeInCollection(metadata.inCollection);
    } else if (existingData.inCollection !== undefined) {
      inCollection = normalizeInCollection(existingData.inCollection);
    } else {
      inCollection = getInCollectionValue(chatId);
    }

    // An undefined result means "not in any collection": the key is removed
    // from the stored entry rather than written as undefined.
    if (inCollection === undefined) {
      delete fileData.inCollection;
    } else {
      fileData.inCollection = inCollection;
    }

    // Copy only the fields the caller actually supplied.
    for (const field of updatableFields) {
      if (metadata[field] !== undefined) {
        fileData[field] = metadata[field];
      }
    }

    // Write the merged entry back to the hash map - encryption happens in helper
    await writeFileDataToRedis(redisClient, contextMapKey, hash, fileData, contextKey);

    // Invalidate cache (use contextKey to match the correct cache key)
    invalidateFileCollectionCache(contextId, contextKey);
    return true;
  } catch (e) {
    // Use the module-level logger. Re-importing it here could itself reject,
    // which would turn a handled failure into an unhandled one.
    logger.warn(`Failed to update file metadata: ${e.message}`);
    return false;
  }
}
/**
* Save file collection to memory system
* Only updates files that have changed (optimized)
* @param {string} contextId - Context ID for the file collection
* @param {string} contextKey - Optional context key; forwarded to the Redis read/write helpers and used to build the cache key
* @param {Array} collection - File collection array
* @param {string|null} chatId - Optional chat ID, used for inCollection value (chat-scoped if provided, global if not)
* @returns {Promise<boolean>} True if successful
*/
async function saveFileCollection(contextId, contextKey, collection, chatId = null) {
  const cacheKey = getCollectionCacheKey(contextId, contextKey);
  try {
    const redisClient = await getRedisClient();
    if (!redisClient) {
      return false;
    }

    const contextMapKey = `FileStoreMap:ctx:${contextId}`;

    // Snapshot of the whole hash map, used to skip entries whose metadata
    // has not changed.
    const currentFiles = await redisClient.hgetall(contextMapKey);

    for (const file of collection) {
      // Fall back to a hash of the URL string for files added without a hash.
      let fileHash = file.hash;
      if (!fileHash && file.url) {
        fileHash = await computeBufferHash(Buffer.from(file.url));
      }
      if (!fileHash) continue;

      // Decode the stored entry once; the result serves both the change
      // check and the merge below.
      const currentDataStr = currentFiles[fileHash];
      const currentData = currentDataStr
        ? readFileDataFromRedis(currentDataStr, contextKey)
        : null;

      // Compare metadata fields only (url, gcs, timestamp are ignored).
      // NOTE(review): displayFilename and chat membership (inCollection) are
      // not part of this comparison, so a change limited to those is not
      // written - confirm that is intended.
      const isUnchanged = Boolean(currentData) &&
        currentData.id === file.id &&
        JSON.stringify(currentData.tags || []) === JSON.stringify(file.tags || []) &&
        currentData.notes === (file.notes || '') &&
        currentData.mimeType === (file.mimeType || null) &&
        currentData.permanent === (file.permanent || false);
      if (isUnchanged) continue;

      // New entry, or an entry that could not be decoded: start from scratch.
      const existingData = currentData || {};

      const fileData = {
        ...existingData, // Preserve everything already stored (filename, etc.)
        id: file.id,
        url: file.url || existingData.url,
        gcs: file.gcs || existingData.gcs || null,
        // The stored filename is left alone; only displayFilename is updated.
        displayFilename: file.displayFilename !== undefined
          ? file.displayFilename
          : (existingData.displayFilename || null),
        tags: file.tags || [],
        notes: file.notes || '',
        mimeType: file.mimeType || existingData.mimeType || null,
        addedDate: file.addedDate || existingData.timestamp || new Date().toISOString(),
        lastAccessed: file.lastAccessed || new Date().toISOString(),
        permanent: file.permanent !== undefined
          ? file.permanent
          : (existingData.permanent || false),
        // Membership is a list of chat IDs: extend an existing list with this
        // chat, or create the initial value from chatId.
        inCollection: existingData.inCollection
          ? addChatIdToInCollection(existingData.inCollection, chatId)
          : getInCollectionValue(chatId)
      };

      // Write back to the hash map - encryption happens in helper
      await writeFileDataToRedis(redisClient, contextMapKey, fileHash, fileData, contextKey);
    }

    // Entries missing from `collection` are intentionally NOT removed from
    // the hash map here.
    // Drop the cached collection so the next load reads fresh Redis data.
    fileCollectionCache.delete(cacheKey);
    return true;
  } catch (e) {
    // Use the module-level logger. Re-importing it here could itself reject,
    // which would turn a handled failure into an unhandled one.
    logger.warn(`Failed to save file collection: ${e.message}`);
    return false;
  }
}
/**
* Add a file to the file collection
* If fileUrl is provided and url is not already a cloud storage URL, the file at fileUrl is uploaded first
* @param {string} contextId - Context ID for the file collection
* @param {string} contextKey - Optional context key for encryption
* @param {string} url - Cloud storage URL (Azure URL) - if fileUrl is provided, this can be null
* @param {string} gcs - Optional Google Cloud Storage URL
* @param {string} filename - Filename or title for the file
* @param {Array<string>} tags - Optional array of tags
* @param {string} notes - Optional notes or description
* @param {string} hash - Optional file hash
* @param {string} fileUrl - Optional: URL of file to upload (if not already in cloud storage)
* @param {pathwayResolver} pathwayResolver - Optional pathway resolver for logging
* @param {boolean} permanent - If true, file is stored with permanent retention
* @param {string|null} chatId - Optional chat ID, used for inCollection value (chat-scoped if provided, global if not)
* @returns {Promise<Object>} File entry object with id
*/
async function addFileToCollection(contextId, contextKey, url, gcs, filename, tags = [], notes = '', hash = null, fileUrl = null, pathwayResolver = null, permanent = false, chatId = null) {
if (!contextId || !filename) {
throw new Error("contextId and filename are required");
}
// If permanent=true, set retention=permanent to keep file forever
const desiredRetention = permanent ? 'permanent' : 'temporary';
// YouTube URLs should not be added to the file collection (they are never uploaded to CFH)
// They can be used directly in analyzer tools without being in the collection
if (fileUrl && isYoutubeUrl(fileUrl)) {
throw new Error("YouTube URLs cannot be added to the file collection. Use the YouTube URL directly with analyzer tools instead.");
}
if (url && isYoutubeUrl(url)) {
throw new Error("YouTube URLs cannot be added to the file collection. Use the YouTube URL directly with analyzer tools instead.");
}
// If fileUrl is provided and url is not already a cloud URL, upload the file first
let finalUrl = url;
let finalGcs = gcs;
let finalHash = hash;
if (fileUrl && (!url || (!url.includes('blob.core.windows.net') && !url.includes('storage.googleapis.com')))) {
// Upload the file from the URL
// uploadFileToCloud will download it, compute hash, check if it exists, and upload if needed
// It uploads the local file stream, not the URL, to avoid triggering remoteFile fetch
const uploadResult = await uploadFileToCloud(fileUrl, null, filename, pathwayResolver, contextId);
finalUrl = uploadResult.url;
finalGcs = uploadResult.gcs;
finalHash = uploadResult.hash || hash;
}
// If the caller asked for permanence/privacy and we have a hash, update retention (best-effort)
if (finalHash && desiredRetention === 'permanent') {
try {
await setRetentionForHash(finalHash, desiredRetention, contextId, pathwayResolver);
} catch (e) {
const msg = `Failed to set retention=${desiredRetention} for hash ${finalHash}: ${e?.message || String(e)}`;
if (pathwayResolver?.logWarning) pathwayResolver.logWarning(msg);
else logger.warn(msg);
}
}
if (!finalUrl) {
throw new Error("url or fileUrl is required");
}
// Determine MIME type from URL (the actual stored content, which may be converted)
// E.g., if user uploaded foo.docx but it was converted to foo.md, MIME type should be text/markdown
const mimeType = determineMimeTypeFromUrl(finalUrl, finalGcs, null);
// IMPORTANT: Keep the original user-provided filename as displayFilename
// Do NOT "correct" the extension based on MIME type
// The user's original filename (e.g., "foo.docx") should be preserved even if the
// stored content is a converted format (e.g., "foo.md")
// This allows users to recognize their files by original name while tools
// use the actual URL to determine content type for operation