@aj-archipelago/cortex

Cortex is a GraphQL API for AI. It provides a simple, extensible interface for using AI services from OpenAI, Azure and others.

import fs from "fs";
import os from "os";
import path from "path";
import { v4 as uuidv4 } from "uuid";
import mime from "mime-types";
import { DOC_EXTENSIONS, AZURITE_ACCOUNT_NAME } from "./constants.js";
import { easyChunker } from "./docHelper.js";
import { downloadFile, splitMediaFile } from "./fileChunker.js";
import { ensureEncoded, ensureFileExtension, urlExists } from "./helper.js";
import {
  cleanupRedisFileStoreMap,
  getFileStoreMap,
  publishRequestProgress,
  removeFromFileStoreMap,
  setFileStoreMap,
  cleanupRedisFileStoreMapAge,
} from "./redis.js";
import { FileConversionService } from "./services/FileConversionService.js";
import { StorageService } from "./services/storage/StorageService.js";
import { uploadBlob, getMimeTypeFromUrl } from "./blobHandler.js";
import { generateShortId } from "./utils/filenameUtils.js";
import {
  redactContextId,
  redactSasToken,
  sanitizeForLogging,
} from "./utils/logSecurity.js";

// Hybrid cleanup approach:
// 1. Lazy cleanup: Check file existence when cache entries are accessed (in getFileStoreMap)
// 2. Age cleanup: Remove old entries every 100 requests to prevent cache bloat
let requestCount = 0;

/**
 * Lightweight age-based cleanup - removes old cache entries to prevent bloat.
 * Only removes entries older than 7 days and only checks a small sample.
 * Runs every 100 requests to avoid performance impact.
 */
async function cleanupInactive(context) {
  try {
    // Only run age cleanup every 100 requests to avoid overhead
    requestCount++;
    if (requestCount % 100 === 0) {
      const cleaned = await cleanupRedisFileStoreMapAge(7, 10); // 7 days, max 10 entries
      if (cleaned.length > 0) {
        context.log(`Age cleanup: Removed ${cleaned.length} old cache entries`);
      }
    }
  } catch (error) {
    console.log("Error occurred during age-based cleanup:", error);
  }
}

async function CortexFileHandler(context, req) {
  // Parse body if it's a string (Azure Functions sometimes doesn't auto-parse DELETE bodies)
  let parsedBody = req.body;
  if (typeof req.body === "string" && req.body.length > 0) {
    try {
      parsedBody = JSON.parse(req.body);
    } catch (e) {
      // If parsing fails, treat as empty object
      parsedBody = {};
    }
  }

  // For GET requests, prioritize query string. For other methods, check body first, then query.
  // Also check that parsedBody actually has content (not just an empty object).
  const hasBodyContent =
    parsedBody &&
    typeof parsedBody === "object" &&
    Object.keys(parsedBody).length > 0;
  const bodySource = hasBodyContent ? parsedBody.params || parsedBody : {};
  const querySource = req.query || {};

  // Merge sources: for GET, query takes priority; for others, body takes priority
  const isGet = req.method?.toLowerCase() === "get";
  const source = isGet
    ? { ...bodySource, ...querySource }
    : { ...querySource, ...bodySource };
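  // Merge-precedence example (hypothetical values): for
  //   GET /api/CortexFileHandler?hash=abc&checkHash=true
  // arriving with body { params: { hash: "old" } }, the spread order above gives
  //   source = { ...{ hash: "old" }, ...{ hash: "abc", checkHash: "true" } }
  // so the query-string value "abc" wins; for POST/DELETE the order flips and
  // the body value would win instead.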
  const {
    uri,
    requestId,
    save,
    hash,
    checkHash,
    clearHash,
    shortLivedMinutes,
    fetch,
    load,
    restore,
    setRetention,
    contextId,
  } = source;

  // Container parameter is ignored - always uses default container from env var
  const resolvedContextId = contextId || null;

  // Normalize boolean parameters
  const shouldSave = save === true || save === "true";
  const shouldCheckHash = checkHash === true || checkHash === "true";
  const shouldClearHash = clearHash === true || clearHash === "true";
  const shortLivedDuration = parseInt(shortLivedMinutes) || 5; // Default to 5 minutes
  const shouldFetchRemote = fetch || load || restore;
  const shouldSetRetention =
    setRetention === true ||
    setRetention === "true" ||
    req.query?.operation === "setRetention" ||
    parsedBody?.operation === "setRetention";

  const operation = shouldSave
    ? "save"
    : shouldCheckHash
      ? "checkHash"
      : shouldClearHash
        ? "clearHash"
        : shouldSetRetention
          ? "setRetention"
          : shouldFetchRemote
            ? "remoteFile"
            : req.method.toLowerCase() === "delete" ||
                req.query?.operation === "delete" ||
                parsedBody?.operation === "delete"
              ? "delete"
              : uri
                ? DOC_EXTENSIONS.some((ext) => uri.toLowerCase().endsWith(ext))
                  ? "document_processing"
                  : "media_chunking"
                : "upload";

  context.log(
    `Processing ${req.method} request - ${requestId ? `requestId: ${requestId}, ` : ""}${uri ? `uri: ${redactSasToken(uri)}, ` : ""}${hash ? `hash: ${hash}, ` : ""}${resolvedContextId ? `contextId: ${redactContextId(resolvedContextId)}, ` : ""}operation: ${operation}`,
  );

  // Trigger lightweight age-based cleanup (runs every 100 requests)
  cleanupInactive(context);

  // Initialize services
  const storageService = new StorageService();
  await storageService._initialize(); // Ensure providers are initialized
  const conversionService = new FileConversionService(
    context,
    storageService.primaryProvider.constructor.name === "AzureStorageProvider",
    null,
  );

  // Validate URL for document processing and media chunking operations
  if (operation === "document_processing" || operation === "media_chunking") {
    try {
      const urlObj = new URL(uri);
      if (!["http:", "https:", "gs:"].includes(urlObj.protocol)) {
        context.res = {
          status: 400,
          body: "Invalid URL protocol - only HTTP, HTTPS, and GCS URLs are supported",
        };
        return;
      }
      // Check if the pathname is too long (e.g., > 1024 characters)
      if (urlObj.pathname.length > 1024) {
        context.res = {
          status: 400,
          body: "URL pathname is too long",
        };
        return;
      }
    } catch (error) {
      context.res = {
        status: 400,
        body: "Invalid URL format",
      };
      return;
    }
  }
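  // Routing examples for the operation cascade above (hypothetical requests):
  //   ?save=true&uri=...          -> "save"
  //   ?hash=abc&checkHash=true    -> "checkHash"
  //   DELETE ?requestId=123       -> "delete"
  //   ?uri=https://x/report.pdf   -> "document_processing" (extension in DOC_EXTENSIONS)
  //   ?uri=https://x/talk.mp3     -> "media_chunking"
  //   POST multipart form, no uri -> "upload"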
  // Clean up files when the request is a delete, which means processing has been marked complete.
  // Supports two modes:
  // 1. Delete multiple files by requestId (existing behavior)
  // 2. Delete a single file by hash (new behavior)
  if (operation === "delete") {
    // Check both query string and body params for delete parameters.
    // Handle both req.body.params.hash and req.body.hash formats.
    // Note: container is already extracted from source above, same as checkHash.
    const deleteRequestId =
      req.query.requestId ||
      parsedBody?.params?.requestId ||
      parsedBody?.requestId ||
      requestId;
    const deleteHash =
      req.query.hash || parsedBody?.params?.hash || parsedBody?.hash || hash;

    // If only hash is provided, delete single file by hash
    if (deleteHash && !deleteRequestId) {
      try {
        // Container parameter is ignored - always uses default container from env var
        const deleted = await storageService.deleteFileByHash(
          deleteHash,
          resolvedContextId,
        );
        context.res = {
          status: 200,
          body: {
            message: `File with hash ${deleteHash} deleted successfully`,
            deleted,
          },
        };
        return;
      } catch (error) {
        context.res = {
          status: 404,
          body: error.message,
        };
        return;
      }
    }

    // If requestId is provided, use the existing multi-file delete flow
    if (!deleteRequestId) {
      context.res = {
        status: 400,
        body: "Please pass either a requestId or hash in the query string or request body",
      };
      return;
    }

    // First, get the hash from the map if it exists
    if (deleteHash) {
      const hashResult = await getFileStoreMap(
        deleteHash,
        false,
        resolvedContextId,
      );
      if (hashResult) {
        context.log(
          `Found hash in map for deletion: ${deleteHash}${resolvedContextId ? ` (contextId: ${redactContextId(resolvedContextId)})` : ""}`,
        );
        await removeFromFileStoreMap(deleteHash, resolvedContextId);
      }
    }

    const deleted = await storageService.deleteFiles(deleteRequestId);
    context.res = {
      status: 200,
      body: { body: deleted },
    };
    return;
  }

  // Set file retention (temporary or permanent)
  if (operation === "setRetention") {
    // Extract parameters from query string or body
    const fileHash =
      req.query.hash || parsedBody?.params?.hash || parsedBody?.hash || hash;
    const retention =
      req.query.retention ||
      parsedBody?.params?.retention ||
      parsedBody?.retention;

    if (!fileHash) {
      context.res = {
        status: 400,
        body: "Missing hash parameter. Please provide hash in query string or request body.",
      };
      return;
    }
    if (!retention) {
      context.res = {
        status: 400,
        body: "Missing retention parameter. Please provide retention ('temporary' or 'permanent') in query string or request body.",
      };
      return;
    }
    // Validate retention value
    if (retention !== "temporary" && retention !== "permanent") {
      context.res = {
        status: 400,
        body: "Invalid retention value. Must be 'temporary' or 'permanent'.",
      };
      return;
    }

    try {
      const result = await storageService.setRetention(
        fileHash,
        retention,
        context,
        resolvedContextId,
      );
      context.res = {
        status: 200,
        body: result,
      };
      return;
    } catch (error) {
      context.res = {
        status: error.message.includes("not found") ? 404 : 500,
        body: error.message,
      };
      return;
    }
  }

  const remoteUrl = shouldFetchRemote;
  if (req.method.toLowerCase() === "get" && remoteUrl) {
    context.log(`Remote file: ${redactSasToken(remoteUrl)}`);
    let filename;
    try {
      // Validate URL format and accessibility
      const urlCheck = await urlExists(remoteUrl);
      if (!urlCheck.valid) {
        context.res = {
          status: 400,
          body: "Invalid or inaccessible URL",
        };
        return;
      }
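      // The cache lookup below returns the stored file record on a hit; a
      // typical entry (hypothetical shape, mirroring what setFileStoreMap
      // writes further down) looks like:
      //   { url: "https://.../<id>.mp3?<sas>", gcs: "gs://...", permanent: false }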
      // Check if file already exists (using hash or URL as the key).
      // Always respect contextId if provided, even for URL-based lookups.
      const exists = hash
        ? await getFileStoreMap(hash, false, resolvedContextId)
        : await getFileStoreMap(remoteUrl, false, resolvedContextId);

      if (exists) {
        context.res = {
          status: 200,
          body: exists,
        };
        // Update Redis timestamp with current time
        if (hash) {
          await setFileStoreMap(hash, exists, resolvedContextId);
        } else {
          await setFileStoreMap(remoteUrl, exists, resolvedContextId);
        }
        return;
      }

      // Download the file first
      const urlObj = new URL(remoteUrl);
      // Use LLM-friendly naming for temp files instead of original filename
      const fileExtension = path.extname(urlObj.pathname) || ".mp3";
      const shortId = generateShortId();
      const tempFileName = `${shortId}${fileExtension}`;
      filename = path.join(os.tmpdir(), tempFileName);
      await downloadFile(remoteUrl, filename);

      // For remote files, we don't need a requestId folder structure since it's just a single file.
      // Pass an empty string to store the file directly in the root.
      // Container parameter is ignored - always uses default container from env var
      const res = await storageService.uploadFile(context, filename, "", null, null);
      // All uploads default to temporary (permanent: false) to match file collection logic
      res.permanent = false;

      // Update Redis (using hash or URL as the key).
      // Always respect contextId if provided, even for URL-based lookups.
      if (hash) {
        await setFileStoreMap(hash, res, resolvedContextId);
      } else {
        await setFileStoreMap(remoteUrl, res, resolvedContextId);
      }

      // Return the file URL
      context.res = {
        status: 200,
        body: res,
      };
    } catch (error) {
      context.log("Error processing remote file request:", error);
      context.res = {
        status: 500,
        body: `Error processing file: ${error.message}`,
      };
    } finally {
      // Cleanup temp file if it exists
      try {
        if (filename && fs.existsSync(filename)) {
          fs.unlinkSync(filename);
        }
      } catch (err) {
        context.log("Error cleaning up temp file:", err);
      }
    }
    return;
  }

  if (hash && clearHash) {
    try {
      const hashValue = await getFileStoreMap(hash, false, resolvedContextId);
      if (hashValue) {
        await removeFromFileStoreMap(hash, resolvedContextId);
        context.res = {
          status: 200,
          body: `Hash ${hash} removed`,
        };
      } else {
        context.res = {
          status: 404,
          body: `Hash ${hash} not found`,
        };
      }
    } catch (error) {
      context.res = {
        status: 500,
        body: `Error occurred during hash cleanup: ${error}`,
      };
      console.log("Error occurred during hash cleanup:", error);
    }
    return;
  }

  if (hash && checkHash) {
    let hashResult = await getFileStoreMap(hash, true, resolvedContextId); // Skip lazy cleanup to handle it ourselves
    if (hashResult) {
      context.log(
        `File exists in map: ${hash}${resolvedContextId ? ` (contextId: ${redactContextId(resolvedContextId)})` : ""}`,
      );
      // Log the URL retrieved from Redis before checking existence
      context.log(
        `Checking existence of URL from Redis: ${redactSasToken(hashResult?.url || "")}`,
      );
      try {
        // Check primary storage first
        const primaryExists = hashResult?.url
          ? await storageService.fileExists(hashResult.url)
          : false;
        const gcsExists = hashResult?.gcs
          ? await storageService.fileExists(hashResult.gcs)
          : false;
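        // The branches below cover the existence matrix (primary x GCS):
        //   both exist    -> serve the cached entry as-is
        //   primary only  -> restore the GCS copy via ensureGCSUpload
        //   GCS only      -> re-download and re-upload to primary storage
        //   neither       -> drop the Redis entry and return 404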
        // If neither storage has the file, remove from map and return not found
        if (!primaryExists && !gcsExists) {
          context.log(`File not found in any storage. Removing from map: ${hash}`);
          await removeFromFileStoreMap(hash, resolvedContextId);
          context.res = {
            status: 404,
            body: `Hash ${hash} not found in storage`,
          };
          return;
        }

        // If GCS is missing but primary exists, restore to GCS
        if (primaryExists && !gcsExists && hashResult?.url) {
          context.log(`GCS file missing, restoring from primary: ${hash}`);
          try {
            hashResult = await storageService.ensureGCSUpload(
              context,
              hashResult,
            );
          } catch (error) {
            context.log(`Error restoring to GCS: ${error}`);
            // If restoration fails, remove the hash from the map
            await removeFromFileStoreMap(hash, resolvedContextId);
            context.res = {
              status: 404,
              body: `Hash ${hash} not found`,
            };
            return;
          }
        }

        // If primary is missing but GCS exists, restore from GCS
        if (
          !primaryExists &&
          gcsExists &&
          hashResult?.gcs &&
          storageService.backupProvider?.isConfigured()
        ) {
          context.log(
            `Primary storage file missing, restoring from GCS: ${hash}`,
          );
          try {
            // Create a temporary file to store the downloaded content
            const tempDir = path.join(os.tmpdir(), `${uuidv4()}`);
            fs.mkdirSync(tempDir);
            const downloadedFile = path.join(
              tempDir,
              path.basename(hashResult.gcs),
            );

            // Download from GCS
            await storageService.downloadFile(hashResult.gcs, downloadedFile);

            // Upload to primary storage
            // Container parameter is ignored - always uses default container from env var
            const res = await storageService.uploadFile(
              context,
              downloadedFile,
              hash,
              null,
              null,
            );

            // Update the hash result with the new primary storage URL
            hashResult.url = res.url;

            // Clean up temp files
            try {
              if (downloadedFile && fs.existsSync(downloadedFile)) {
                fs.unlinkSync(downloadedFile);
              }
              if (tempDir && fs.existsSync(tempDir)) {
                fs.rmSync(tempDir, { recursive: true });
              }
            } catch (err) {
              console.log("Error cleaning up temp files:", err);
            }
          } catch (error) {
            console.error("Error restoring from GCS:", error);
            // If restoration fails, remove the hash from the map
            await removeFromFileStoreMap(hash, resolvedContextId);
            context.res = {
              status: 404,
              body: `Hash ${hash} not found`,
            };
            return;
          }
        }

        // Final check to ensure we have at least one valid storage location
        const finalPrimaryCheck = hashResult?.url
          ? await storageService.fileExists(hashResult.url)
          : false;
        const finalGCSCheck = hashResult?.gcs
          ? await storageService.fileExists(hashResult.gcs)
          : false;
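        // The re-check is deliberate: either restore path above can fail in a
        // way that leaves no readable copy, so existence is verified once more
        // before a response is returned.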
        if (!finalPrimaryCheck && !finalGCSCheck) {
          context.log(`Failed to restore file. Removing from map: ${hash}`);
          await removeFromFileStoreMap(hash, resolvedContextId);
          context.res = {
            status: 404,
            body: `Hash ${hash} not found`,
          };
          return;
        }

        // Reconstruct missing filename from URL if needed (before creating response)
        if (!hashResult.filename && hashResult.url) {
          try {
            const urlObj = new URL(hashResult.url);
            const pathSegments = urlObj.pathname
              .split("/")
              .filter((segment) => segment.length > 0);
            if (pathSegments.length > 0) {
              // Extract filename from URL path (last segment)
              const blobName = pathSegments[pathSegments.length - 1];
              // Remove query params if any got included
              hashResult.filename = blobName.split("?")[0];
            }
          } catch (error) {
            context.log(`Error extracting filename from URL: ${error.message}`);
          }
        }

        // Ensure hash is set if missing
        if (!hashResult.hash) {
          hashResult.hash = hash;
        }

        // Create the response object
        const response = {
          message: `File '${hashResult.filename || "unknown"}' uploaded successfully.`,
          filename: hashResult.filename,
          url: hashResult.url,
          gcs: hashResult.gcs,
          hash: hashResult.hash || hash,
          timestamp: new Date().toISOString(),
        };

        // Include displayFilename if it exists in Redis record
        if (hashResult.displayFilename) {
          response.displayFilename = hashResult.displayFilename;
        }

        // Ensure converted version exists and is synced across storage providers
        try {
          // Container parameter is ignored - always uses default container from env var
          hashResult = await conversionService.ensureConvertedVersion(
            hashResult,
            requestId,
          );
        } catch (error) {
          context.log(`Error ensuring converted version: ${error}`);
        }

        // Add mimeType to converted block if it exists but doesn't have mimeType yet
        if (hashResult.converted && !hashResult.converted.mimeType) {
          hashResult.converted.mimeType = getMimeTypeFromUrl(
            hashResult.converted.url,
          );
        }

        // Generate short-lived URLs for both original and converted files (if converted exists).
        // Helper function to generate a short-lived URL for a given URL.
        const generateShortLivedUrlForUrl = async (urlToProcess) => {
          if (!urlToProcess) return null;
          try {
            // Extract blob name from the URL to generate a new SAS token
            let blobName;
            try {
              const url = new URL(urlToProcess);
              let path = url.pathname.substring(1);
              // For Azurite URLs, the path includes the account name: devstoreaccount1/container/blob
              // For real Azure URLs, the path is: container/blob
              if (path.startsWith(`${AZURITE_ACCOUNT_NAME}/`)) {
                path = path.substring(`${AZURITE_ACCOUNT_NAME}/`.length);
              }
              const pathSegments = path
                .split("/")
                .filter((segment) => segment.length > 0);
              if (pathSegments.length >= 2) {
                blobName = pathSegments.slice(1).join("/");
              } else if (pathSegments.length === 1) {
                blobName = pathSegments[0];
              }
            } catch (urlError) {
              context.log(
                `Error parsing URL for short-lived generation: ${urlError}`,
              );
              return null;
            }

            if (blobName) {
              const provider = storageService.primaryProvider;
              if (provider && provider.generateShortLivedSASToken) {
                const blobClientResult = await provider.getBlobClient();
                const containerClient = blobClientResult.containerClient;
                const sasToken = provider.generateShortLivedSASToken(
                  containerClient,
                  blobName,
                  shortLivedDuration,
                );
                const baseUrl = urlToProcess.split("?")[0];
                return `${baseUrl}?${sasToken}`;
              }
            }
          } catch (error) {
            context.log(`Error generating short-lived URL: ${error}`);
          }
          return null;
        };
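        // Illustrative example (hypothetical values): for a stored URL like
        //   https://acct.blob.core.windows.net/files/abc.pdf?sv=<long-lived-sas>
        // the helper extracts blobName "abc.pdf", requests a token that expires
        // in shortLivedDuration minutes, strips the old query string, and returns
        //   https://acct.blob.core.windows.net/files/abc.pdf?<short-lived-sas>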
        // Generate short-lived URLs for the response (not stored in Redis).
        // Generate a short-lived URL for the converted file if it exists.
        let convertedShortLivedUrl = null;
        if (hashResult.converted?.url) {
          convertedShortLivedUrl = await generateShortLivedUrlForUrl(
            hashResult.converted.url,
          );
          if (!convertedShortLivedUrl) {
            // Fall back to the regular URL
            convertedShortLivedUrl = hashResult.converted.url;
          }
          context.log(`Generated shortLivedUrl for converted file`);
        }

        // Generate a short-lived URL for the original file (for the main response)
        const urlForShortLived = hashResult.converted?.url || hashResult.url;
        const mainShortLivedUrl =
          await generateShortLivedUrlForUrl(urlForShortLived);
        if (mainShortLivedUrl) {
          response.shortLivedUrl = mainShortLivedUrl;
          response.expiresInMinutes = shortLivedDuration;
          const urlType = hashResult.converted?.url ? "converted" : "original";
          context.log(
            `Generated short-lived URL for hash: ${hash} using ${urlType} URL (expires in ${shortLivedDuration} minutes)`,
          );
        } else {
          // Fallback for storage providers that don't support short-lived tokens
          response.shortLivedUrl = urlForShortLived;
          response.expiresInMinutes = shortLivedDuration;
          const urlType = hashResult.converted?.url ? "converted" : "original";
          context.log(
            `Storage provider doesn't support short-lived tokens, using ${urlType} URL`,
          );
        }

        // Attach converted info to response if present (include shortLivedUrl in response only)
        if (hashResult.converted) {
          response.converted = {
            url: hashResult.converted.url,
            shortLivedUrl: convertedShortLivedUrl || hashResult.converted.url,
            gcs: hashResult.converted.gcs,
            mimeType: hashResult.converted.mimeType || null,
          };
        }

        // Update Redis timestamp with current time.
        // Note: setFileStoreMap will remove shortLivedUrl fields before storing;
        // hashResult has already been enriched with filename/hash above if missing.
        await setFileStoreMap(hash, hashResult, resolvedContextId);

        context.res = {
          status: 200,
          body: response,
        };
        return;
      } catch (error) {
        context.log(`Error checking file existence: ${error}`);
        // If there's an error checking file existence, remove the hash from the map
        await removeFromFileStoreMap(hash, resolvedContextId);
        context.res = {
          status: 404,
          body: `Hash ${hash} not found`,
        };
        return;
      }
    }

    context.res = {
      status: 404,
      body: `Hash ${hash} not found`,
    };
    return;
  }

  if (req.method.toLowerCase() === "post") {
    // Determine if we should save to local storage based on the primary provider
    const saveToLocal =
      storageService.primaryProvider.constructor.name ===
      "LocalStorageProvider";

    // Use uploadBlob to handle multipart/form-data
    // Container parameter is ignored - always uses default container from env var
    const result = await uploadBlob(context, req, saveToLocal, null, hash);
    if (result?.hash && context?.res?.body) {
      // Use contextId from result (extracted from form fields) or from resolvedContextId (query/body)
      const uploadContextId = result.contextId || resolvedContextId;
      // Store contextId alongside the entry for debugging/traceability
      if (
        uploadContextId &&
        typeof context.res.body === "object" &&
        context.res.body
      ) {
        context.res.body.contextId = uploadContextId;
      }
      await setFileStoreMap(result.hash, context.res.body, uploadContextId);
    }
    return;
  }

  if (!uri || !requestId) {
    context.res = {
      status: 400,
      body: "Please pass a uri and requestId on the query string or in the request body",
    };
    return;
  }

  let totalCount = 0;
  let completedCount = 0;
  let numberOfChunks;

  const file = ensureEncoded(uri); // encode url to handle special characters
  const result = [];

  const sendProgress = async (data = null) => {
    completedCount++;
    const progress = completedCount / totalCount;
    await publishRequestProgress({
      requestId,
      progress,
      completedCount,
      totalCount,
      numberOfChunks,
      data,
    });
  };
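  // Progress example (hypothetical numbers): a media file split into 3 chunks
  // reserves totalCount = 3 * 4 = 12 steps (see the media branch below); after
  // the 6th sendProgress() call the published progress is 6 / 12 = 0.5.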
  try {
    // Parse URL and get pathname without query parameters for extension check
    const urlObj = new URL(uri);
    const pathWithoutQuery = urlObj.pathname;

    if (
      DOC_EXTENSIONS.some((ext) => pathWithoutQuery.toLowerCase().endsWith(ext))
    ) {
      const extension = path.extname(pathWithoutQuery).toLowerCase();
      const tempDir = path.join(os.tmpdir(), `${uuidv4()}`);
      fs.mkdirSync(tempDir);
      const downloadedFile = path.join(tempDir, `${uuidv4()}${extension}`);
      await downloadFile(uri, downloadedFile);

      try {
        if (shouldSave) {
          // Check if file needs conversion first
          if (conversionService.needsConversion(downloadedFile)) {
            // Convert the file
            const conversion = await conversionService.convertFile(
              downloadedFile,
              uri,
            );
            if (!conversion.converted) {
              throw new Error("File conversion failed");
            }

            // Save the converted file
            // Container parameter is ignored - always uses default container from env var
            const convertedSaveResult =
              await conversionService._saveConvertedFile(
                conversion.convertedPath,
                requestId,
                null,
              );

            // Return the converted file URL
            context.res = {
              status: 200,
              body: {
                url: convertedSaveResult.url,
                blobName: path.basename(convertedSaveResult.url),
              },
            };
          } else {
            // File doesn't need conversion, save the original file
            // Container parameter is ignored - always uses default container from env var
            const saveResult = await conversionService._saveConvertedFile(
              downloadedFile,
              requestId,
              null,
            );

            // Return the original file URL
            context.res = {
              status: 200,
              body: {
                url: saveResult.url,
                blobName: path.basename(saveResult.url),
              },
            };
          }
          return;
        } else {
          let text;
          if (conversionService.needsConversion(downloadedFile)) {
            text = await conversionService.convertFile(
              downloadedFile,
              uri,
              true,
            );
          } else {
            // For files that don't need conversion, read the file contents directly
            text = await fs.promises.readFile(downloadedFile, "utf-8");
          }
          result.push(...easyChunker(text));
        }
      } catch (err) {
        console.log(
          `Error saving file ${uri} with request id ${requestId}:`,
          err,
        );
        throw err; // Re-throw to handle in outer catch
      } finally {
        try {
          // Delete temporary files
          if (downloadedFile && fs.existsSync(downloadedFile)) {
            fs.unlinkSync(downloadedFile);
            console.log(`Cleaned temp file ${downloadedFile}`);
          }
        } catch (err) {
          console.log(`Error cleaning temp file ${downloadedFile}:`, err);
        }

        // Delete uploaded files only if we're NOT saving the converted version.
        // When save=true we need to keep the converted file (which is stored under the same requestId prefix),
        // so skip the cleanup in that case.
        if (!shouldSave) {
          await storageService.deleteFiles(requestId);
          console.log(`Cleaned temp files for request id ${requestId}`);
        } else {
          console.log(
            `Skip cleanup for request id ${requestId} because save flag is set`,
          );
        }
      }
    } else {
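      // Media chunking path (sketch of the flow below): splitMediaFile returns
      // per-chunk promises plus their time offsets; each chunk is awaited,
      // uploaded under the requestId prefix, and recorded in `result` as
      // { uri, offset, gcs } for the caller to reassemble.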
      const { chunkPromises, chunkOffsets, uniqueOutputPath, chunkBaseName } =
        await splitMediaFile(file);

      numberOfChunks = chunkPromises.length; // for progress reporting
      totalCount += chunkPromises.length * 4; // 4 progress steps reserved per chunk (download and upload)

      // Sequential download of chunks
      const chunks = [];
      for (const chunkPromise of chunkPromises) {
        const chunkPath = await chunkPromise;
        chunks.push(chunkPath);
        await sendProgress();
      }

      // Sequential processing of chunks
      for (let index = 0; index < chunks.length; index++) {
        const chunkPath = chunks[index];
        // Use the same base filename for all chunks to ensure consistency
        const chunkFilename = `chunk-${index + 1}-${chunkBaseName}`;
        // Container parameter is ignored - always uses default container from env var
        const chunkResult = await storageService.uploadFile(
          context,
          chunkPath,
          requestId,
          null,
          chunkFilename,
        );
        const chunkOffset = chunkOffsets[index];
        result.push({
          uri: chunkResult.url,
          offset: chunkOffset,
          gcs: chunkResult.gcs,
        });

        // Redact SAS tokens for secure logging (redactSasToken is imported at the top of the module)
        const redactedUrl = redactSasToken(chunkResult.url);
        const redactedGcs = chunkResult.gcs
          ? redactSasToken(chunkResult.gcs)
          : "";
        console.log(
          `Saved chunk as: ${redactedUrl}${redactedGcs ? ` and ${redactedGcs}` : ""}`,
        );
        await sendProgress();
      }

      // Cleanup the temp directory
      try {
        if (uniqueOutputPath && fs.existsSync(uniqueOutputPath)) {
          fs.rmSync(uniqueOutputPath, { recursive: true });
          console.log(`Cleaned temp directory: ${uniqueOutputPath}`);
        }
      } catch (err) {
        console.log(`Error cleaning temp directory ${uniqueOutputPath}:`, err);
      }
    }
  } catch (error) {
    console.error("An error occurred:", error);
    context.res = {
      status: 500,
      body: error.message || error,
    };
    return;
  }

  // Sanitize result before logging to redact SAS tokens and contextIds
  const sanitizedResult = sanitizeForLogging(result);
  console.log(
    "result:",
    sanitizedResult
      .map((item) =>
        typeof item === "object" ? JSON.stringify(item, null, 2) : item,
      )
      .join("\n"),
  );

  context.res = {
    body: result,
  };
}

export default CortexFileHandler;
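// Usage sketch (hypothetical, for exercising the handler outside the Azure
// Functions runtime, which normally supplies `context` and `req`):
//
//   const context = { log: console.log, res: null };
//   const req = {
//     method: "GET",
//     query: { hash: "abc123", checkHash: "true" },
//     body: null,
//   };
//   await CortexFileHandler(context, req);
//   console.log(context.res.status, context.res.body);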