@aj-archipelago/cortex

Cortex is a GraphQL API for AI. It provides a simple, extensible interface for using AI services from OpenAI, Azure and others.
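As a rough sketch of what that interface looks like from a client, the snippet below posts a GraphQL query to a locally running Cortex instance. The endpoint URL, the `translate` pathway, and its argument and result fields are illustrative placeholders, not the package's documented schema; see the project README for the real pathway definitions.

// Hypothetical client call against a local Cortex GraphQL endpoint (Node 18+,
// global fetch). The endpoint, pathway name, and fields are placeholders.
const res = await fetch('http://localhost:4000/graphql', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    query: `query Translate($text: String!, $to: String!) {
      translate(text: $text, to: $to) {
        result
      }
    }`,
    variables: { text: 'Bonjour le monde', to: 'English' },
  }),
});
const { data } = await res.json();
console.log(data?.translate?.result);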

import fs from 'fs';
import os from 'os';
import path from 'path';
import { v4 as uuidv4 } from 'uuid';
import { DOC_EXTENSIONS } from './constants.js';
import { easyChunker } from './docHelper.js';
import { downloadFile, splitMediaFile } from './fileChunker.js';
import { ensureEncoded, ensureFileExtension, urlExists } from './helper.js';
import {
  cleanupRedisFileStoreMap,
  getFileStoreMap,
  publishRequestProgress,
  removeFromFileStoreMap,
  setFileStoreMap,
} from './redis.js';
import { FileConversionService } from './services/FileConversionService.js';
import { StorageService } from './services/storage/StorageService.js';
import { uploadBlob } from './blobHandler.js';

let isCleanupRunning = false;

async function cleanupInactive(context) {
  try {
    if (isCleanupRunning) {
      return;
    }
    isCleanupRunning = true;
    const cleaned = await cleanupRedisFileStoreMap();
    const urls = [];
    for (const key in cleaned) {
      const item = cleaned[key];
      if (item.url) {
        urls.push(item.url);
      }
      if (item.gcs) {
        urls.push(item.gcs);
      }
    }
    if (urls.length > 0) {
      const storageService = new StorageService();
      await storageService.cleanup(urls);
    }
  } catch (error) {
    console.log('Error occurred during cleanup:', error);
  } finally {
    isCleanupRunning = false;
  }
}

async function CortexFileHandler(context, req) {
  const {
    uri,
    requestId,
    save,
    hash,
    checkHash,
    clearHash,
    fetch,
    load,
    restore,
  } = req.body?.params || req.query;

  // Normalize boolean parameters
  const shouldSave = save === true || save === 'true';
  const shouldCheckHash = checkHash === true || checkHash === 'true';
  const shouldClearHash = clearHash === true || clearHash === 'true';
  const shouldFetchRemote = fetch || load || restore;

  const operation = shouldSave
    ? 'save'
    : shouldCheckHash
      ? 'checkHash'
      : shouldClearHash
        ? 'clearHash'
        : shouldFetchRemote
          ? 'remoteFile'
          : req.method.toLowerCase() === 'delete' ||
              req.query.operation === 'delete'
            ? 'delete'
            : uri
              ? DOC_EXTENSIONS.some((ext) => uri.toLowerCase().endsWith(ext))
                ? 'document_processing'
                : 'media_chunking'
              : 'upload';

  context.log(
    `Processing ${req.method} request - ${requestId ? `requestId: ${requestId}, ` : ''}${uri ? `uri: ${uri}, ` : ''}${hash ? `hash: ${hash}, ` : ''}operation: ${operation}`,
  );

  cleanupInactive(context); // fire-and-forget: no need to await the cleanup

  // Initialize services
  const storageService = new StorageService();
  const conversionService = new FileConversionService(
    context,
    storageService.primaryProvider.constructor.name === 'AzureStorageProvider',
  );

  // Validate URL for document processing and media chunking operations
  if (operation === 'document_processing' || operation === 'media_chunking') {
    try {
      const urlObj = new URL(uri);
      if (!['http:', 'https:', 'gs:'].includes(urlObj.protocol)) {
        context.res = {
          status: 400,
          body: 'Invalid URL protocol - only HTTP, HTTPS, and GCS URLs are supported',
        };
        return;
      }
      // Check if the pathname is too long (e.g., > 1024 characters)
      if (urlObj.pathname.length > 1024) {
        context.res = {
          status: 400,
          body: 'URL pathname is too long',
        };
        return;
      }
    } catch (error) {
      context.res = {
        status: 400,
        body: 'Invalid URL format',
      };
      return;
    }
  }

  // Clean up files on a delete request, which means processing has completed
  if (operation === 'delete') {
    const deleteRequestId = req.query.requestId || requestId;
    const deleteHash = req.query.hash || hash;
    if (!deleteRequestId) {
      context.res = {
        status: 400,
        body: 'Please pass a requestId on the query string',
      };
      return;
    }
    // First, remove the hash from the map if it exists
    if (deleteHash) {
      const hashResult = await getFileStoreMap(deleteHash);
      if (hashResult) {
        context.log(`Found hash in map for deletion: ${deleteHash}`);
        await removeFromFileStoreMap(deleteHash);
      }
    }
    const deleted = await storageService.deleteFiles(deleteRequestId);
    context.res = {
      status: 200,
      body: { body: deleted },
    };
    return;
  }

  const remoteUrl = shouldFetchRemote;
  if (req.method.toLowerCase() === 'get' && remoteUrl) {
    context.log(`Remote file: ${remoteUrl}`);
    let filename;
    try {
      // Validate URL format and accessibility
      const urlCheck = await urlExists(remoteUrl);
      if (!urlCheck.valid) {
        context.res = {
          status: 400,
          body: 'Invalid or inaccessible URL',
        };
        return;
      }
      // Check if the file already exists (using the remote URL as the key)
      const exists = await getFileStoreMap(remoteUrl);
      if (exists) {
        context.res = {
          status: 200,
          body: exists,
        };
        // update the Redis timestamp with the current time
        await setFileStoreMap(remoteUrl, exists);
        return;
      }
      // Download the file first
      const urlObj = new URL(remoteUrl);
      let originalFileName = decodeURIComponent(
        path.basename(urlObj.pathname),
      );
      if (!originalFileName || originalFileName === '') {
        originalFileName = urlObj.hostname;
      }
      // Ensure the filename has the correct extension based on content type
      originalFileName = ensureFileExtension(
        originalFileName,
        urlCheck.contentType,
      );
      const maxLength = 200; // Set the maximum length for the filename
      let truncatedFileName = originalFileName;
      if (originalFileName.length > maxLength) {
        const extension = path.extname(originalFileName);
        const basename = path.basename(originalFileName, extension);
        truncatedFileName =
          basename.substring(0, maxLength - extension.length) + extension;
      }
      // Use the (possibly truncated) original file name when saving the downloaded file
      filename = path.join(os.tmpdir(), truncatedFileName);
      await downloadFile(remoteUrl, filename);
      // Now upload the downloaded file
      const res = await storageService.uploadFile(context, filename, remoteUrl);
      // Update Redis (using the remote URL as the key)
      await setFileStoreMap(remoteUrl, res);
      // Return the file URL
      context.res = {
        status: 200,
        body: res,
      };
    } catch (error) {
      context.log('Error processing remote file request:', error);
      context.res = {
        status: 500,
        body: `Error processing file: ${error.message}`,
      };
    } finally {
      // Clean up the temp file if it exists
      try {
        if (filename && fs.existsSync(filename)) {
          fs.unlinkSync(filename);
        }
      } catch (err) {
        context.log('Error cleaning up temp file:', err);
      }
    }
    return;
  }

  if (hash && clearHash) {
    try {
      const hashValue = await getFileStoreMap(hash);
      if (hashValue) {
        await removeFromFileStoreMap(hash);
        context.res = {
          status: 200,
          body: `Hash ${hash} removed`,
        };
      } else {
        context.res = {
          status: 404,
          body: `Hash ${hash} not found`,
        };
      }
    } catch (error) {
      context.res = {
        status: 500,
        body: `Error occurred during hash cleanup: ${error}`,
      };
      console.log('Error occurred during hash cleanup:', error);
    }
    return;
  }

  if (hash && checkHash) {
    let hashResult = await getFileStoreMap(hash);
    if (hashResult) {
      context.log(`File exists in map: ${hash}`);
      // Log the URL retrieved from Redis before checking existence
      context.log(`Checking existence of URL from Redis: ${hashResult?.url}`);
      try {
        // Check primary storage first
        const primaryExists = hashResult?.url
          ? await storageService.fileExists(hashResult.url)
          : false;
        const gcsExists = hashResult?.gcs
          ? await storageService.fileExists(hashResult.gcs)
          : false;

        // If neither storage has the file, remove it from the map and return not found
        if (!primaryExists && !gcsExists) {
          context.log(
            `File not found in any storage. Removing from map: ${hash}`,
          );
          await removeFromFileStoreMap(hash);
          context.res = {
            status: 404,
            body: `Hash ${hash} not found in storage`,
          };
          return;
        }

        // If GCS is missing but primary exists, restore to GCS
        if (primaryExists && !gcsExists && hashResult?.url) {
          context.log(`GCS file missing, restoring from primary: ${hash}`);
          try {
            hashResult = await storageService.ensureGCSUpload(
              context,
              hashResult,
            );
          } catch (error) {
            context.log(`Error restoring to GCS: ${error}`);
            // If restoration fails, remove the hash from the map
            await removeFromFileStoreMap(hash);
            context.res = {
              status: 404,
              body: `Hash ${hash} not found`,
            };
            return;
          }
        }

        // If primary is missing but GCS exists, restore from GCS
        if (
          !primaryExists &&
          gcsExists &&
          hashResult?.gcs &&
          storageService.backupProvider?.isConfigured()
        ) {
          context.log(
            `Primary storage file missing, restoring from GCS: ${hash}`,
          );
          try {
            // Create a temporary file to store the downloaded content
            const tempDir = path.join(os.tmpdir(), `${uuidv4()}`);
            fs.mkdirSync(tempDir);
            const downloadedFile = path.join(
              tempDir,
              path.basename(hashResult.gcs),
            );
            // Download from GCS
            await storageService.downloadFile(hashResult.gcs, downloadedFile);
            // Upload to primary storage
            const res = await storageService.uploadFile(
              context,
              downloadedFile,
              hash,
            );
            // Update the hash result with the new primary storage URL
            hashResult.url = res.url;
            // Clean up temp files
            try {
              if (downloadedFile && fs.existsSync(downloadedFile)) {
                fs.unlinkSync(downloadedFile);
              }
              if (tempDir && fs.existsSync(tempDir)) {
                fs.rmSync(tempDir, { recursive: true });
              }
            } catch (err) {
              console.log('Error cleaning up temp files:', err);
            }
          } catch (error) {
            console.error('Error restoring from GCS:', error);
            // If restoration fails, remove the hash from the map
            await removeFromFileStoreMap(hash);
            context.res = {
              status: 404,
              body: `Hash ${hash} not found`,
            };
            return;
          }
        }

        // Final check to ensure we have at least one valid storage location
        const finalPrimaryCheck = hashResult?.url
          ? await storageService.fileExists(hashResult.url)
          : false;
        const finalGCSCheck = hashResult?.gcs
          ? await storageService.fileExists(hashResult.gcs)
          : false;
        if (!finalPrimaryCheck && !finalGCSCheck) {
          context.log(`Failed to restore file. Removing from map: ${hash}`);
          await removeFromFileStoreMap(hash);
          context.res = {
            status: 404,
            body: `Hash ${hash} not found`,
          };
          return;
        }

        // Create the response object
        const response = {
          message: `File '${hashResult.filename}' uploaded successfully.`,
          filename: hashResult.filename,
          url: hashResult.url,
          gcs: hashResult.gcs,
          hash: hashResult.hash,
          timestamp: new Date().toISOString(),
        };

        // Ensure a converted version exists and is synced across storage providers
        try {
          hashResult = await conversionService.ensureConvertedVersion(
            hashResult,
            requestId,
          );
        } catch (error) {
          context.log(`Error ensuring converted version: ${error}`);
        }

        // Attach converted info to the response if present
        if (hashResult.converted) {
          response.converted = {
            url: hashResult.converted.url,
            gcs: hashResult.converted.gcs,
          };
        }

        // update the Redis timestamp with the current time
        await setFileStoreMap(hash, hashResult);

        context.res = {
          status: 200,
          body: response,
        };
        return;
      } catch (error) {
        context.log(`Error checking file existence: ${error}`);
        // If there's an error checking file existence, remove the hash from the map
        await removeFromFileStoreMap(hash);
        context.res = {
          status: 404,
          body: `Hash ${hash} not found`,
        };
        return;
      }
    }
    context.res = {
      status: 404,
      body: `Hash ${hash} not found`,
    };
    return;
  }

  if (req.method.toLowerCase() === 'post') {
    // Determine if we should save to local storage based on the primary provider
    const saveToLocal =
      storageService.primaryProvider.constructor.name ===
      'LocalStorageProvider';
    // Use uploadBlob to handle multipart/form-data
    const result = await uploadBlob(context, req, saveToLocal, null, hash);
    if (result?.hash && context?.res?.body) {
      await setFileStoreMap(result.hash, context.res.body);
    }
    return;
  }

  if (!uri || !requestId) {
    context.res = {
      status: 400,
      body: 'Please pass a uri and requestId on the query string or in the request body',
    };
    return;
  }

  let totalCount = 0;
  let completedCount = 0;
  let numberOfChunks;

  const file = ensureEncoded(uri); // encode url to handle special characters
  const result = [];

  const sendProgress = async (data = null) => {
    completedCount++;
    const progress = completedCount / totalCount;
    await publishRequestProgress({
      requestId,
      progress,
      completedCount,
      totalCount,
      numberOfChunks,
      data,
    });
  };

  try {
    // Parse the URL and get the pathname without query parameters for the extension check
    const urlObj = new URL(uri);
    const pathWithoutQuery = urlObj.pathname;

    if (
      DOC_EXTENSIONS.some((ext) =>
        pathWithoutQuery.toLowerCase().endsWith(ext),
      )
    ) {
      const extension = path.extname(pathWithoutQuery).toLowerCase();
      const tempDir = path.join(os.tmpdir(), `${uuidv4()}`);
      fs.mkdirSync(tempDir);
      const downloadedFile = path.join(tempDir, `${uuidv4()}${extension}`);
      await downloadFile(uri, downloadedFile);

      try {
        if (shouldSave) {
          // Check if the file needs conversion first
          if (conversionService.needsConversion(downloadedFile)) {
            // Convert the file
            const conversion = await conversionService.convertFile(
              downloadedFile,
              uri,
            );
            if (!conversion.converted) {
              throw new Error('File conversion failed');
            }
            // Save the converted file
            const convertedSaveResult =
              await conversionService._saveConvertedFile(
                conversion.convertedPath,
                requestId,
              );
            // Return the converted file URL
            context.res = {
              status: 200,
              body: {
                url: convertedSaveResult.url,
                blobName: path.basename(convertedSaveResult.url),
              },
            };
          } else {
            // File doesn't need conversion, save the original file
            const saveResult = await conversionService._saveConvertedFile(
              downloadedFile,
              requestId,
            );
            // Return the original file URL
            context.res = {
              status: 200,
              body: {
                url: saveResult.url,
                blobName: path.basename(saveResult.url),
              },
            };
          }
          return;
        } else {
          let text;
          if (conversionService.needsConversion(downloadedFile)) {
            text = await conversionService.convertFile(
              downloadedFile,
              uri,
              true,
            );
          } else {
            // For files that don't need conversion, read the file contents directly
            text = await fs.promises.readFile(downloadedFile, 'utf-8');
          }
          result.push(...easyChunker(text));
        }
      } catch (err) {
        console.log(
          `Error saving file ${uri} with request id ${requestId}:`,
          err,
        );
        throw err; // Re-throw to handle in outer catch
      } finally {
        try {
          // delete temporary files
          if (downloadedFile && fs.existsSync(downloadedFile)) {
            fs.unlinkSync(downloadedFile);
            console.log(`Cleaned temp file ${downloadedFile}`);
          }
        } catch (err) {
          console.log(`Error cleaning temp file ${downloadedFile}:`, err);
        }
        // Delete uploaded files only if we're NOT saving the converted version.
        // When save=true we need to keep the converted file (which is stored under
        // the same requestId prefix), so skip the cleanup in that case.
        if (!shouldSave) {
          await storageService.deleteFiles(requestId);
          console.log(`Cleaned temp files for request id ${requestId}`);
        } else {
          console.log(
            `Skip cleanup for request id ${requestId} because save flag is set`,
          );
        }
      }
    } else {
      const { chunkPromises, chunkOffsets, uniqueOutputPath } =
        await splitMediaFile(file);

      numberOfChunks = chunkPromises.length; // for progress reporting
      totalCount += chunkPromises.length * 4; // 4 steps for each chunk (download and upload)

      // sequential download of chunks
      const chunks = [];
      for (const chunkPromise of chunkPromises) {
        const chunkPath = await chunkPromise;
        chunks.push(chunkPath);
        await sendProgress();
      }

      // sequential processing of chunks
      for (let index = 0; index < chunks.length; index++) {
        const chunkPath = chunks[index];
        const chunkResult = await storageService.uploadFile(
          context,
          chunkPath,
          requestId,
        );
        const chunkOffset = chunkOffsets[index];
        result.push({
          uri: chunkResult.url,
          offset: chunkOffset,
          gcs: chunkResult.gcs,
        });
        console.log(
          `Saved chunk as: ${chunkResult.url}${chunkResult.gcs ? ` and ${chunkResult.gcs}` : ''}`,
        );
        await sendProgress();
      }

      // Clean up the temp directory
      try {
        if (uniqueOutputPath && fs.existsSync(uniqueOutputPath)) {
          fs.rmSync(uniqueOutputPath, { recursive: true });
          console.log(`Cleaned temp directory: ${uniqueOutputPath}`);
        }
      } catch (err) {
        console.log(
          `Error cleaning temp directory ${uniqueOutputPath}:`,
          err,
        );
      }
    }
  } catch (error) {
    console.error('An error occurred:', error);
    context.res = {
      status: 500,
      body: error.message || error,
    };
    return;
  }

  console.log(
    'result:',
    result
      .map((item) =>
        typeof item === 'object' ? JSON.stringify(item, null, 2) : item,
      )
      .join('\n'),
  );

  context.res = {
    body: result,
  };
}

export default CortexFileHandler;
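For orientation, here is a minimal sketch of how the handler above could be driven locally, assuming an Azure-Functions-style runtime. The mock `context` and `req` carry only the fields the handler actually reads (method, query, body.params, context.log, context.res); the import path is hypothetical, and a real run still requires the Redis and storage services this module depends on.

// Minimal local-invocation sketch for CortexFileHandler (assumptions noted above).
import CortexFileHandler from './CortexFileHandler.js'; // hypothetical path to this module

const context = {
  log: (...args) => console.log('[cortex]', ...args), // the handler logs via context.log
  res: undefined, // the handler writes its HTTP-style response here
};

const req = {
  method: 'GET',
  query: {
    uri: 'https://example.com/talk.mp3', // hypothetical media URL -> 'media_chunking' operation
    requestId: 'demo-request-1',
  },
  body: {},
};

await CortexFileHandler(context, req);
// Every branch reports its outcome through context.res rather than a return value.
console.log(context.res?.status ?? 200, context.res?.body);

Because the nested ternary at the top routes each request to exactly one operation, a caller selects behavior purely through which parameters it sends: `save`, `checkHash`, `clearHash`, `fetch`/`load`/`restore`, a DELETE method, a `uri` to process, or a bare POST upload.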