@aj-archipelago/cortex

Cortex is a GraphQL API for AI. It provides a simple, extensible interface for using AI services from OpenAI, Azure and others.

import fs from "fs";
import os from "os";
import path from "path";
import { promisify } from "util";
import { pipeline as _pipeline } from "stream";
import { v4 as uuidv4 } from "uuid";
import Busboy from "busboy";
import { PassThrough } from "stream";
import { Storage } from "@google-cloud/storage";
import { BlobServiceClient } from "@azure/storage-blob";
import axios from "axios";
import mime from "mime-types";
import {
  sanitizeFilename,
  generateShortId,
  generateBlobName,
} from "./utils/filenameUtils.js";
import { publicFolder, port, ipAddress } from "./start.js";
import {
  CONVERTED_EXTENSIONS,
  AZURITE_ACCOUNT_NAME,
  getDefaultContainerName,
  GCS_BUCKETNAME,
  AZURE_STORAGE_CONTAINER_NAME,
} from "./constants.js";
import { FileConversionService } from "./services/FileConversionService.js";
import { StorageFactory } from "./services/storage/StorageFactory.js";

const pipeline = promisify(_pipeline);

function isBase64(str) {
  try {
    return btoa(atob(str)) == str;
  } catch (err) {
    return false;
  }
}

const { SAS_TOKEN_LIFE_DAYS = 30 } = process.env;

let GCP_SERVICE_ACCOUNT;
let GCP_PROJECT_ID;
try {
  const GCP_SERVICE_ACCOUNT_KEY =
    process.env.GCP_SERVICE_ACCOUNT_KEY_BASE64 ||
    process.env.GCP_SERVICE_ACCOUNT_KEY ||
    "{}";
  GCP_SERVICE_ACCOUNT = isBase64(GCP_SERVICE_ACCOUNT_KEY)
    ? JSON.parse(Buffer.from(GCP_SERVICE_ACCOUNT_KEY, "base64").toString())
    : JSON.parse(GCP_SERVICE_ACCOUNT_KEY);
  GCP_PROJECT_ID = GCP_SERVICE_ACCOUNT.project_id;
} catch (error) {
  console.warn(
    "Error parsing GCP service account credentials, GCS will not be used:",
    error.message,
  );
  GCP_SERVICE_ACCOUNT = {};
  GCP_PROJECT_ID = null;
}

let gcs;
if (!GCP_PROJECT_ID || !GCP_SERVICE_ACCOUNT) {
  console.warn(
    "No Google Cloud Storage credentials provided - GCS will not be used",
  );
} else {
  try {
    gcs = new Storage({
      projectId: GCP_PROJECT_ID,
      credentials: GCP_SERVICE_ACCOUNT,
    });
    // Rest of your Google Cloud operations using gcs object
  } catch (error) {
    console.error(
      "Google Cloud Storage credentials are invalid - GCS will not be used: ",
      error,
    );
  }
}

function isEncoded(str) {
  // Checks for any percent-encoded sequence
  return /%[0-9A-Fa-f]{2}/.test(str);
}

// Helper function to ensure GCS URLs are never encoded
function ensureUnencodedGcsUrl(url) {
  if (!url || !url.startsWith("gs://")) {
    return url;
  }
  // Split into bucket and path parts
  const [bucket, ...pathParts] = url.replace("gs://", "").split("/");
  // Reconstruct URL with decoded path parts, handling invalid characters
  return `gs://${bucket}/${pathParts
    .map((part) => {
      try {
        return decodeURIComponent(part);
      } catch (error) {
        // If decoding fails, sanitize the filename by removing invalid characters
        return part.replace(/[^\w\-\.]/g, "_");
      }
    })
    .join("/")}`;
}
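// Illustrative example (not part of the original source): ensureUnencodedGcsUrl
// decodes percent-encoded path segments so gs:// URLs stay human-readable.
//
//   ensureUnencodedGcsUrl("gs://my-bucket/reports/Q1%20summary.pdf");
//   // -> "gs://my-bucket/reports/Q1 summary.pdf"
//
// The bucket name is left untouched; only path segments are decoded, and any
// segment that fails to decode has invalid characters replaced with "_".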
async function gcsUrlExists(url, defaultReturn = false) {
  try {
    if (!url || !gcs) {
      return defaultReturn; // Cannot check; return default
    }
    // Ensure URL is not encoded
    const unencodedUrl = ensureUnencodedGcsUrl(url);
    const urlParts = unencodedUrl.replace("gs://", "").split("/");
    const bucketName = urlParts[0];
    const fileName = urlParts.slice(1).join("/");
    if (process.env.STORAGE_EMULATOR_HOST) {
      try {
        const response = await axios.get(
          `${process.env.STORAGE_EMULATOR_HOST}/storage/v1/b/${bucketName}/o/${encodeURIComponent(fileName)}`,
          { validateStatus: (status) => status === 200 || status === 404 },
        );
        return response.status === 200;
      } catch (error) {
        console.error("Error checking emulator file:", error);
        return false;
      }
    }
    const bucket = gcs.bucket(bucketName);
    const file = bucket.file(fileName);
    const [exists] = await file.exists();
    return exists;
  } catch (error) {
    console.error("Error checking if GCS URL exists:", error);
    return false;
  }
}

/**
 * Downloads a file from Google Cloud Storage to a local file
 * @param {string} gcsUrl - The GCS URL in format gs://bucket-name/file-path
 * @param {string} destinationPath - The local path where the file should be saved
 * @returns {Promise<void>}
 */
async function downloadFromGCS(gcsUrl, destinationPath) {
  if (!gcsUrl || !gcs) {
    throw new Error("Invalid GCS URL or GCS client not initialized");
  }
  const urlParts = gcsUrl.replace("gs://", "").split("/");
  const bucketName = urlParts[0];
  const fileName = urlParts.slice(1).join("/");
  if (process.env.STORAGE_EMULATOR_HOST) {
    // Use axios to download from emulator
    const response = await axios({
      method: "GET",
      url: `${process.env.STORAGE_EMULATOR_HOST}/storage/v1/b/${bucketName}/o/${encodeURIComponent(fileName)}?alt=media`,
      responseType: "stream",
    });
    // Write the response to file
    const writer = fs.createWriteStream(destinationPath);
    await new Promise((resolve, reject) => {
      response.data.pipe(writer);
      writer.on("finish", resolve);
      writer.on("error", reject);
    });
  } else {
    // Use GCS client for real GCS
    const bucket = gcs.bucket(bucketName);
    const file = bucket.file(fileName);
    await file.download({ destination: destinationPath });
  }
}

/**
 * Extracts MIME type from a URL based on file extension
 * @param {string} url - The URL to extract MIME type from
 * @returns {string} The MIME type or 'application/octet-stream' as fallback
 */
function getMimeTypeFromUrl(url) {
  const defaultMimeType = 'application/octet-stream';
  if (!url) return defaultMimeType;
  try {
    const urlObj = new URL(url);
    const pathname = urlObj.pathname;
    const extension = path.extname(pathname);
    return mime.lookup(extension) || defaultMimeType;
  } catch (e) {
    // If URL parsing fails, try to extract extension from URL string
    const urlMatch = url.match(/\.([a-zA-Z0-9]+)(?:\?|$)/);
    if (urlMatch) {
      return mime.lookup(urlMatch[1]) || defaultMimeType;
    }
    return defaultMimeType;
  }
}

/**
 * Generates a short-lived SAS URL for a converted file
 * @param {object} context - The request context for logging
 * @param {string} convertedUrl - The URL of the converted file
 * @param {string} [logSuffix=''] - Optional suffix for log messages
 * @returns {Promise<string>} The short-lived URL or the original URL as fallback
 */
async function generateShortLivedUrlForConvertedFile(context, convertedUrl, logSuffix = '') {
  let shortLivedUrl = convertedUrl; // Fallback to regular URL
  try {
    const storageFactory = StorageFactory.getInstance();
    const primaryProvider = await storageFactory.getAzureProvider();
    if (
      primaryProvider.generateShortLivedSASToken &&
      primaryProvider.extractBlobNameFromUrl
    ) {
      const blobName = primaryProvider.extractBlobNameFromUrl(convertedUrl);
      if (blobName) {
        const { containerClient } = await primaryProvider.getBlobClient();
        const sasToken = primaryProvider.generateShortLivedSASToken(
          containerClient,
          blobName,
          5,
        );
        const urlObj = new URL(convertedUrl);
        const baseUrl = `${urlObj.protocol}//${urlObj.host}${urlObj.pathname}`;
        shortLivedUrl = `${baseUrl}?${sasToken}`;
        context.log(`Generated shortLivedUrl for converted file${logSuffix}`);
      }
    }
  } catch (error) {
    context.log(
      `Warning: Could not generate shortLivedUrl for converted file: ${error.message}`,
    );
    // Fallback to regular URL
  }
  return shortLivedUrl;
}
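// Illustrative examples (not part of the original source) of getMimeTypeFromUrl:
//
//   getMimeTypeFromUrl("https://example.com/files/report.pdf");    // "application/pdf"
//   getMimeTypeFromUrl("https://example.com/clip.mp4?sig=abc123"); // "video/mp4"
//   getMimeTypeFromUrl("no-extension-here");                       // "application/octet-stream"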
export const getBlobClient = async () => {
  const connectionString = process.env.AZURE_STORAGE_CONNECTION_STRING;
  // Always use default container from env var
  const finalContainerName = getDefaultContainerName();
  if (!connectionString || !finalContainerName) {
    throw new Error(
      "Missing Azure Storage connection string or container name environment variable",
    );
  }
  const blobServiceClient =
    BlobServiceClient.fromConnectionString(connectionString);
  const serviceProperties = await blobServiceClient.getProperties();
  if (!serviceProperties.defaultServiceVersion) {
    serviceProperties.defaultServiceVersion = "2020-02-10";
    await blobServiceClient.setProperties(serviceProperties);
  }
  const containerClient =
    blobServiceClient.getContainerClient(finalContainerName);
  return { blobServiceClient, containerClient };
};

async function saveFileToBlob(chunkPath, requestId, filename = null) {
  // Use provider for consistency with cache control headers
  // Container parameter is ignored - always uses default container from env var
  const storageFactory = StorageFactory.getInstance();
  const provider = await storageFactory.getAzureProvider();
  return await provider.uploadFile({}, chunkPath, requestId, null, filename);
}

// Deletes blobs that have the requestId
async function deleteBlob(requestId) {
  if (!requestId) throw new Error("Missing requestId parameter");
  // Container parameter is ignored - always uses default container from env var
  const { containerClient } = await getBlobClient();
  // List all blobs in the container
  const blobs = containerClient.listBlobsFlat();
  const result = [];
  // Iterate through the blobs
  for await (const blob of blobs) {
    // Check if the blob name starts with requestId_ (flat structure)
    // or is inside a folder named requestId/ (folder structure)
    if (
      blob.name.startsWith(`${requestId}_`) ||
      blob.name.startsWith(`${requestId}/`)
    ) {
      // Delete the matching blob
      const blockBlobClient = containerClient.getBlockBlobClient(blob.name);
      await blockBlobClient.delete();
      console.log(`Cleaned blob: ${blob.name}`);
      result.push(blob.name);
    }
  }
  return result;
}
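// Illustrative usage (not part of the original source), assuming
// AZURE_STORAGE_CONNECTION_STRING and the default container are configured:
//
//   const uploaded = await saveFileToBlob("/tmp/chunk-0001.mp3", "req-1234");
//   // ...later, remove everything uploaded under that request:
//   const removed = await deleteBlob("req-1234");
//   console.log(removed); // e.g. ["req-1234/abc123.mp3"]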
function uploadBlob(
  context,
  req,
  saveToLocal = false,
  filePath = null,
  hash = null,
) {
  return new Promise((resolve, reject) => {
    (async () => {
      try {
        let requestId = uuidv4();
        // Container parameter is ignored - always uses default container from env var
        const body = {};
        const fields = {}; // Buffer for all fields
        // If filePath is given, we are dealing with local file and not form-data
        if (filePath) {
          const file = fs.createReadStream(filePath);
          const filename = path.basename(filePath);
          // Generate LLM-friendly ID for requestId to match the filename pattern
          const fileExtension = path.extname(filename);
          const shortId = generateShortId();
          const uploadName = `${shortId}${fileExtension}`;
          requestId = shortId; // Use the short ID as requestId
          try {
            const result = await uploadFile(
              context,
              requestId,
              body,
              saveToLocal,
              file,
              uploadName, // Use the LLM-friendly filename
              resolve,
              hash,
              fields, // Pass fields for contextId extraction
            );
            resolve(result);
          } catch (error) {
            console.error("Error in uploadFile (local file path):", error);
            const err = new Error(
              `Error processing file upload: ${error.message}`,
            );
            err.status = 500;
            throw err;
          }
        } else {
          const busboy = Busboy({ headers: req.headers });
          let hasFile = false;
          let errorOccurred = false;
          busboy.on("field", (fieldname, value) => {
            if (fieldname === "requestId") {
              requestId = value;
            } else if (fieldname === "hash") {
              hash = value;
            } else if (fieldname === "container") {
              // Container parameter is ignored - always uses default container from env var
              // No validation or error needed, just ignore it
            }
            fields[fieldname] = value; // Store all fields
          });
          busboy.on("file", async (fieldname, file, info) => {
            if (errorOccurred) return;
            hasFile = true;
            // Validate file
            if (!info.filename || info.filename.trim() === "") {
              errorOccurred = true;
              const err = new Error("Invalid file: missing filename");
              err.status = 400;
              reject(err);
              return;
            }
            // Simple approach: small delay to allow container field to be processed
            console.log("File received, giving fields time to process...");
            await new Promise((resolve) => setTimeout(resolve, 20));
            if (errorOccurred) return; // Check again after waiting
            // Container parameter is ignored - always uses default container from env var
            await processFile(fieldname, file, info);
          });
          const processFile = async (fieldname, file, info) => {
            if (errorOccurred) return;
            // Validate file
            if (!info.filename || info.filename.trim() === "") {
              errorOccurred = true;
              const err = new Error("Invalid file: missing filename");
              err.status = 400;
              reject(err);
              return;
            }
            // Prepare for streaming to cloud destinations
            const displayFilename = info.filename; // Preserve original filename for metadata
            const fileExtension = path.extname(displayFilename);
            const shortId = generateShortId();
            const uploadName = `${shortId}${fileExtension}`;
            // Extract content-type from busboy info (preserves charset if provided)
            const contentType = info.mimeType || null;
            const azureStream = !saveToLocal ? new PassThrough() : null;
            const gcsStream = gcs ? new PassThrough() : null;
            let diskWriteStream, tempDir, tempFilePath;
            let diskWritePromise;
            let diskWriteError = null;
            let cloudUploadError = null;
            // Start local disk write in parallel (non-blocking for response)
            if (saveToLocal) {
              try {
                tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "upload-"));
              } catch (err) {
                console.error("Error creating tempDir:", err);
                errorOccurred = true;
                reject(err);
                return;
              }
              tempFilePath = path.join(tempDir, uploadName);
              try {
                diskWriteStream = fs.createWriteStream(tempFilePath, {
                  highWaterMark: 1024 * 1024,
                  autoClose: true,
                });
              } catch (err) {
                console.error(
                  "Error creating write stream:",
                  err,
                  "Temp dir exists:",
                  fs.existsSync(tempDir),
                );
                errorOccurred = true;
                reject(err);
                return;
              }
              diskWriteStream.on("error", (err) => {
                console.error("Disk write stream error:", err);
              });
              diskWriteStream.on("close", () => {
                console.log("Disk write stream closed for:", tempFilePath);
              });
              diskWritePromise = new Promise((res, rej) => {
                diskWriteStream.on("finish", res);
                diskWriteStream.on("error", (err) => {
                  diskWriteError = err;
                  rej(err);
                });
              });
            }
            // Pipe incoming file to all destinations
            let receivedAnyData = false;
            file.on("data", () => {
              receivedAnyData = true;
            });
            if (azureStream) file.pipe(azureStream);
            if (gcsStream) file.pipe(gcsStream);
            if (diskWriteStream) file.pipe(diskWriteStream);
            // Listen for end event to check for empty file
            file.on("end", async () => {
              if (!receivedAnyData) {
                errorOccurred = true;
                // Abort all streams
                if (azureStream) azureStream.destroy();
                if (gcsStream) gcsStream.destroy();
                if (diskWriteStream) diskWriteStream.destroy();
                const err = new Error("Invalid file: file is empty");
                err.status = 400;
                reject(err);
              }
            });
            // Start cloud uploads immediately
            let azurePromise;
            if (!saveToLocal) {
              azurePromise = saveToAzureStorage(
                context,
                uploadName,
                azureStream,
                null, // containerName ignored
                contentType,
              ).catch(async (err) => {
                cloudUploadError = err;
                // Fallback: try from disk if available
                if (diskWritePromise) {
                  await diskWritePromise;
                  const diskStream = fs.createReadStream(tempFilePath, {
                    highWaterMark: 1024 * 1024,
                    autoClose: true,
                  });
                  return saveToAzureStorage(
                    context,
                    uploadName,
                    diskStream,
                    null,
                    contentType,
                  );
                }
                throw err;
              });
            }
            let gcsPromise;
            if (gcsStream) {
              gcsPromise = saveToGoogleStorage(
                context,
                uploadName,
                gcsStream,
                contentType,
              ).catch(async (err) => {
                cloudUploadError = err;
                if (diskWritePromise) {
                  await diskWritePromise;
                  const diskStream = fs.createReadStream(tempFilePath, {
                    highWaterMark: 1024 * 1024,
                    autoClose: true,
                  });
                  return saveToGoogleStorage(
                    context,
                    uploadName,
                    diskStream,
                    contentType,
                  );
                }
                throw err;
              });
            }
            // Wait for cloud uploads to finish
            try {
              const results = await Promise.all(
                [
                  azurePromise
                    ? azurePromise.then((result) => ({ result, type: "primary" }))
                    : null,
                  !azurePromise && saveToLocal
                    ? Promise.resolve({
                        result: { url: null },
                        type: "primary-local",
                      }) // placeholder for local, url handled later
                    : null,
                  gcsPromise
                    ? gcsPromise.then((gcs) => ({ gcs, type: "gcs" }))
                    : null,
                ].filter(Boolean),
              );
              const result = {
                message: `File '${uploadName}' uploaded successfully.`,
                filename: uploadName,
                displayFilename: displayFilename, // Store original filename in metadata
                ...results.reduce((acc, item) => {
                  if (item.type === "primary") {
                    acc.url = item.result.url || item.result;
                    acc.shortLivedUrl =
                      item.result.shortLivedUrl || item.result.url || item.result;
                  }
                  if (item.type === "gcs") acc.gcs = ensureUnencodedGcsUrl(item.gcs);
                  return acc;
                }, {}),
              };
              if (hash) result.hash = hash;
              // Store MIME type from upload (used by Cortex for file type detection)
              if (contentType) {
                result.mimeType = contentType;
              }
              // Extract contextId from form fields if present
              if (fields && fields.contextId) {
                result.contextId = fields.contextId;
              }
              // All uploads default to temporary (permanent: false) to match file collection logic
              result.permanent = false;
              // Container parameter is ignored - always uses default container from env var
              // Ensure shortLivedUrl is always present
              if (!result.shortLivedUrl && result.url) {
                result.shortLivedUrl = result.url;
              }
              // If saving locally, wait for disk write to finish and then move to public folder
              if (saveToLocal) {
                try {
                  if (diskWritePromise) {
                    await diskWritePromise; // ensure file fully written
                  }
                  const localResult = await saveToLocalStorage(
                    context,
                    requestId,
                    uploadName,
                    fs.createReadStream(tempFilePath, {
                      highWaterMark: 1024 * 1024,
                      autoClose: true,
                    }),
                  );
                  // Handle both old format (string) and new format (object)
                  result.url =
                    typeof localResult === 'string' ? localResult : localResult.url;
                  result.shortLivedUrl = localResult.shortLivedUrl || result.url;
                } catch (err) {
                  console.error("Error saving to local storage:", err);
                  throw err;
                }
              }
              // After original uploads, handle optional conversion
              const conversionService = new FileConversionService(
                context,
                !saveToLocal,
              );
              if (conversionService.needsConversion(fileExtension)) {
                try {
                  context.log("Starting file conversion (busboy)...");
                  // Ensure we have a local copy of the file for conversion
                  let localPathForConversion = tempFilePath;
                  if (!localPathForConversion) {
                    // No temp file was written (saveToLocal === false). Download from primary URL.
                    const tmpDir = fs.mkdtempSync(
                      path.join(os.tmpdir(), "convert-"),
                    );
                    localPathForConversion = path.join(tmpDir, uploadName);
                    await conversionService._downloadFile(
                      result.url,
                      localPathForConversion,
                    );
                  } else {
                    // Wait until disk write completes to guarantee full file is present
                    if (diskWritePromise) {
                      await diskWritePromise;
                    }
                  }
                  // Perform the conversion
                  const conversion = await conversionService.convertFile(
                    localPathForConversion,
                    result.url,
                  );
                  context.log("File conversion completed (busboy):", conversion);
                  if (conversion.converted) {
                    context.log("Saving converted file (busboy)...");
                    // Save converted file to primary storage
                    const convertedSaveResult =
                      await conversionService._saveConvertedFile(
                        conversion.convertedPath,
                        requestId,
                        null,
                        null, // containerName ignored
                      );
                    // Optionally save to GCS
                    let convertedGcsUrl;
                    if (conversionService._isGCSConfigured()) {
                      convertedGcsUrl = await conversionService._uploadChunkToGCS(
                        conversion.convertedPath,
                        requestId,
                      );
                    }
                    // Generate shortLivedUrl for converted file
                    const convertedShortLivedUrl =
                      await generateShortLivedUrlForConvertedFile(
                        context,
                        convertedSaveResult.url,
                        ' (busboy)',
                      );
                    // Determine MIME type of converted file from its URL
                    const convertedMimeType = getMimeTypeFromUrl(
                      convertedSaveResult.url,
                    );
                    // Attach to response body
                    result.converted = {
                      url: convertedSaveResult.url,
                      shortLivedUrl: convertedShortLivedUrl,
                      gcs: convertedGcsUrl,
                      mimeType: convertedMimeType,
                    };
                    // Note: result.shortLivedUrl remains pointing to the original file
                    // result.converted.shortLivedUrl points to the converted file
                    // Both are available for different use cases
                    context.log(
                      "Conversion process (busboy) completed successfully",
                    );
                  }
                } catch (convErr) {
                  console.error("Error converting file (busboy):", convErr);
                  context.log(
                    "Error during conversion (busboy):",
                    convErr.message,
                  );
                  // Continue without failing the upload
                }
              }
              // Respond after conversion (if any)
              context.res = { status: 200, body: result };
              resolve(result);
            } catch (err) {
              console.error("Error in main busboy processing:", err);
              console.error("Stack trace:", err.stack);
              errorOccurred = true;
              reject(err);
            } finally {
              // Clean up temp file if written
              if (tempDir) {
                fs.rmSync(tempDir, { recursive: true, force: true });
              }
            }
          };
          busboy.on("error", (error) => {
            if (errorOccurred) return;
            errorOccurred = true;
            const err = new Error("No file provided in request");
            err.status = 400;
            reject(err);
          });
          busboy.on("finish", () => {
            if (!hasFile) {
              errorOccurred = true;
              const err = new Error("No file provided in request");
              err.status = 400;
              reject(err);
            }
          });
          // Handle errors from piping the request
          req.on("error", (error) => {
            if (errorOccurred) return;
            errorOccurred = true;
            // Only log unexpected errors
            if (error.message !== "No file provided in request") {
              context.log("Error in request stream:", error);
            }
            const err = new Error("No file provided in request");
            err.status = 400;
            reject(err);
          });
          try {
            req.pipe(busboy);
          } catch (error) {
            if (errorOccurred) return;
            errorOccurred = true;
            // Only log unexpected errors
            if (error.message !== "No file provided in request") {
              context.log("Error piping request to busboy:", error);
            }
            const err = new Error("No file provided in request");
            err.status = 400;
            reject(err);
          }
        }
      } catch (error) {
        // Always log errors with full details for debugging
        console.error("Top-level error processing file upload:", error);
        console.error("Error stack trace:", error.stack);
        context.log("Error processing file upload:", error);
        const err = new Error(error.message || "Error processing file upload.");
        err.status = error.status || 500;
        reject(err);
      }
    })();
  });
}
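// Illustrative usage (not part of the original source): uploadBlob consumes a
// multipart/form-data request. A hypothetical HTTP handler might look like
// this, where `context` follows the Azure-Functions-style shape ({ log, res })
// the code above assumes:
//
//   async function handleUpload(context, req) {
//     try {
//       const result = await uploadBlob(context, req);
//       // context.res was set by uploadBlob: { status: 200, body: result }
//       return context.res;
//     } catch (err) {
//       return { status: err.status || 500, body: { error: err.message } };
//     }
//   }
//
// Passing a filePath instead streams a local file without form-data parsing:
//
//   const result = await uploadBlob(context, null, false, "/tmp/audio.mp3");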
upload."); err.status = error.status || 500; reject(err); } })(); }); } // Helper function to handle local file storage async function saveToLocalStorage(context, requestId, encodedFilename, file) { const storageFactory = StorageFactory.getInstance(); const localProvider = storageFactory.getLocalProvider(); const contextWithRequestId = { ...context, requestId }; return await localProvider.uploadStream(contextWithRequestId, encodedFilename, file); } // Helper function to handle Azure blob storage async function saveToAzureStorage(context, encodedFilename, file, containerName = null, contentType = null) { const storageFactory = StorageFactory.getInstance(); const provider = await storageFactory.getAzureProvider(containerName); return await provider.uploadStream(context, encodedFilename, file, contentType); } // Wrapper that checks if GCS is configured async function saveToGoogleStorage(context, encodedFilename, file, contentType = null) { if (!gcs) { throw new Error("Google Cloud Storage is not initialized"); } const storageFactory = StorageFactory.getInstance(); const gcsProvider = storageFactory.getGCSProvider(); if (!gcsProvider) { throw new Error("GCS provider not available"); } return await gcsProvider.uploadStream(context, encodedFilename, file, contentType); } async function uploadFile( context, requestId, body, saveToLocal, file, filename, resolve, hash = null, fields = null, // Optional fields from form data (for contextId) ) { try { if (!file) { context.res = { status: 400, body: "No file provided in request", }; resolve(context.res); return; } const ext = path.extname(filename).toLowerCase(); context.log(`Processing file with extension: ${ext}`); let uploadPath = null; let uploadName = null; let tempDir = null; // Create temp directory for file operations tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "upload-")); const tempOriginal = path.join(tempDir, filename); context.log(`Created temp directory: ${tempDir}`); // Optimize initial write with larger buffer const writeStream = fs.createWriteStream(tempOriginal, { highWaterMark: 1024 * 1024, // 1MB chunks for initial write autoClose: true, }); // Use pipeline with error handling context.log("Writing file to temp location..."); await pipeline(file, writeStream); context.log("File written to temp location successfully"); uploadPath = tempOriginal; // Use the filename that was passed in (which should already be the LLM-friendly name) uploadName = filename; const fileExtension = path.extname(filename); context.log(`Prepared upload name: ${uploadName}`); // Create optimized read streams with larger buffers for storage uploads const createOptimizedReadStream = (path) => fs.createReadStream(path, { highWaterMark: 1024 * 1024, // 1MB chunks for storage uploads autoClose: true, }); // Upload original in parallel with optimized streams const storagePromises = []; context.log("Starting primary storage upload..."); const primaryPromise = saveToLocal ? saveToLocalStorage( context, requestId, uploadName, createOptimizedReadStream(uploadPath), ) : saveToAzureStorage( context, uploadName, createOptimizedReadStream(uploadPath), null, // containerName ignored ); storagePromises.push( primaryPromise.then((result) => { context.log("Primary storage upload completed"); // Handle both old format (string URL) and new format (object with url and shortLivedUrl) const url = typeof result === 'string' ? 
    storagePromises.push(
      primaryPromise.then((result) => {
        context.log("Primary storage upload completed");
        // Handle both old format (string URL) and new format (object with url and shortLivedUrl)
        const url = typeof result === 'string' ? result : (result.url || result);
        const shortLivedUrl = result.shortLivedUrl || url;
        return { url, shortLivedUrl, type: "primary" };
      }),
    );
    if (gcs) {
      context.log("Starting GCS upload...");
      storagePromises.push(
        saveToGoogleStorage(
          context,
          uploadName,
          createOptimizedReadStream(uploadPath),
        ).then((gcsUrl) => {
          context.log("GCS upload completed");
          return {
            gcs: gcsUrl,
            type: "gcs",
          };
        }),
      );
    }
    // Wait for original uploads to complete
    context.log("Waiting for all storage uploads to complete...");
    const results = await Promise.all(storagePromises);
    // Note: filename parameter here is the uploadName (generated short ID), not the original
    // For local file path uploads, we don't have the original filename, so originalFilename will be undefined
    const result = {
      message: `File '${uploadName}' ${saveToLocal ? "saved to folder" : "uploaded"} successfully.`,
      filename: uploadName,
      ...results.reduce((acc, item) => {
        if (item.type === "primary") {
          acc.url = item.url;
          acc.shortLivedUrl = item.shortLivedUrl || item.url;
        }
        if (item.type === "gcs") acc.gcs = ensureUnencodedGcsUrl(item.gcs);
        return acc;
      }, {}),
    };
    if (hash) {
      result.hash = hash;
    }
    // Store MIME type determined from filename (used by Cortex for file type detection)
    const mimeType = mime.lookup(uploadName) || 'application/octet-stream';
    result.mimeType = mimeType;
    // Extract contextId from form fields if present (only available for multipart uploads)
    if (fields && fields.contextId) {
      result.contextId = fields.contextId;
    }
    // All uploads default to temporary (permanent: false) to match file collection logic
    result.permanent = false;
    // Container parameter is ignored - always uses default container from env var
    // Ensure shortLivedUrl is always present
    if (!result.shortLivedUrl && result.url) {
      result.shortLivedUrl = result.url;
    }
    // Initialize conversion service
    const conversionService = new FileConversionService(context, !saveToLocal);
    // Check if file needs conversion and handle it
    if (conversionService.needsConversion(fileExtension)) {
      try {
        context.log("Starting file conversion...");
        // Convert the file
        const conversion = await conversionService.convertFile(
          uploadPath,
          result.url,
        );
        context.log("File conversion completed:", conversion);
        if (conversion.converted) {
          context.log("Saving converted file...");
          // Save converted file
          const convertedSaveResult =
            await conversionService._saveConvertedFile(
              conversion.convertedPath,
              requestId,
              null,
              null, // containerName ignored
            );
          context.log("Converted file saved to primary storage");
          // If GCS is configured, also save to GCS
          let convertedGcsUrl;
          if (conversionService._isGCSConfigured()) {
            context.log("Saving converted file to GCS...");
            convertedGcsUrl = await conversionService._uploadChunkToGCS(
              conversion.convertedPath,
              requestId,
            );
            context.log("Converted file saved to GCS");
          }
          // Generate shortLivedUrl for converted file
          const convertedShortLivedUrl =
            await generateShortLivedUrlForConvertedFile(
              context,
              convertedSaveResult.url,
            );
          // Determine MIME type of converted file from its URL
          const convertedMimeType = getMimeTypeFromUrl(convertedSaveResult.url);
          // Add converted file info to result
          result.converted = {
            url: convertedSaveResult.url,
            shortLivedUrl: convertedShortLivedUrl,
            gcs: convertedGcsUrl,
            mimeType: convertedMimeType,
          };
          // Note: result.shortLivedUrl remains pointing to the original file
          // result.converted.shortLivedUrl points to the converted file
          // Both are available for different use cases
          context.log("Conversion process completed successfully");
        }
      } catch (error) {
converting file:", error); context.log("Error during conversion:", error.message); // Don't fail the upload if conversion fails } } context.res = { status: 200, body: result, }; // Clean up temp files context.log("Cleaning up temporary files..."); if (tempDir) { fs.rmSync(tempDir, { recursive: true, force: true }); context.log("Temporary files cleaned up"); } context.log("Upload process completed successfully"); resolve(result); } catch (error) { context.log("Error in upload process:", error); if (body.url) { try { // Container parameter is ignored - always uses default container from env var await cleanup(context, [body.url]); } catch (cleanupError) { context.log("Error during cleanup after failure:", cleanupError); } } throw error; } } // Helper to convert a stream to a buffer async function streamToBuffer(stream) { return new Promise((resolve, reject) => { const chunks = []; stream.on("data", (chunk) => chunks.push(chunk)); stream.on("end", () => resolve(Buffer.concat(chunks))); stream.on("error", reject); }); } // Function to delete files that haven't been used in more than a month // Container parameter is ignored - always uses default container from env var async function cleanup(context, urls = null) { const { containerClient } = await getBlobClient(); const cleanedURLs = []; if (!urls) { const xMonthAgo = new Date(); xMonthAgo.setMonth(xMonthAgo.getMonth() - 1); const blobs = containerClient.listBlobsFlat(); for await (const blob of blobs) { const lastModified = blob.properties.lastModified; if (lastModified < xMonthAgo) { try { const blockBlobClient = containerClient.getBlockBlobClient(blob.name); await blockBlobClient.delete(); context.log(`Cleaned blob: ${blob.name}`); cleanedURLs.push(blob.name); } catch (error) { if (error.statusCode !== 404) { context.log(`Error cleaning blob ${blob.name}:`, error); } } } } } else { for (const url of urls) { try { const blobName = url.replace(containerClient.url, ""); const blockBlobClient = containerClient.getBlockBlobClient(blobName); await blockBlobClient.delete(); context.log(`Cleaned blob: ${blobName}`); cleanedURLs.push(blobName); } catch (error) { if (error.statusCode !== 404) { context.log(`Error cleaning blob ${url}:`, error); } } } } return cleanedURLs; } async function cleanupGCS(urls = null) { if (!gcs) return []; const bucket = gcs.bucket(GCS_BUCKETNAME); const directories = new Set(); const cleanedURLs = []; if (!urls) { const daysN = 30; const threshold = Date.now() - daysN * 24 * 60 * 60 * 1000; const [files] = await bucket.getFiles(); for (const file of files) { const [metadata] = await file.getMetadata(); const directoryPath = path.dirname(file.name); directories.add(directoryPath); if (metadata.updated) { const updatedTime = new Date(metadata.updated).getTime(); if (updatedTime < threshold) { await file.delete(); cleanedURLs.push(file.name); } } } } else { for (const url of urls) { const filePath = url.split("/").slice(3).join("/"); const file = bucket.file(filePath); const directoryPath = path.dirname(file.name); directories.add(directoryPath); await file.delete(); cleanedURLs.push(url); } } for (const directory of directories) { const [files] = await bucket.getFiles({ prefix: directory }); if (files.length === 0) { await bucket.deleteFiles({ prefix: directory }); } } return cleanedURLs; } async function deleteGCS(blobName) { if (!blobName) { console.log("[deleteGCS] No blobName provided, skipping GCS deletion"); return; } if (!gcs) { console.log("[deleteGCS] GCS not initialized, skipping deletion"); return; } try { if 
async function deleteGCS(blobName) {
  if (!blobName) {
    console.log("[deleteGCS] No blobName provided, skipping GCS deletion");
    return;
  }
  if (!gcs) {
    console.log("[deleteGCS] GCS not initialized, skipping deletion");
    return;
  }
  try {
    if (process.env.STORAGE_EMULATOR_HOST) {
      console.log(
        `[deleteGCS] Using emulator at ${process.env.STORAGE_EMULATOR_HOST}`,
      );
      console.log(
        `[deleteGCS] Attempting to delete files with prefix: ${blobName}`,
      );
      // List files first
      const listUrl = `${process.env.STORAGE_EMULATOR_HOST}/storage/v1/b/${GCS_BUCKETNAME}/o?prefix=${blobName}`;
      console.log(`[deleteGCS] Listing files with URL: ${listUrl}`);
      const listResponse = await axios.get(listUrl, {
        validateStatus: (status) => true,
      });
      console.log(`[deleteGCS] List response status: ${listResponse.status}`);
      console.log(
        `[deleteGCS] List response data: ${JSON.stringify(listResponse.data)}`,
      );
      if (listResponse.status === 200 && listResponse.data.items) {
        console.log(
          `[deleteGCS] Found ${listResponse.data.items.length} items to delete`,
        );
        // Delete each file
        for (const item of listResponse.data.items) {
          const deleteUrl = `${process.env.STORAGE_EMULATOR_HOST}/storage/v1/b/${GCS_BUCKETNAME}/o/${encodeURIComponent(item.name)}`;
          console.log(`[deleteGCS] Deleting file: ${item.name}`);
          console.log(`[deleteGCS] Delete URL: ${deleteUrl}`);
          const deleteResponse = await axios.delete(deleteUrl, {
            validateStatus: (status) => true,
            headers: {
              "Content-Type": "application/json",
            },
          });
          console.log(
            `[deleteGCS] Delete response status: ${deleteResponse.status}`,
          );
          console.log(
            `[deleteGCS] Delete response data: ${JSON.stringify(deleteResponse.data)}`,
          );
        }
        console.log("[deleteGCS] All files deleted successfully");
      } else {
        console.log("[deleteGCS] No files found to delete");
      }
    } else {
      console.log("[deleteGCS] Using real GCS");
      const bucket = gcs.bucket(GCS_BUCKETNAME);
      const [files] = await bucket.getFiles({ prefix: blobName });
      console.log(`[deleteGCS] Found ${files.length} files to delete`);
      if (files.length > 0) {
        await Promise.all(files.map((file) => file.delete()));
        console.log("[deleteGCS] All files deleted successfully");
      } else {
        console.log("[deleteGCS] No files found to delete");
      }
    }
  } catch (error) {
    // If we get a 404 error, it means the file is already gone, which is fine
    if (error.response?.status === 404 || error.code === 404) {
      console.log(
        "[deleteGCS] File not found in GCS (404) - this is expected if file was already deleted",
      );
      return;
    }
    console.error("[deleteGCS] Error during deletion:", error);
    console.error("[deleteGCS] Error details:", {
      message: error.message,
      code: error.code,
      errors: error.errors,
      response: error.response
        ? {
            status: error.response.status,
            statusText: error.response.statusText,
            data: error.response.data,
            headers: error.response.headers,
          }
        : null,
    });
    // Don't throw the error - we want to continue with cleanup even if GCS deletion fails
  }
}

// Helper function to ensure GCS upload for existing files
async function ensureGCSUpload(context, existingFile) {
  if (!existingFile.gcs && gcs) {
    context.log("GCS file was missing - uploading.");
    // Use LLM-friendly naming instead of extracting original filename
    const fileExtension = path.extname(existingFile.url.split("?")[0]);
    const shortId = generateShortId();
    const fileName = `${shortId}${fileExtension}`;
    const response = await axios({
      method: "get",
      url: existingFile.url,
      responseType: "stream",
    });
    const storageFactory = StorageFactory.getInstance();
    const gcsProvider = storageFactory.getGCSProvider();
    if (gcsProvider) {
      existingFile.gcs = await gcsProvider.uploadStream(
        context,
        fileName,
        response.data,
      );
    }
  }
  return existingFile;
}

async function uploadChunkToGCS(chunkPath, requestId, filename = null) {
  if (!gcs) return null;
  const dirName = requestId || uuidv4();
  // Use provided filename or generate LLM-friendly naming
  let gcsFileName;
  if (filename) {
    gcsFileName = `${dirName}/${filename}`;
  } else {
    const fileExtension = path.extname(chunkPath);
    const shortId = generateShortId();
    gcsFileName = `${dirName}/${shortId}${fileExtension}`;
  }
  await gcs
    .bucket(GCS_BUCKETNAME)
    .upload(chunkPath, { destination: gcsFileName });
  return `gs://${GCS_BUCKETNAME}/${gcsFileName}`;
}

export {
  saveFileToBlob,
  deleteBlob,
  deleteGCS,
  uploadBlob,
  cleanup,
  cleanupGCS,
  gcsUrlExists,
  ensureGCSUpload,
  gcs,
  uploadChunkToGCS,
  downloadFromGCS,
  getMimeTypeFromUrl,
  // Re-export container constants
  getDefaultContainerName,
  GCS_BUCKETNAME,
  AZURE_STORAGE_CONTAINER_NAME,
};
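// Illustrative end-to-end usage of this module (not part of the original
// source; the import path and bucket contents are hypothetical, and GCS
// credentials are assumed to be configured):
//
//   import {
//     uploadChunkToGCS,
//     gcsUrlExists,
//     downloadFromGCS,
//     deleteGCS,
//   } from "./blobHandler.js"; // hypothetical path to this file
//
//   const gcsUrl = await uploadChunkToGCS("/tmp/audio.mp3", "req-42");
//   console.log(gcsUrl); // e.g. gs://<GCS_BUCKETNAME>/req-42/<shortId>.mp3
//   if (await gcsUrlExists(gcsUrl)) {
//     await downloadFromGCS(gcsUrl, "/tmp/audio-copy.mp3");
//   }
//   await deleteGCS("req-42"); // removes everything under the request prefix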