UNPKG

@aj-archipelago/cortex

Version:

Cortex is a GraphQL API for AI. It provides a simple, extensible interface for using AI services from OpenAI, Azure and others.

309 lines (259 loc) 11.9 kB
import fs from 'fs/promises';
import os from 'os';
import path from 'path';
import { createReadStream, createWriteStream } from 'fs';
import { pipeline } from 'stream/promises';
import axios from 'axios';
import XLSX from 'xlsx';
import { CONVERTED_EXTENSIONS } from '../constants.js';
import { v4 as uuidv4 } from 'uuid';
import { sanitizeFilename } from '../utils/filenameUtils.js';

const MARKITDOWN_CONVERT_URL = process.env.MARKITDOWN_CONVERT_URL;

if (!MARKITDOWN_CONVERT_URL) {
    throw new Error('MARKITDOWN_CONVERT_URL is not set');
}

// Prefix of the scratch directories created by convertFile() under os.tmpdir().
const CONVERT_TEMP_PREFIX = 'convert-';

// Extensions routed to the spreadsheet -> CSV conversion.
const SPREADSHEET_EXTENSIONS = ['.xlsx', '.xls'];

// Extensions routed to the document -> markdown conversion.
const DOCUMENT_EXTENSIONS = ['.docx', '.doc', '.ppt', '.pptx'];

// Buffer size used when streaming the intermediate CSV into its final location.
const COPY_HIGH_WATER_MARK = 64 * 1024;

/**
 * Converts spreadsheets to CSV and documents to markdown.
 *
 * The storage-related methods (`_urlExists`, `_downloadFile`,
 * `_saveConvertedFile`, ...) throw by default and must be overridden
 * by the code that uses this class.
 */
export class ConversionService {
    /**
     * @param {Object} context - Object exposing a `log(...args)` method used for all logging
     */
    constructor(context) {
        this.context = context;
    }

    /**
     * Determines if a file needs conversion based on its extension
     * @param {string} filename - A filename/path, or a bare extension such as ".docx"
     * @returns {boolean} - Whether the extension is listed in CONVERTED_EXTENSIONS;
     *   false for non-string input
     */
    needsConversion(filename) {
        if (typeof filename !== 'string') {
            return false;
        }

        const input = filename.toLowerCase();

        // A leading dot without any path separator means we were handed a raw extension
        const looksLikeExtension =
            input.startsWith('.') && !input.includes('/') && !input.includes('\\');

        const ext = looksLikeExtension ? input : path.extname(input);
        return CONVERTED_EXTENSIONS.includes(ext);
    }

    /**
     * Converts a file to its appropriate format.
     *
     * On success the converted file lives in a fresh temporary directory that
     * this method does not remove; the caller is responsible for deleting it.
     * When no conversion happens (or an error is thrown) that directory is removed here.
     *
     * @param {string} filePath - Path to the file to convert
     * @param {string} originalUrl - Original URL of the file (required for document conversion)
     * @param {boolean} forceConversion - If true, bypasses extension check and forces document conversion
     * @returns {Promise<{convertedPath: string, convertedName: string, converted: boolean}>}
     *   `{ converted: false }` (no other fields) when the file type needs no conversion
     */
    async convertFile(filePath, originalUrl = null, forceConversion = false) {
        this.context.log('Converting file:', { filePath, originalUrl, forceConversion });

        // Clean the file path by removing any query parameters
        const cleanFilePath = sanitizeFilename(filePath.split('?')[0]);
        const ext = path.extname(cleanFilePath).toLowerCase();
        const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), CONVERT_TEMP_PREFIX));

        try {
            // forceConversion skips the extension check and always goes through markdown conversion
            if (forceConversion || DOCUMENT_EXTENSIONS.includes(ext)) {
                return await this._handleDocumentConversion(filePath, originalUrl, tempDir);
            }

            if (SPREADSHEET_EXTENSIONS.includes(ext)) {
                return await this._handleExcelConversion(filePath, tempDir);
            }
        } catch (error) {
            this.context.log('Error in convertFile:', error);
            // Best-effort cleanup so a cleanup failure never hides the real error
            await this._removeDirQuietly(tempDir);
            throw error;
        }

        this.context.log('No conversion needed for this file type');
        // Nothing was produced, so the scratch directory must not be left behind
        await this._removeDirQuietly(tempDir);
        return { converted: false };
    }

    /**
     * Ensures a file has both original and converted versions
     * @param {Object} fileInfo - Information about the file; `url` is read, and
     *   `converted` (if present) is treated as previously stored conversion info
     * @param {string} requestId - Request ID for storage
     * @returns {Promise<Object>} - fileInfo with a `converted: { url, gcs }` entry when a
     *   converted version exists or was created; the unchanged fileInfo when the
     *   file needs no conversion or the conversion failed
     */
    async ensureConvertedVersion(fileInfo, requestId) {
        const { url } = fileInfo;

        // Remove any query parameters before extension check
        const extension = path.extname(url.split('?')[0]).toLowerCase();

        // If file doesn't need conversion, return original info
        if (!this.needsConversion(extension)) {
            return fileInfo;
        }

        // Work with any converted info already stored inside the main hash element
        const convertedInfo = fileInfo.converted;

        if (convertedInfo) {
            // Verify both primary and GCS URLs exist
            const gcsConfigured = this._isGCSConfigured();
            const primaryExists = await this._urlExists(convertedInfo?.url);
            const gcsExists = gcsConfigured
                ? await this._gcsUrlExists(convertedInfo?.gcs)
                : false;

            if (primaryExists.valid && (!gcsConfigured || gcsExists)) {
                return { ...fileInfo, converted: convertedInfo };
            }

            this.context.log('Conversion needed - missing URLs:', {
                primaryExists: primaryExists.valid,
                gcsExists,
                convertedInfo,
            });
        } else {
            this.context.log('Conversion needed - no converted info in map');
        }

        // Tracked outside the try block so that `finally` can clean up whatever was created
        let tempDir;
        let downloadedFile;
        let convertedPath;

        try {
            const downloadDir = path.join(os.tmpdir(), `${uuidv4()}`);
            await fs.mkdir(downloadDir);
            tempDir = downloadDir;

            // Ensure we strip any query parameters from the URL when determining the local filename
            const cleanUrlPath = url.split('?')[0];
            downloadedFile = path.join(tempDir, path.basename(cleanUrlPath));
            await this._downloadFile(url, downloadedFile);

            const conversion = await this.convertFile(downloadedFile, url);
            if (!conversion.converted) {
                throw new Error('File conversion failed');
            }
            convertedPath = conversion.convertedPath;

            // Save converted file to primary storage
            const convertedSaveResult = await this._saveConvertedFile(convertedPath, requestId);
            if (!convertedSaveResult) {
                throw new Error('Failed to save converted file to primary storage');
            }

            // Checked before the GCS upload so a failed primary save does not leave an orphaned GCS copy
            if (!convertedSaveResult.url) {
                throw new Error('Failed to get primary URL for converted file');
            }

            // If GCS is configured, also save to GCS
            let gcsUrl;
            if (this._isGCSConfigured()) {
                gcsUrl = await this._uploadChunkToGCS(convertedPath, requestId);
            }

            // Attach converted info directly to the main file record –
            // the caller (index.js) will persist the updated fileInfo
            return {
                ...fileInfo,
                converted: { url: convertedSaveResult.url, gcs: gcsUrl },
            };
        } catch (error) {
            this.context.log('Error ensuring converted version:', error);
            // Don't return partial conversion results
            return fileInfo;
        } finally {
            // Runs on success and on failure so temp files never accumulate
            await this._cleanupTempFiles(downloadedFile, convertedPath, tempDir);
            await this._removeConversionDir(convertedPath);
        }
    }

    // Private helper methods

    /**
     * Converts a spreadsheet to CSV and places the CSV inside tempDir.
     * @param {string} filePath - Path to the spreadsheet
     * @param {string} tempDir - Directory that receives the resulting CSV
     * @returns {Promise<{convertedPath: string, convertedName: string, converted: boolean}>}
     */
    async _handleExcelConversion(filePath, tempDir) {
        this.context.log('Handling Excel file conversion');

        // _xlsxToCsv writes the CSV next to the source file; it is then copied into tempDir
        const csvPath = await this._xlsxToCsv(filePath);
        const ext = path.extname(filePath);
        const convertedPath = path.join(tempDir, `${path.basename(filePath, ext)}.csv`);

        try {
            await pipeline(
                createReadStream(csvPath, { highWaterMark: COPY_HIGH_WATER_MARK }),
                createWriteStream(convertedPath, { highWaterMark: COPY_HIGH_WATER_MARK }),
            );
        } finally {
            // Remove the intermediate CSV even when the copy fails
            await fs.rm(csvPath, { force: true });
        }

        return {
            convertedPath,
            convertedName: path.basename(convertedPath),
            converted: true,
        };
    }

    /**
     * Converts a document to markdown via the conversion endpoint and writes
     * the result as a `.md` file inside tempDir.
     * @param {string} filePath - Path (or URL-like path) used to derive the output filename
     * @param {string} originalUrl - URL of the document handed to the conversion endpoint
     * @param {string} tempDir - Directory that receives the markdown file
     * @returns {Promise<{convertedPath: string, convertedName: string, converted: boolean}>}
     * @throws {Error} If originalUrl is missing or the conversion yields no markdown
     */
    async _handleDocumentConversion(filePath, originalUrl, tempDir) {
        this.context.log('Handling document conversion');

        if (!originalUrl) {
            throw new Error('Original URL is required for document conversion');
        }

        const markdown = await this._convertToMarkdown(originalUrl);
        if (!markdown) {
            throw new Error('Markdown conversion returned empty result');
        }

        // Remove any query parameters from the file path before processing
        const cleanFilePath = filePath.split('?')[0];
        const ext = path.extname(cleanFilePath);
        const encodedBase = path.basename(cleanFilePath, ext);

        let decodedBase;
        try {
            decodedBase = decodeURIComponent(encodedBase);
        } catch {
            // Malformed percent-escapes make decodeURIComponent throw; keep the raw name instead
            decodedBase = encodedBase;
        }

        // Decoding can reveal path separators (e.g. "%2F"); keep only the final
        // segment so the file is always written directly inside tempDir
        const baseFilename = path.basename(decodedBase);
        const convertedPath = path.join(tempDir, `${baseFilename}.md`);
        await fs.writeFile(convertedPath, markdown);

        return {
            convertedPath,
            convertedName: path.basename(convertedPath),
            converted: true,
        };
    }

    /**
     * Requests a markdown rendering of the file at fileUrl.
     * @param {string} fileUrl - URL of the file; URI-encoded and appended to MARKITDOWN_CONVERT_URL
     * @returns {Promise<string>} - The `markdown` field of the response body, or '' when it is falsy
     * @throws Rethrows any request error after logging it
     */
    async _convertToMarkdown(fileUrl) {
        try {
            const apiUrl = `${MARKITDOWN_CONVERT_URL}${encodeURIComponent(fileUrl)}`;
            const response = await axios.get(apiUrl);
            return response.data.markdown || '';
        } catch (err) {
            this.context.log('Error converting to markdown:', err);
            throw err;
        }
    }

    /**
     * Writes every sheet of a workbook into a single CSV file located next to
     * the source file (same directory and base name, `.csv` extension).
     * @param {string} filePath - Path to the workbook
     * @returns {Promise<string>} - Path of the CSV file that was written
     */
    async _xlsxToCsv(filePath) {
        const workbook = XLSX.readFile(filePath, { type: 'buffer' });

        // Built with the path API so an extension-less input yields "<name>.csv"
        // rather than resolving to the source file itself
        const { root, dir, name } = path.parse(filePath);
        const outputPath = path.format({ root, dir, name, ext: '.csv' });

        // Each sheet is emitted as a "Sheet: <name>" header followed by its CSV and a blank line
        const csvContent = workbook.SheetNames
            .map((sheetName) => {
                const csv = XLSX.utils.sheet_to_csv(workbook.Sheets[sheetName]);
                return `Sheet: ${sheetName}\n${csv}\n\n`;
            })
            .join('');

        await fs.writeFile(outputPath, csvContent);
        return outputPath;
    }

    /**
     * Removes a directory tree, logging (never throwing) on failure.
     * @param {string} dir - Directory to remove
     * @returns {Promise<void>}
     */
    async _removeDirQuietly(dir) {
        try {
            await fs.rm(dir, { recursive: true, force: true });
        } catch (err) {
            this.context.log('Error cleaning up temp file:', err);
        }
    }

    /**
     * Removes the scratch directory that convertFile() created for a converted file.
     * Does nothing unless the file's parent directory sits directly under
     * os.tmpdir() and carries the convertFile() prefix, so an unexpected
     * convertedPath can never cause an unrelated directory to be deleted.
     * @param {string} [convertedPath] - Path of the converted file, if any
     * @returns {Promise<void>}
     */
    async _removeConversionDir(convertedPath) {
        if (!convertedPath) {
            return;
        }

        const parentDir = path.dirname(convertedPath);
        const isUnderTmp = path.resolve(path.dirname(parentDir)) === path.resolve(os.tmpdir());
        const hasPrefix = path.basename(parentDir).startsWith(CONVERT_TEMP_PREFIX);

        if (isUnderTmp && hasPrefix) {
            await this._removeDirQuietly(parentDir);
        }
    }

    // Storage-related methods (to be implemented by the caller)

    /** Placeholder; throws unless overridden. */
    async _getFileStoreMap(key) {
        throw new Error('Method _getFileStoreMap must be implemented');
    }

    /** Placeholder; throws unless overridden. */
    async _setFileStoreMap(key, value) {
        throw new Error('Method _setFileStoreMap must be implemented');
    }

    /**
     * Placeholder; throws unless overridden.
     * ensureConvertedVersion() reads a `valid` property from the resolved value.
     */
    async _urlExists(url) {
        throw new Error('Method _urlExists must be implemented');
    }

    /**
     * Placeholder; throws unless overridden.
     * ensureConvertedVersion() uses the resolved value as a truthy/falsy flag.
     */
    async _gcsUrlExists(url) {
        throw new Error('Method _gcsUrlExists must be implemented');
    }

    /** Placeholder; throws unless overridden. Called with a source URL and a local destination path. */
    async _downloadFile(url, destination) {
        throw new Error('Method _downloadFile must be implemented');
    }

    /**
     * Placeholder; throws unless overridden.
     * ensureConvertedVersion() reads a `url` property from the resolved value.
     */
    async _saveConvertedFile(filePath, requestId) {
        throw new Error('Method _saveConvertedFile must be implemented');
    }

    /**
     * Placeholder; throws unless overridden.
     * ensureConvertedVersion() stores the resolved value as the converted file's `gcs` entry.
     */
    async _uploadChunkToGCS(filePath, requestId) {
        throw new Error('Method _uploadChunkToGCS must be implemented');
    }

    /** Placeholder; throws unless overridden. Its return value is used as a boolean flag. */
    _isGCSConfigured() {
        throw new Error('Method _isGCSConfigured must be implemented');
    }

    /**
     * Deletes the given files and directories on a best-effort basis.
     * Falsy entries and paths that do not exist are skipped; failures are
     * logged and never thrown.
     * @param {...string} files - Paths of files or directories to remove
     * @returns {Promise<void>}
     */
    async _cleanupTempFiles(...files) {
        for (const file of files) {
            if (!file) continue;

            try {
                // lstat doubles as the existence check: a missing path resolves to null
                const stats = await fs.lstat(file).catch(() => null);
                if (!stats) continue;

                if (stats.isDirectory()) {
                    await fs.rm(file, { recursive: true, force: true });
                } else {
                    await fs.unlink(file);
                }
            } catch (err) {
                this.context.log('Error cleaning up temp file:', err);
            }
        }
    }
}