@aj-archipelago/cortex
Version:
Cortex is a GraphQL API for AI. It provides a simple, extensible interface for using AI services from OpenAI, Azure and others.
413 lines (354 loc) • 13.2 kB
JavaScript
import fs from "fs/promises";
import os from "os";
import path from "path";
import { createReadStream, createWriteStream } from "fs";
import { pipeline } from "stream/promises";
import axios from "axios";
import FormData from "form-data";
import XLSX from "xlsx";
import { CONVERTED_EXTENSIONS } from "../constants.js";
import { v4 as uuidv4 } from "uuid";
import { sanitizeFilename, generateShortId } from "../utils/filenameUtils.js";
// Read service URLs at call time to allow tests to mutate process.env
function getMarkitdownUrl() {
return process.env.MARKITDOWN_CONVERT_URL || null;
}
function getDocToPdfUrl() {
return process.env.DOC_TO_PDF_SERVICE_URL || null;
}
export class ConversionService {
constructor(context) {
this.context = context;
}
/**
* Determines if a file needs conversion based on its extension
* @param {string} filename - The name of the file to check
* @returns {boolean} - Whether the file needs conversion
*/
needsConversion(filename) {
// Accept either a full filename/path or a raw extension (e.g. ".docx")
const input = filename.toLowerCase();
// If the input looks like an extension already, check directly
if (
input.startsWith(".") &&
!input.includes("/") &&
!input.includes("\\")
) {
return CONVERTED_EXTENSIONS.includes(input);
}
// Otherwise, extract the extension from the filename/path
const ext = path.extname(input).toLowerCase();
return CONVERTED_EXTENSIONS.includes(ext);
}
/**
* Converts a file to its appropriate format
* @param {string} filePath - Path to the file to convert
* @param {string} originalUrl - Original URL of the file (required for document conversion)
* @param {boolean} forceConversion - If true, bypasses extension check and forces document conversion
* @returns {Promise<{convertedPath: string, convertedName: string, converted: boolean}>}
*/
async convertFile(filePath, originalUrl = null, forceConversion = false) {
this.context.log("Converting file:", {
filePath,
originalUrl,
forceConversion,
});
// Clean the file path by removing any query parameters
const cleanFilePath = sanitizeFilename(filePath.split("?")[0]);
const ext = path.extname(cleanFilePath).toLowerCase();
const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "convert-"));
try {
// If forceConversion is true, directly handle as document conversion
if (forceConversion) {
return await this._handleDocumentConversion(
filePath,
originalUrl,
tempDir,
);
}
// Handle Excel files
if (ext === ".xlsx" || ext === ".xls") {
return await this._handleExcelConversion(filePath, tempDir);
}
// Handle documents that need markdown conversion
if ([".docx", ".doc", ".ppt", ".pptx"].includes(ext)) {
return await this._handleDocumentConversion(
filePath,
originalUrl,
tempDir,
);
}
this.context.log("No conversion needed for this file type");
return { converted: false };
} catch (error) {
this.context.log("Error in convertFile:", error);
// Clean up temp directory on error
await fs.rm(tempDir, { recursive: true, force: true });
throw error;
}
}
/**
* Ensures a file has both original and converted versions
* @param {Object} fileInfo - Information about the file
* @param {string} requestId - Request ID for storage
* @returns {Promise<Object>} - Updated file info with conversion if needed
*/
async ensureConvertedVersion(fileInfo, requestId) {
const { url, gcs } = fileInfo;
// Remove any query parameters before extension check
const extension = path.extname(url.split("?")[0]).toLowerCase();
// If file doesn't need conversion, return original info
if (!this.needsConversion(extension)) {
return fileInfo;
}
// Work with any converted info already stored inside the main hash element
const convertedInfo = fileInfo.converted;
let needsConversion = false;
if (convertedInfo) {
// Verify both primary and GCS URLs exist
const primaryExists = await this._urlExists(convertedInfo?.url);
const gcsExists = this._isGCSConfigured()
? await this._gcsUrlExists(convertedInfo?.gcs)
: false;
// If both URLs exist, return the info
if (primaryExists.valid && (!this._isGCSConfigured() || gcsExists)) {
return { ...fileInfo, converted: convertedInfo };
}
// If either URL is missing, we need to convert
needsConversion = true;
this.context.log("Conversion needed - missing URLs:", {
primaryExists: primaryExists.valid,
gcsExists,
convertedInfo,
});
} else {
needsConversion = true;
this.context.log("Conversion needed - no converted info in map");
}
// If conversion is needed, create it
if (needsConversion) {
try {
const tempDir = path.join(os.tmpdir(), `${uuidv4()}`);
await fs.mkdir(tempDir);
// Ensure we strip any query parameters from the URL when determining the local filename
const cleanUrlPath = url.split("?")[0];
const downloadedFile = path.join(tempDir, path.basename(cleanUrlPath));
await this._downloadFile(url, downloadedFile);
// Convert the file
const conversion = await this.convertFile(downloadedFile, url);
if (!conversion.converted) {
throw new Error("File conversion failed");
}
// Save converted file to primary storage
// Container parameter is ignored - always uses default container from env var
const convertedSaveResult = await this._saveConvertedFile(
conversion.convertedPath,
requestId,
null,
);
if (!convertedSaveResult) {
throw new Error("Failed to save converted file to primary storage");
}
// If GCS is configured, also save to GCS
let gcsUrl;
if (this._isGCSConfigured()) {
gcsUrl = await this._uploadChunkToGCS(
conversion.convertedPath,
requestId,
);
}
// Store converted file info
const convertedFileInfo = {
url: convertedSaveResult.url,
gcs: gcsUrl,
};
// Attach converted info directly to the main file record –
// the caller (index.js) will persist the updated fileInfo
if (!convertedFileInfo.url) {
throw new Error("Failed to get primary URL for converted file");
}
// Cleanup temp files
await this._cleanupTempFiles(
downloadedFile,
conversion.convertedPath,
tempDir,
);
return { ...fileInfo, converted: convertedFileInfo };
} catch (error) {
this.context.log("Error ensuring converted version:", error);
// Don't return partial conversion results
return fileInfo;
}
}
return fileInfo;
}
// Private helper methods
async _handleExcelConversion(filePath, tempDir) {
this.context.log("Handling Excel file conversion");
const csvPath = await this._xlsxToCsv(filePath);
const ext = path.extname(filePath);
const convertedPath = path.join(
tempDir,
`${path.basename(filePath, ext)}.csv`,
);
await pipeline(
createReadStream(csvPath, { highWaterMark: 64 * 1024 }),
createWriteStream(convertedPath, { highWaterMark: 64 * 1024 }),
);
await fs.unlink(csvPath);
return {
convertedPath,
convertedName: path.basename(convertedPath),
converted: true,
};
}
async _handleDocumentConversion(filePath, originalUrl, tempDir) {
// Default: Try PDF conversion if service is configured
const pdfServiceUrl = getDocToPdfUrl();
if (pdfServiceUrl) {
this.context.log("PDF service configured - converting to PDF");
try {
const pdfPath = await this._convertToPDF(filePath, tempDir);
if (pdfPath) {
return {
convertedPath: pdfPath,
convertedName: path.basename(pdfPath),
converted: true,
};
}
} catch (error) {
this.context.log("PDF conversion failed, falling back to markdown:", error.message);
}
} else {
this.context.log("PDF service not configured - using markdown conversion");
}
// Fallback to markdown if PDF service not configured or conversion fails
if (!originalUrl) {
throw new Error("Original URL is required for document conversion");
}
const markdown = await this._convertToMarkdown(originalUrl);
if (!markdown) {
throw new Error("Markdown conversion failed");
}
const shortId = generateShortId();
const convertedPath = path.join(tempDir, `${shortId}.md`);
await fs.writeFile(convertedPath, markdown);
return {
convertedPath,
convertedName: path.basename(convertedPath),
converted: true,
};
}
/**
* Convert document to PDF using streaming upload
* @param {string} filePath - Local path to file
* @param {string} tempDir - Temporary directory for output
* @returns {Promise<string>} - Path to converted PDF
*/
async _convertToPDF(filePath, tempDir) {
try {
const pdfServiceUrl = getDocToPdfUrl();
if (!pdfServiceUrl) {
throw new Error("DOC_TO_PDF_SERVICE_URL is not configured");
}
this.context.log("Converting to PDF via service:", pdfServiceUrl);
// Create form data with file stream
const form = new FormData();
form.append('file', createReadStream(filePath), path.basename(filePath));
// Upload with streaming
const response = await axios({
method: 'POST',
url: pdfServiceUrl,
data: form,
headers: form.getHeaders(),
responseType: 'stream',
maxContentLength: Infinity,
maxBodyLength: Infinity,
timeout: 60000, // 60 second timeout
});
// Stream PDF to temp file using original filename with .pdf extension
const originalBase = path.basename(filePath);
const baseWithoutExt = originalBase.includes('.')
? originalBase.replace(/\.[^/.]+$/, '')
: originalBase;
const pdfPath = path.join(tempDir, `${baseWithoutExt}.pdf`);
const writer = createWriteStream(pdfPath);
await pipeline(response.data, writer);
this.context.log("PDF conversion successful:", pdfPath);
return pdfPath;
} catch (error) {
this.context.log("PDF conversion error:", error.message);
throw error;
}
}
async _convertToMarkdown(fileUrl) {
try {
const markitdownUrl = getMarkitdownUrl();
if (!markitdownUrl) {
throw new Error("MARKITDOWN_CONVERT_URL is not set");
}
const apiUrl = `${markitdownUrl}${encodeURIComponent(fileUrl)}`;
const response = await axios.get(apiUrl);
return response.data.markdown || "";
} catch (err) {
this.context.log("Error converting to markdown:", err);
throw err;
}
}
async _xlsxToCsv(filePath) {
const workbook = XLSX.readFile(filePath, { type: "buffer" });
const outputPath = filePath.replace(/\.[^/.]+$/, ".csv");
let csvContent = "";
workbook.SheetNames.forEach((sheetName) => {
const sheet = workbook.Sheets[sheetName];
const csv = XLSX.utils.sheet_to_csv(sheet);
csvContent += `Sheet: ${sheetName}\n${csv}\n\n`;
});
await fs.writeFile(outputPath, csvContent);
return outputPath;
}
// Storage-related methods (to be implemented by the caller)
async _getFileStoreMap(key) {
throw new Error("Method _getFileStoreMap must be implemented");
}
async _setFileStoreMap(key, value) {
throw new Error("Method _setFileStoreMap must be implemented");
}
async _urlExists(url) {
throw new Error("Method _urlExists must be implemented");
}
async _gcsUrlExists(url) {
throw new Error("Method _gcsUrlExists must be implemented");
}
async _downloadFile(url, destination) {
throw new Error("Method _downloadFile must be implemented");
}
async _saveConvertedFile(filePath, requestId, filename = null) {
throw new Error("Method _saveConvertedFile must be implemented");
}
async _uploadChunkToGCS(filePath, requestId) {
throw new Error("Method _uploadChunkToGCS must be implemented");
}
_isGCSConfigured() {
throw new Error("Method _isGCSConfigured must be implemented");
}
async _cleanupTempFiles(...files) {
for (const file of files) {
try {
if (!file) continue;
// Check if the file/directory exists
await fs.access(file).catch(() => null);
// Determine if the path is a directory or a file
const stats = await fs.lstat(file).catch(() => null);
if (!stats) continue;
if (stats.isDirectory()) {
await fs.rm(file, { recursive: true, force: true });
} else {
await fs.unlink(file);
}
} catch (err) {
this.context.log("Error cleaning up temp file:", err);
}
}
}
}