docuglean-ocr
Version:
An SDK for intelligent document processing using State of the Art AI models.
104 lines (103 loc) • 3.5 kB
JavaScript
// Module-interop helper (presumably emitted by the TypeScript compiler — confirm
// against the build config). Reuses an existing `this.__importDefault` when one
// is present; otherwise defines a function that returns `mod` unchanged when it
// is flagged `__esModule`, and wraps any other value as `{ default: mod }` so
// call sites can always read `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.isURL = isURL;
exports.getFileType = getFileType;
exports.encodePdf = encodePdf;
exports.encodeImage = encodeImage;
exports.handleMistralOCRResponse = handleMistralOCRResponse;
exports.getSignedMistralUrl = getSignedMistralUrl;
const fs_1 = __importDefault(require("fs"));
const path_1 = __importDefault(require("path"));
const mistralai_1 = require("@mistralai/mistralai");
/**
 * Tell whether a path string is an HTTP(S) URL rather than a local path.
 *
 * The scheme is matched case-insensitively (URI schemes are case-insensitive),
 * so `HTTPS://host/file.pdf` is recognised as well.
 *
 * @param {string} filePath - Candidate path or URL.
 * @returns {boolean} `true` when the value is a string starting with
 *   `http://` or `https://`; `false` otherwise, including non-string input.
 */
function isURL(filePath) {
    return typeof filePath === 'string' && /^https?:\/\//i.test(filePath);
}
/**
 * Classify a file by its extension.
 *
 * For HTTP(S) URLs the query string and fragment are ignored, so a link such
 * as `https://host/doc.pdf?sig=abc` is still classified by `.pdf`. Local paths
 * are used as-is because `?` and `#` are legal file-name characters.
 *
 * @param {string} filePath - Local path or HTTP(S) URL.
 * @returns {'image' | 'pdf' | 'unknown'} `'image'` for .jpg/.jpeg/.png/.gif/.webp,
 *   `'pdf'` for .pdf, `'unknown'` for anything else. Matching is case-insensitive.
 */
function getFileType(filePath) {
    // Only URLs carry query/fragment suffixes; drop them before reading the extension.
    const target = /^https?:\/\//i.test(filePath)
        ? filePath.split(/[?#]/, 1)[0]
        : filePath;
    const extension = path_1.default.extname(target).toLowerCase();
    switch (extension) {
        case '.jpg':
        case '.jpeg':
        case '.png':
        case '.gif':
        case '.webp':
            return 'image';
        case '.pdf':
            return 'pdf';
        default:
            return 'unknown';
    }
}
/**
 * Read a PDF from disk and return it as a base64 data URI.
 *
 * @param {string} filePath - Path of the PDF file to read.
 * @returns {Promise<string>} `data:application/pdf;base64,<payload>`.
 * @throws {Error} `Failed to encode PDF: <reason>` when the file cannot be
 *   read. The underlying error is attached as `cause` on runtimes that
 *   support it; older runtimes ignore the option.
 */
async function encodePdf(filePath) {
    try {
        // Asynchronous read: this function is already async, so there is no
        // reason to block the event loop while the file is loaded.
        const pdfBuffer = await fs_1.default.promises.readFile(filePath);
        return `data:application/pdf;base64,${pdfBuffer.toString('base64')}`;
    }
    catch (error) {
        if (error instanceof Error) {
            throw new Error(`Failed to encode PDF: ${error.message}`, { cause: error });
        }
        throw new Error('Failed to encode PDF: Unknown error');
    }
}
/**
 * Read an image from disk and return its contents as a base64 string.
 *
 * Unlike a data URI, the result carries no `data:` prefix or MIME type; it is
 * the raw base64 payload only.
 *
 * @param {string} filePath - Path of the image file to read.
 * @returns {Promise<string>} Base64 encoding of the file's bytes.
 * @throws {Error} `Failed to encode image: <reason>` when the file cannot be
 *   read. The underlying error is attached as `cause` on runtimes that
 *   support it; older runtimes ignore the option.
 */
async function encodeImage(filePath) {
    try {
        // Asynchronous read: keeps the event loop free while the file loads.
        const imageBuffer = await fs_1.default.promises.readFile(filePath);
        return imageBuffer.toString('base64');
    }
    catch (error) {
        if (error instanceof Error) {
            throw new Error(`Failed to encode image: ${error.message}`, { cause: error });
        }
        throw new Error('Failed to encode image: Unknown error');
    }
}
/**
 * Flatten an OCR response into combined markdown plus a list of images.
 *
 * Pages are ordered by their `index` field before being combined. The
 * caller's `response.pages` array is not reordered: sorting happens on a copy.
 *
 * @param {{ pages?: Array<{ index: number, markdown: string, images?: Array<*> }> }} response
 *   OCR response object; only `pages` and the per-page `index`, `markdown`
 *   and `images` fields are read here.
 * @returns {{ markdown: string, images: Array<*>, rawResponse: object }}
 *   `markdown` is every page's markdown joined by a blank line, `images` is
 *   every page's images concatenated in page order, and `rawResponse` is the
 *   object that was passed in.
 * @throws {Error} `No pages found in OCR response` when the response is
 *   missing, has no `pages` array, or the array is empty.
 */
function handleMistralOCRResponse(response) {
    if (!response || !Array.isArray(response.pages) || response.pages.length === 0) {
        throw new Error('No pages found in OCR response');
    }
    // Array.prototype.sort works in place, so sort a copy to leave the input untouched.
    const orderedPages = [...response.pages].sort((a, b) => a.index - b.index);
    const markdown = orderedPages.map((page) => page.markdown).join('\n\n');
    // A page without an `images` field contributes nothing rather than `undefined`.
    const images = orderedPages.flatMap((page) => page.images || []);
    return {
        markdown,
        images,
        rawResponse: response
    };
}
/**
 * Upload a local file through the Mistral client and return a signed URL for it.
 *
 * The file is uploaded with purpose `"ocr"` under its base name, then a signed
 * URL is requested for the id of the uploaded file.
 *
 * @param {string} filePath - Path of the local file to upload.
 * @param {string} apiKey - API key passed to the Mistral client constructor.
 * @returns {Promise<string>} The `url` field of the signed-URL response.
 * @throws {Error} `Failed to get signed URL: <reason>` when reading, uploading
 *   or signing fails. The underlying error is attached as `cause` on runtimes
 *   that support it; older runtimes ignore the option.
 */
async function getSignedMistralUrl(filePath, apiKey) {
    try {
        const client = new mistralai_1.Mistral({ apiKey });
        // Asynchronous read: keeps the event loop free while the file loads.
        const fileContent = await fs_1.default.promises.readFile(filePath);
        const fileName = path_1.default.basename(filePath);
        const uploadedFile = await client.files.upload({
            file: {
                fileName,
                content: fileContent,
            },
            purpose: "ocr"
        });
        const signedUrl = await client.files.getSignedUrl({
            fileId: uploadedFile.id,
        });
        return signedUrl.url;
    }
    catch (error) {
        if (error instanceof Error) {
            throw new Error(`Failed to get signed URL: ${error.message}`, { cause: error });
        }
        throw new Error('Failed to get signed URL: Unknown error');
    }
}
;