
llm-extract


Modular SDK for structured text extraction from documents using LLMs

"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.ImageOCRProcessor = void 0; const config_helpers_1 = require("../utils/config-helpers"); const Tesseract = __importStar(require("tesseract.js")); const sharp_1 = __importDefault(require("sharp")); class ImageOCRProcessor { constructor() { this.supportedTypes = [ "image/jpeg", "image/png", "image/tiff", "image/tif", "image/bmp", "image/webp", ]; } async parseDocument(buffer, options = {}) { const startTime = Date.now(); const configErrors = (0, config_helpers_1.validateProcessorConfig)(options.config || {}); if (configErrors.length > 0) { throw new Error(`Invalid configuration: ${configErrors.join(", ")}`); } const config = (0, config_helpers_1.mergeProcessorConfigurations)(options.config); try { let processedBuffer = buffer; if (config.sharp.preprocessing.enhance?.sharpen || config.sharp.preprocessing.enhance?.normalize) { processedBuffer = await this.preprocessImage(buffer, config.sharp); } const { data: { text, confidence }, } = await Tesseract.recognize(processedBuffer, config.tesseract.language); const extractedText = text.trim(); const imageInfo = await (0, sharp_1.default)(buffer).metadata(); return { extractedText, metadata: { pageCount: 1, hasImages: true, processingInfo: { ocrUsed: true, ocrLanguage: config.tesseract.language, ocrConfidence: confidence, textLength: extractedText.length, preprocessed: !!(config.sharp.preprocessing.enhance?.sharpen || config.sharp.preprocessing.enhance?.normalize), }, imageInfo: { width: imageInfo.width, height: imageInfo.height, format: imageInfo.format, density: imageInfo.density, channels: imageInfo.channels, hasAlpha: imageInfo.hasAlpha, }, }, processingTimeMs: Date.now() - startTime, }; } catch (error) { throw new Error(`Image OCR processing failed: ${error.message}`); } } async preprocessImage(buffer, sharpConfig) { try { let processor = (0, sharp_1.default)(buffer); if (sharpConfig.preprocessing.resize) { processor = processor.resize(sharpConfig.preprocessing.resize); } const enhance = sharpConfig.preprocessing.enhance; if (enhance?.sharpen) { processor = typeof enhance.sharpen === "object" ? 
processor.sharpen() // Simplified for now : processor.sharpen(); } if (enhance?.normalize) { processor = processor.normalize(); } if (sharpConfig.preprocessing.grayscale) { processor = processor.greyscale(); } if (sharpConfig.preprocessing.contrast !== 1.0) { processor = processor.linear(sharpConfig.preprocessing.contrast, 0); } const output = sharpConfig.output; switch (output.format) { case "jpeg": processor = processor.jpeg({ quality: output.quality }); break; case "webp": processor = processor.webp({ quality: output.quality }); break; case "tiff": processor = processor.tiff({ quality: output.quality }); break; default: processor = processor.png({ quality: output.quality, compressionLevel: output.compression, }); } return await processor.toBuffer(); } catch (error) { console.warn("Image preprocessing failed, using original:", error.message); return buffer; } } } exports.ImageOCRProcessor = ImageOCRProcessor; //# sourceMappingURL=image-ocr.js.map
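
For context, here is a minimal usage sketch of the class above. It is based only on this file: the require path is a hypothetical guess inferred from the sourceMappingURL, the package's public entry point may re-export ImageOCRProcessor elsewhere, and it assumes mergeProcessorConfigurations supplies sensible defaults when no config is passed. It also assumes a Node.js environment with tesseract.js and sharp installed.

// Usage sketch (hypothetical require path and filename; not from package docs).
const fs = require("fs");
const { ImageOCRProcessor } = require("llm-extract/dist/processors/image-ocr");

async function run() {
    const processor = new ImageOCRProcessor();
    // Any of the formats listed in supportedTypes should work here.
    const buffer = fs.readFileSync("scanned-page.png");
    // No options passed: the processor is expected to fall back to merged defaults.
    const result = await processor.parseDocument(buffer);
    console.log(result.extractedText);
    console.log("OCR confidence:", result.metadata.processingInfo.ocrConfidence);
    console.log("Processing time (ms):", result.processingTimeMs);
}

run().catch(console.error);

Note that parseDocument throws on invalid configuration or OCR failure, so callers should wrap it in their own error handling; preprocessing failures, by contrast, are non-fatal and silently fall back to the original image.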