/**
 * llm-extract
 * Modular SDK for structured text extraction from documents using LLMs.
 * (Compiled JavaScript output, ~145 lines / 6.06 kB.)
 */
// TypeScript-generated CommonJS interop helpers (inline tslib equivalents).
// Each one defers to a host-provided implementation on `this` when present.

// Re-exports property `k` of module `m` onto `o` under the name `k2`,
// keeping the binding "live" (reads go through to the source module).
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (o, m, k, k2) {
        if (k2 === undefined) {
            k2 = k;
        }
        var desc = Object.getOwnPropertyDescriptor(m, k);
        var needsAccessor = !desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable);
        if (needsAccessor) {
            // Use a getter so the re-export tracks later reassignments in `m`.
            desc = { enumerable: true, get: function () { return m[k]; } };
        }
        Object.defineProperty(o, k2, desc);
    }
    : function (o, m, k, k2) {
        if (k2 === undefined) {
            k2 = k;
        }
        o[k2] = m[k];
    });

// Attaches `v` as the `default` export of namespace object `o`.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (o, v) {
        Object.defineProperty(o, "default", { enumerable: true, value: v });
    }
    : function (o, v) {
        o["default"] = v;
    });

// Emulates `import * as ns from "mod"` for CommonJS modules: copies every
// own key except "default" onto a fresh namespace object, then installs the
// module itself as that namespace's `default`.
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function (o) {
        // Lazily choose the best own-key enumerator and cache it for reuse.
        ownKeys = Object.getOwnPropertyNames || function (obj) {
            var keys = [];
            for (var key in obj) {
                if (Object.prototype.hasOwnProperty.call(obj, key)) {
                    keys.push(key);
                }
            }
            return keys;
        };
        return ownKeys(o);
    };
    return function (mod) {
        // Real ES modules pass through untouched.
        if (mod && mod.__esModule) {
            return mod;
        }
        var result = {};
        if (mod != null) {
            var keys = ownKeys(mod);
            for (var i = 0; i < keys.length; i++) {
                if (keys[i] !== "default") {
                    __createBinding(result, mod, keys[i]);
                }
            }
        }
        __setModuleDefault(result, mod);
        return result;
    };
})();

// Emulates default-import semantics for CommonJS modules.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
// Mark the compiled output as ES-module interop and pre-declare the export.
Object.defineProperty(exports, "__esModule", { value: true });
exports.ImageOCRProcessor = void 0;
// Project helpers for validating/merging processor configuration.
const config_helpers_1 = require("../utils/config-helpers");
// tesseract.js: OCR engine; sharp: image preprocessing/metadata library.
const Tesseract = __importStar(require("tesseract.js"));
const sharp_1 = __importDefault(require("sharp"));
/**
 * OCR processor for raster images (JPEG, PNG, TIFF, BMP, WebP).
 *
 * Pipeline: optional sharp-based preprocessing (resize, sharpen, normalize,
 * grayscale, contrast, re-encode) followed by Tesseract OCR.
 */
class ImageOCRProcessor {
    constructor() {
        // MIME types this processor accepts.
        this.supportedTypes = [
            "image/jpeg",
            "image/png",
            "image/tiff",
            "image/tif",
            "image/bmp",
            "image/webp",
        ];
    }
    /**
     * Runs OCR on an image buffer and returns extracted text plus metadata.
     *
     * @param {Buffer} buffer - Raw image bytes.
     * @param {object} [options] - Optional `{ config }`; validated then merged
     *   with defaults via the config helpers.
     * @returns {Promise<object>} `{ extractedText, metadata, processingTimeMs }`.
     * @throws {Error} On invalid configuration or OCR failure (original error
     *   attached as `cause`).
     */
    async parseDocument(buffer, options = {}) {
        const startTime = Date.now();
        const configErrors = (0, config_helpers_1.validateProcessorConfig)(options.config || {});
        if (configErrors.length > 0) {
            throw new Error(`Invalid configuration: ${configErrors.join(", ")}`);
        }
        const config = (0, config_helpers_1.mergeProcessorConfigurations)(options.config);
        try {
            // Compute once: the same predicate drives both the preprocessing
            // step and the `preprocessed` flag reported in the metadata.
            const enhance = config.sharp.preprocessing.enhance;
            const needsPreprocessing = !!(enhance?.sharpen || enhance?.normalize);
            let processedBuffer = buffer;
            if (needsPreprocessing) {
                processedBuffer = await this.preprocessImage(buffer, config.sharp);
            }
            // OCR and metadata extraction are independent — run them in parallel.
            // Metadata is read from the ORIGINAL buffer so the reported
            // dimensions/format reflect the source image, not the processed copy.
            const [{ data: { text, confidence } }, imageInfo] = await Promise.all([
                Tesseract.recognize(processedBuffer, config.tesseract.language),
                (0, sharp_1.default)(buffer).metadata(),
            ]);
            const extractedText = text.trim();
            return {
                extractedText,
                metadata: {
                    pageCount: 1,
                    hasImages: true,
                    processingInfo: {
                        ocrUsed: true,
                        ocrLanguage: config.tesseract.language,
                        ocrConfidence: confidence,
                        textLength: extractedText.length,
                        preprocessed: needsPreprocessing,
                    },
                    imageInfo: {
                        width: imageInfo.width,
                        height: imageInfo.height,
                        format: imageInfo.format,
                        density: imageInfo.density,
                        channels: imageInfo.channels,
                        hasAlpha: imageInfo.hasAlpha,
                    },
                },
                processingTimeMs: Date.now() - startTime,
            };
        }
        catch (error) {
            // Non-Error throwables have no `.message`; stringify them instead,
            // and preserve the original failure as `cause`.
            const message = error instanceof Error ? error.message : String(error);
            throw new Error(`Image OCR processing failed: ${message}`, { cause: error });
        }
    }
    /**
     * Applies the configured sharp transformations and re-encodes the image.
     * Best-effort: on any failure the ORIGINAL buffer is returned so OCR can
     * still proceed on the raw image.
     *
     * @param {Buffer} buffer - Raw image bytes.
     * @param {object} sharpConfig - `{ preprocessing, output }` sharp settings.
     * @returns {Promise<Buffer>} Processed image bytes, or `buffer` on failure.
     */
    async preprocessImage(buffer, sharpConfig) {
        try {
            let processor = (0, sharp_1.default)(buffer);
            const { preprocessing, output } = sharpConfig;
            if (preprocessing.resize) {
                processor = processor.resize(preprocessing.resize);
            }
            const enhance = preprocessing.enhance;
            if (enhance?.sharpen) {
                // Forward fine-grained sharpen options (sigma, m1, m2, ...) when
                // configured as an object; otherwise use sharp's default mild
                // sharpening. (Previously the object form was silently ignored.)
                processor =
                    typeof enhance.sharpen === "object"
                        ? processor.sharpen(enhance.sharpen)
                        : processor.sharpen();
            }
            if (enhance?.normalize) {
                processor = processor.normalize();
            }
            if (preprocessing.grayscale) {
                processor = processor.greyscale();
            }
            // Only apply a linear contrast adjustment for an explicit non-neutral
            // multiplier; `linear(undefined, 0)` would make sharp throw and
            // silently disable all preprocessing via the catch below.
            if (typeof preprocessing.contrast === "number" && preprocessing.contrast !== 1.0) {
                processor = processor.linear(preprocessing.contrast, 0);
            }
            switch (output.format) {
                case "jpeg":
                    processor = processor.jpeg({ quality: output.quality });
                    break;
                case "webp":
                    processor = processor.webp({ quality: output.quality });
                    break;
                case "tiff":
                    processor = processor.tiff({ quality: output.quality });
                    break;
                default:
                    // PNG is the fallback encoder for any other configured format.
                    processor = processor.png({
                        quality: output.quality,
                        compressionLevel: output.compression,
                    });
            }
            return await processor.toBuffer();
        }
        catch (error) {
            const message = error instanceof Error ? error.message : String(error);
            console.warn("Image preprocessing failed, using original:", message);
            return buffer;
        }
    }
}
// Publish the processor class on the CommonJS exports object.
exports.ImageOCRProcessor = ImageOCRProcessor;
//# sourceMappingURL=image-ocr.js.map
;