UNPKG

n8n-nodes-capivision

Version:

OCR multiengine com visão apurada de capivara — Tesseract, OCR.space, AWS Textract e suporte a layout inteligente.

www.linkedin.com/in/thawammichels/

ThawamMichels/OCR-CAPIVISION

565 lines • 26.4 kB

JavaScript

"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.CapivisionOcr = void 0; const tesseract_js_1 = require("tesseract.js"); const client_textract_1 = require("@aws-sdk/client-textract"); const openai_1 = __importDefault(require("openai")); const axios_1 = __importDefault(require("axios")); const pdf_js_extract_1 = require("pdf.js-extract"); class OcrExtractor { constructor() { this.pdfExtract = new pdf_js_extract_1.PDFExtract(); } async extractFromPdf(pdfBuffer) { try { const data = await this.pdfExtract.extractBuffer(pdfBuffer); return data.pages.map(page => page.content.map(item => item.str).join(' ')).join('\n\n'); } catch (error) { throw new Error(`Erro ao extrair texto do PDF: ${error.message}`); } } async validateAndProcessFile(binaryData) { try { const mimeType = binaryData.mimeType || ''; if (!binaryData.data) { throw new Error('Dados binários não encontrados'); } const data = Buffer.from(binaryData.data, 'base64'); if (!data || data.length === 0) { throw new Error('Falha ao converter dados base64 para buffer'); } if (mimeType.startsWith('image/')) { return { type: 'image', data }; } else if (mimeType === 'application/pdf') { return { type: 'pdf', data }; } else { throw new Error(`Tipo de arquivo não suportado: ${mimeType}. Use PDF ou imagem.`); } } catch (error) { throw new Error(`Erro ao validar arquivo: ${error.message}`); } } async extractText(result, engine, fileType) { try { switch (engine) { case 'tesseract': if (fileType === 'pdf') { throw new Error('Tesseract.js não suporta PDF diretamente no n8n. Use OCR.space ou AWS Textract para PDFs.'); } return result.text || ''; case 'ocrspace': if (!result.ParsedResults || !result.ParsedResults[0]) { throw new Error('OCR.space não retornou resultados válidos'); } return result.ParsedResults[0].ParsedText || ''; case 'textract': if (!result.Blocks) { throw new Error('AWS Textract não retornou blocos de texto'); } return result.Blocks .filter((block) => block.BlockType === 'LINE') .map((block) => block.Text) .join('\n') || ''; default: throw new Error(`Engine "${engine}" não suportada`); } } catch (error) { throw new Error(`Erro ao extrair texto: ${error.message}`); } } async extractJson(result, engine, layout) { try { const text = await this.extractText(result, engine, 'image'); if (Object.keys(layout).length === 0) { return { text }; } const structuredData = {}; for (const [field, coords] of Object.entries(layout)) { const { x, y, w, h } = coords; structuredData[field] = `Valor extraído para ${field}`; } return structuredData; } catch (error) { throw new Error(`Erro ao extrair JSON: ${error.message}`); } } async extractCsv(result, engine, layout) { try { const jsonData = await this.extractJson(result, engine, layout); if (Object.keys(layout).length === 0) { return jsonData.text; } const headers = Object.keys(jsonData); const values = Object.values(jsonData); return `${headers.join(',')}\n${values.join(',')}`; } catch (error) { throw new Error(`Erro ao extrair CSV: ${error.message}`); } } } async function analyzeWithAI(text, credentials) { var _a, _b; try { const openai = new openai_1.default({ apiKey: credentials.apiKey, }); const response = await openai.chat.completions.create({ model: 'gpt-4', messages: [ { role: 'system', content: 'Você é um assistente especializado em análise de texto extraído por OCR. Analise o texto fornecido e forneça um resumo estruturado com os principais pontos e informações relevantes.', }, { role: 'user', content: text, }, ], temperature: 0.7, }); return ((_b = (_a = response.choices[0]) === null || _a === void 0 ? void 0 : _a.message) === null || _b === void 0 ? void 0 : _b.content) || 'Não foi possível analisar o texto com IA.'; } catch (error) { throw new Error(`Erro na análise com IA: ${error.message}`); } } class CapivisionOcr { constructor() { this.description = { displayName: 'CAPIVISION OCR', name: 'capivisionOcr', icon: 'file:icon.svg', group: ['transform'], version: 1, description: 'OCR multiengine com visão apurada de capivara', defaults: { name: 'CAPIVISION OCR', }, inputs: ['main'], outputs: ['main'], credentials: [ { name: 'ocrSpaceApi', required: true, displayOptions: { show: { engine: ['ocrspace'], }, }, }, { name: 'awsTextractApi', required: true, displayOptions: { show: { engine: ['textract'], }, }, }, { name: 'openAiApi', required: true, displayOptions: { show: { treatmentMethod: ['ocr_ai'], }, }, }, ], properties: [ { displayName: 'Mecanismo OCR', name: 'engine', type: 'options', options: [ { name: 'Tesseract.js (apenas imagens)', value: 'tesseract', description: 'Melhor para imagens simples e texto bem definido', }, { name: 'OCR.space (imagens e PDF)', value: 'ocrspace', description: 'Suporta PDF e vários idiomas', }, { name: 'AWS Textract (imagens e PDF)', value: 'textract', description: 'Melhor para documentos complexos e formulários', }, ], default: 'tesseract', required: true, }, { displayName: 'Tipo de Entrada', name: 'imageFormat', type: 'options', options: [ { name: 'Binário', value: 'binary', }, { name: 'Base64', value: 'base64', }, ], default: 'binary', required: true, }, { displayName: 'Input Binário', name: 'binaryPropertyName', type: 'string', default: 'data', required: true, displayOptions: { show: { imageFormat: ['binary'], }, }, description: 'Nome do campo que contém a imagem/PDF', }, { displayName: 'String Base64', name: 'base64String', type: 'string', typeOptions: { rows: 4, }, default: '', required: true, displayOptions: { show: { imageFormat: ['base64'], }, }, description: 'String base64 da imagem (pode incluir ou não o cabeçalho data:image)', }, { displayName: 'Método de Tratamento', name: 'treatmentMethod', type: 'options', options: [ { name: 'Apenas OCR (Sem IA)', value: 'ocr_only', }, { name: 'OCR + IA', value: 'ocr_ai', }, ], default: 'ocr_only', required: true, }, { displayName: 'Mecanismo IA', name: 'aiEngine', type: 'options', displayOptions: { show: { treatmentMethod: ['ocr_ai'], }, }, options: [ { name: 'ChatGPT', value: 'chatgpt', }, ], default: 'chatgpt', required: true, }, { displayName: 'Modelo IA', name: 'aiModel', type: 'options', displayOptions: { show: { treatmentMethod: ['ocr_ai'], aiEngine: ['chatgpt'], }, }, options: [ { name: 'GPT-4o-mini', value: 'gpt-4-mini', }, { name: 'GPT-4o', value: 'gpt-4', }, ], default: 'gpt-4-mini', required: true, }, { displayName: 'Formato de Saída', name: 'outputFormat', type: 'options', options: [ { name: 'Texto Puro', value: 'text', }, { name: 'JSON Estruturado', value: 'json', }, { name: 'CSV', value: 'csv', }, ], default: 'text', required: true, }, { displayName: 'Preset de Layout (opcional)', name: 'layoutPreset', type: 'json', default: '{}', required: false, description: 'JSON com estrutura de coordenadas para extração', }, ], }; } async execute() { var _a, _b, _c, _d; try { const items = this.getInputData(); const returnData = []; const extractor = new OcrExtractor(); for (let i = 0; i < items.length; i++) { let engine = ''; let treatmentMethod = ''; let outputFormat = ''; let fileType = ''; try { engine = this.getNodeParameter('engine', i); outputFormat = this.getNodeParameter('outputFormat', i); treatmentMethod = this.getNodeParameter('treatmentMethod', i); const imageFormat = this.getNodeParameter('imageFormat', i); let imageData; let processedFile; if (imageFormat === 'binary') { const binaryPropertyName = this.getNodeParameter('binaryPropertyName', i); const binaryData = items[i].binary; if (!binaryData) { throw new Error('Nenhum dado binário encontrado!'); } const binaryProperty = binaryData[binaryPropertyName]; if (!binaryProperty) { throw new Error(`Nenhum dado binário encontrado no campo "${binaryPropertyName}"!`); } processedFile = await extractor.validateAndProcessFile(binaryProperty); imageData = processedFile.data; fileType = processedFile.type; } else { const base64String = this.getNodeParameter('base64String', i); if (!base64String) { throw new Error('String base64 não fornecida!'); } const base64Clean = base64String.replace(/^data:image\/[a-zA-Z+]+;base64,/, ''); imageData = Buffer.from(base64Clean, 'base64'); fileType = 'image'; } let extractedText = ''; switch (engine) { case 'tesseract': if (fileType === 'pdf') { throw new Error('Tesseract.js não suporta PDF diretamente no n8n. Use OCR.space ou AWS Textract para PDFs.'); } console.log('Iniciando processamento Tesseract...'); console.log('Tamanho da imagem:', imageData.length, 'bytes'); try { const worker = await (0, tesseract_js_1.createWorker)({ langPath: 'https://tessdata.projectnaptha.com/4.0.0', logger: m => console.log('Tesseract Log:', JSON.stringify(m)), errorHandler: e => console.error('Tesseract Error:', e), }); console.log('Worker Tesseract criado'); try { await worker.loadLanguage('por'); console.log('Idioma carregado'); await worker.initialize('por'); console.log('Worker inicializado'); const result = await worker.recognize(imageData); console.log('Reconhecimento concluído'); if (!result || !result.data) { throw new Error('Resultado do Tesseract inválido'); } extractedText = result.data.text || ''; if (!extractedText.trim()) { throw new Error('Nenhum texto extraído da imagem'); } console.log('Texto extraído com sucesso'); } finally { await worker.terminate(); console.log('Worker terminado'); } } catch (tesseractError) { console.error('Erro detalhado do Tesseract:', tesseractError); throw new Error(`Erro no processamento do Tesseract: ${tesseractError.message || 'Erro desconhecido'}\nStack: ${tesseractError.stack || 'Sem stack trace'}`); } break; case 'ocrspace': const credentials = await this.getCredentials('ocrSpaceApi'); const formData = new FormData(); formData.append('apikey', credentials.apiKey); formData.append('language', 'por'); formData.append('isOverlayRequired', 'true'); formData.append('detectOrientation', 'true'); formData.append('scale', 'true'); formData.append('OCREngine', '2'); if (fileType === 'pdf') { formData.append('file', new Blob([imageData], { type: 'application/pdf' })); } else { formData.append('file', new Blob([imageData])); } const response = await axios_1.default.post('https://api.ocr.space/parse/image', formData, { headers: { 'Content-Type': 'multipart/form-data' }, }); extractedText = response.data.ParsedResults[0].ParsedText || ''; break; case 'textract': const awsCredentials = await this.getCredentials('awsTextractApi'); const textract = new client_textract_1.TextractClient({ region: awsCredentials.region, credentials: { accessKeyId: awsCredentials.accessKeyId, secretAccessKey: awsCredentials.secretAccessKey, }, }); if (fileType === 'pdf') { const command = new client_textract_1.AnalyzeDocumentCommand({ Document: { Bytes: imageData, }, FeatureTypes: ['FORMS', 'TABLES', 'QUERIES'], }); const result = await textract.send(command); extractedText = ((_a = result.Blocks) === null || _a === void 0 ? void 0 : _a.filter(block => block.BlockType === 'LINE').map(block => block.Text).join('\n')) || ''; } else { const command = new client_textract_1.DetectDocumentTextCommand({ Document: { Bytes: imageData, }, }); const result = await textract.send(command); extractedText = ((_b = result.Blocks) === null || _b === void 0 ? void 0 : _b.filter(block => block.BlockType === 'LINE').map(block => block.Text).join('\n')) || ''; } break; } let finalOutput = extractedText; if (treatmentMethod === 'ocr_ai' && extractedText) { try { const credentials = await this.getCredentials('openAiApi'); const aiEngine = this.getNodeParameter('aiEngine', i); const aiModel = this.getNodeParameter('aiModel', i); if (aiEngine === 'chatgpt') { const openai = new openai_1.default({ apiKey: credentials.apiKey, }); const aiResponse = await openai.chat.completions.create({ model: aiModel === 'gpt-4-mini' ? 'gpt-4-1106-preview' : 'gpt-4', messages: [ { role: 'system', content: 'Você é um especialista em análise de documentos. Analise o texto fornecido e extraia as informações mais relevantes.', }, { role: 'user', content: `Analise este texto e extraia as informações mais importantes:\n\n${extractedText}`, }, ], temperature: 0.3, }); const aiAnalysis = ((_d = (_c = aiResponse.choices[0]) === null || _c === void 0 ? void 0 : _c.message) === null || _d === void 0 ? void 0 : _d.content) || 'Não foi possível analisar o texto com IA.'; if (outputFormat === 'json') { finalOutput = { original_text: extractedText, ai_analysis: aiAnalysis, }; } else if (outputFormat === 'csv') { finalOutput = `Texto Original,Análise IA\n"${extractedText.replace(/"/g, '""')}","${aiAnalysis.replace(/"/g, '""')}"`; } else { finalOutput = `=== Texto Original ===\n${extractedText}\n\n=== Análise IA ===\n${aiAnalysis}`; } } } catch (aiError) { throw new Error(`Erro na análise com IA: ${aiError.message}`); } } returnData.push({ json: { success: true, timestamp: new Date().toISOString(), engine, fileType, treatmentMethod, outputFormat, data: finalOutput, metadata: { processedAt: new Date().toISOString(), engineVersion: { tesseract: '4.1.1', openai: '4.97.0', }, }, }, }); } catch (error) { console.error('Erro completo:', error); returnData.push({ json: { success: false, timestamp: new Date().toISOString(), error: { message: error.message || 'Erro desconhecido', type: error.name || 'ProcessingError', stack: error.stack || 'Sem stack trace', context: { engine, fileType, treatmentMethod, outputFormat, originalError: error.toString(), details: typeof error === 'object' ? JSON.stringify(error) : 'Erro não é um objeto' }, }, }, }); } } return [returnData]; } catch (error) { throw new Error(`Erro global na execução: ${error.message}\nDetalhes: ${error.stack}`); } } } exports.CapivisionOcr = CapivisionOcr; //# sourceMappingURL=CapivisionOcr.node.js.map