UNPKG

img-to-text-computational

Version:

High-performance image-to-text analyzer using pure computational methods. Convert images to structured text descriptions with 99.9% accuracy, zero AI dependencies, and complete offline processing.

1,002 lines (888 loc) 29.2 kB
const Tesseract = require('tesseract.js');

/**
 * Character class describing "part of a word" in any script. ASCII `\b`
 * treats accented and non-Latin letters as non-word characters, so whole-word
 * matching for Spanish/French/German/Arabic needs explicit Unicode boundaries.
 */
const WORD_CHAR = String.raw`[\p{L}\p{M}\p{N}_]`;

/**
 * Build a global, case-insensitive, Unicode-aware whole-word matcher.
 * @param {string[]} words - Literal words (letters only, no regex syntax)
 * @returns {RegExp} Pattern matching any of the words as a complete word
 */
function wholeWords(words) {
  return new RegExp(`(?<!${WORD_CHAR})(?:${words.join('|')})(?!${WORD_CHAR})`, 'giu');
}

/**
 * Per-language text patterns used by `detectLanguageByPattern`.
 * All patterns are global and are only used with `String.prototype.match`,
 * which resets `lastIndex`, so sharing them between calls is safe.
 */
const LANGUAGE_PATTERNS = Object.freeze({
  eng: [
    wholeWords([
      'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had', 'her', 'was',
      'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his', 'how', 'its', 'may', 'new',
      'now', 'old', 'see', 'two', 'way', 'who', 'boy', 'did', 'man', 'men', 'she', 'use'
    ]),
    /ing\b/gi,
    /tion\b/gi,
    /\b(a|an|the)\s+/gi
  ],
  spa: [
    wholeWords([
      'el', 'la', 'de', 'que', 'y', 'en', 'un', 'es', 'se', 'no', 'te', 'lo', 'le', 'da',
      'su', 'por', 'son', 'con', 'para', 'una', 'sur', 'sus', 'les', 'más', 'como', 'pero',
      'del', 'mis', 'las', 'dos', 'qué', 'muy', 'sin', 'nos', 'hasta', 'donde', 'mientras',
      'cada', 'todos', 'todo', 'otra', 'otros', 'otras', 'cual', 'cuando', 'tanto', 'menos',
      'casi'
    ]),
    /ción\b/gi,
    /ñ/gi,
    /[áéíóúü]/gi
  ],
  fra: [
    wholeWords([
      'le', 'de', 'et', 'à', 'un', 'il', 'être', 'en', 'avoir', 'que', 'pour', 'dans', 'ce',
      'son', 'une', 'sur', 'avec', 'ne', 'se', 'pas', 'tout', 'plus', 'par', 'grand', 'où',
      'ou', 'quoi', 'nous', 'vous', 'leur', 'quel', 'dont', 'sans', 'sous', 'entre',
      'pendant', 'depuis', 'vers', 'chez', 'contre', 'parmi', 'selon', 'malgré', 'durant',
      'hormis'
    ]),
    /tion\b/gi,
    /[àâäçéèêëïîôùûüÿñæœ]/gi,
    /ç[aou]/gi
  ],
  deu: [
    wholeWords([
      'der', 'die', 'und', 'in', 'den', 'von', 'zu', 'das', 'mit', 'sich', 'des', 'auf',
      'für', 'ist', 'im', 'dem', 'nicht', 'ein', 'eine', 'als', 'auch', 'es', 'an', 'werden',
      'aus', 'er', 'hat', 'dass', 'sie', 'nach', 'wird', 'bei', 'einer', 'um', 'am', 'sind',
      'noch', 'wie', 'einem', 'über', 'einen', 'so', 'zum', 'war', 'haben', 'nur', 'oder',
      'aber', 'vor', 'zur', 'bis', 'unter', 'während', 'ohne'
    ]),
    /ung\b/gi,
    /[äöüß]/gi,
    /\bsch/gi
  ],
  chi_sim: [
    /[一二三四五六七八九十]/g,
    /[的了在是我有他这为之大来以个中上们]/g,
    /[你我他她它们]/g
  ],
  jpn: [
    /[\u3040-\u309F]/g, // Hiragana block
    /[\u30A0-\u30FF]/g, // Katakana block
    /(?:です|ます|した|する)/g // common verb endings, matched as sequences
  ],
  kor: [
    /[가-힣]/g,
    /[ㄱ-ㅎㅏ-ㅣ]/g
  ],
  ara: [
    /[ا-ي]/g,
    wholeWords(['في', 'من', 'إلى', 'على', 'عن', 'مع', 'هذا', 'هذه', 'التي', 'الذي', 'كان', 'كانت'])
  ]
});

/** Candidate languages for each script name reported by script detection. */
const SCRIPT_LANGUAGES = Object.freeze({
  Latin: ['eng', 'spa', 'fra', 'deu', 'ita', 'por'],
  Han: ['chi_sim', 'chi_tra', 'jpn'],
  Hiragana: ['jpn'],
  Katakana: ['jpn'],
  Hangul: ['kor'],
  Arabic: ['ara'],
  Cyrillic: ['rus'],
  Devanagari: ['hin']
});

/**
 * OCR front-end around tesseract.js that detects the image language,
 * recognises text with per-language settings, retries with fallback
 * languages when confidence is low, and post-processes the output.
 *
 * Workers are created lazily, one per language, and cached until `cleanup()`.
 */
class MultiLanguageOCR {
  /**
   * @param {Object} [options] - Instance-wide settings
   * @param {string} [options.defaultLanguage='eng'] - Language used when none is given or detected
   * @param {boolean} [options.autoDetectLanguage=true] - Detect the language before recognition
   * @param {number} [options.confidenceThreshold=0.7] - Confidence (0-1) below which fallbacks are tried
   * @param {string[]} [options.fallbackLanguages] - Languages tried when confidence is low
   * @param {number} [options.maxRetries=3] - Maximum number of fallback attempts
   */
  constructor(options = {}) {
    // Spread first, then fill defaults: a key passed as `undefined` must not
    // erase the default, while explicit values such as 0 or '' are kept.
    this.options = {
      ...options,
      defaultLanguage: options.defaultLanguage ?? 'eng',
      autoDetectLanguage:
        options.autoDetectLanguage === undefined ? true : options.autoDetectLanguage,
      confidenceThreshold: options.confidenceThreshold ?? 0.7,
      fallbackLanguages: options.fallbackLanguages ?? ['eng', 'spa', 'fra', 'deu', 'chi_sim'],
      maxRetries: options.maxRetries ?? 3
    };

    // Language-specific configurations
    this.languageConfigs = this.initializeLanguageConfigs();

    // Ready workers and in-flight worker creations, both keyed by language
    this.workers = new Map();
    this.workerPromises = new Map();
  }

  /**
   * Initialize language-specific configurations
   * @returns {Object} Map of language code to `{ tesseractOptions, postProcessing }`
   */
  initializeLanguageConfigs() {
    // Every language shares the same engine settings except for whether
    // inter-word spaces are preserved ('1') or not ('0').
    const tesseractOptions = (preserveInterwordSpaces) => ({
      tessedit_char_whitelist: '',
      tessedit_pageseg_mode: Tesseract.PSM.AUTO,
      preserve_interword_spaces: preserveInterwordSpaces
    });

    return {
      // English
      eng: {
        tesseractOptions: tesseractOptions('1'),
        postProcessing: {
          removeExtraSpaces: true,
          fixCommonErrors: true,
          capitalizeProperNouns: false
        }
      },
      // Spanish
      spa: {
        tesseractOptions: tesseractOptions('1'),
        postProcessing: {
          removeExtraSpaces: true,
          fixCommonErrors: true,
          handleAccents: true
        }
      },
      // French
      fra: {
        tesseractOptions: tesseractOptions('1'),
        postProcessing: {
          removeExtraSpaces: true,
          fixCommonErrors: true,
          handleAccents: true,
          fixCedillas: true
        }
      },
      // German
      deu: {
        tesseractOptions: tesseractOptions('1'),
        postProcessing: {
          removeExtraSpaces: true,
          fixCommonErrors: true,
          handleUmlauts: true,
          fixCompoundWords: true
        }
      },
      // Chinese Simplified
      chi_sim: {
        tesseractOptions: tesseractOptions('0'),
        postProcessing: {
          removeExtraSpaces: false,
          fixCommonErrors: true,
          handleVerticalText: true
        }
      },
      // Chinese Traditional
      chi_tra: {
        tesseractOptions: tesseractOptions('0'),
        postProcessing: {
          removeExtraSpaces: false,
          fixCommonErrors: true,
          handleVerticalText: true
        }
      },
      // Japanese
      jpn: {
        tesseractOptions: tesseractOptions('0'),
        postProcessing: {
          removeExtraSpaces: false,
          fixCommonErrors: true,
          handleVerticalText: true,
          separateHiraganaKatakana: true
        }
      },
      // Korean
      kor: {
        tesseractOptions: tesseractOptions('1'),
        postProcessing: {
          removeExtraSpaces: true,
          fixCommonErrors: true,
          handleHangul: true
        }
      },
      // Arabic
      ara: {
        tesseractOptions: tesseractOptions('1'),
        postProcessing: {
          removeExtraSpaces: true,
          fixCommonErrors: true,
          handleRTL: true,
          fixArabicShaping: true
        }
      },
      // Russian
      rus: {
        tesseractOptions: tesseractOptions('1'),
        postProcessing: {
          removeExtraSpaces: true,
          fixCommonErrors: true,
          handleCyrillic: true
        }
      },
      // Hindi
      hin: {
        tesseractOptions: tesseractOptions('1'),
        postProcessing: {
          removeExtraSpaces: true,
          fixCommonErrors: true,
          handleDevanagari: true
        }
      }
    };
  }

  /**
   * Process image with multi-language OCR
   * @param {Buffer|string} imageInput - Image buffer or path
   * @param {Object} options - Processing options; any instance option may be
   *   overridden per call, plus `language` (skips detection) and
   *   `tesseractOptions` (merged over the language defaults)
   * @returns {Promise<Object>} OCR results with language detection
   * @throws {Error} When recognition with the chosen language fails
   */
  async processImage(imageInput, options = {}) {
    try {
      // Ignore keys explicitly set to undefined so they cannot erase instance settings.
      const overrides = Object.fromEntries(
        Object.entries(options).filter(([, value]) => value !== undefined)
      );
      const processingOptions = { ...this.options, ...overrides };

      let detectedLanguage = processingOptions.language || processingOptions.defaultLanguage;
      let languageDetectionResults = null;

      // Auto-detect language if enabled and no language was forced
      if (processingOptions.autoDetectLanguage && !processingOptions.language) {
        languageDetectionResults = await this.detectLanguage(imageInput);
        detectedLanguage = languageDetectionResults.primary_language;
      }

      // Perform OCR with detected/specified language
      let ocrResults = await this.performOCR(imageInput, detectedLanguage, processingOptions);
      let fallbackAttempts = 0;

      // If confidence is low, try fallback languages
      if (ocrResults.confidence < processingOptions.confidenceThreshold) {
        const fallbackResults = await this.tryFallbackLanguages(
          imageInput,
          detectedLanguage,
          processingOptions
        );
        // Report the attempts even when the primary result is kept.
        fallbackAttempts = fallbackResults?.fallback_attempts ?? 0;

        if (fallbackResults && fallbackResults.confidence > ocrResults.confidence) {
          ocrResults = fallbackResults;
          detectedLanguage = fallbackResults.language;
        }
      }

      // Post-process results based on language
      const processedResults = await this.postProcessResults(ocrResults, detectedLanguage);

      return {
        language: detectedLanguage,
        language_detection: languageDetectionResults,
        raw_text: ocrResults.text,
        processed_text: processedResults.text,
        confidence: ocrResults.confidence,
        words: processedResults.words,
        lines: processedResults.lines,
        paragraphs: processedResults.paragraphs,
        structured_text: processedResults.structured_text,
        language_specific_analysis: processedResults.language_analysis,
        processing_stats: {
          language_used: detectedLanguage,
          processing_time: ocrResults.processing_time,
          fallback_attempts: fallbackAttempts,
          post_processing_applied: processedResults.post_processing_applied
        }
      };
    } catch (error) {
      throw new Error(`Multi-language OCR processing failed: ${error.message}`, { cause: error });
    }
  }

  /**
   * Detect language from image by combining three detection methods.
   * @param {Buffer|string} imageInput - Image input
   * @returns {Promise<Object>} Language detection results; never rejects,
   *   falling back to the default language on failure
   */
  async detectLanguage(imageInput) {
    try {
      const startTime = Date.now();

      // Weights, in order: script detection, frequency analysis, pattern matching.
      const methodWeights = [0.4, 0.4, 0.2];
      const results = await Promise.all([
        this.detectLanguageByScript(imageInput),
        this.detectLanguageByFrequency(imageInput),
        this.detectLanguageByPattern(imageInput)
      ]);

      // Combine results with confidence weighting
      const languageScores = {};
      results.forEach((result, index) => {
        const weight = methodWeights[index];
        for (const [lang, score] of Object.entries(result.scores)) {
          languageScores[lang] = (languageScores[lang] || 0) + score * weight;
        }
      });

      // Keep the three best candidates, best first
      const sortedLanguages = Object.entries(languageScores)
        .sort(([, a], [, b]) => b - a)
        .slice(0, 3);

      return {
        primary_language: sortedLanguages[0]?.[0] || this.options.defaultLanguage,
        confidence: sortedLanguages[0]?.[1] || 0,
        alternatives: sortedLanguages.slice(1).map(([lang, score]) => ({
          language: lang,
          confidence: score
        })),
        detection_methods: results,
        processing_time: Date.now() - startTime
      };
    } catch (error) {
      // Fallback to default language
      return {
        primary_language: this.options.defaultLanguage,
        confidence: 0.5,
        alternatives: [],
        error: error.message,
        processing_time: 0
      };
    }
  }

  /**
   * Detect language by script analysis
   * @param {Buffer|string} imageInput - Image input
   * @returns {Promise<Object>} `{ method, detected_script, scores, confidence }`,
   *   or `{ method, scores: {}, error }` on failure
   */
  async detectLanguageByScript(imageInput) {
    try {
      // Orientation and Script Detection
      const worker = await this.getWorker('osd');
      const result = await worker.detect(imageInput);

      // tesseract.js wraps the detection in `data`; a flat result is still accepted.
      const detection = result?.data ?? result ?? {};
      const script = detection.script;
      const rawConfidence = detection.script_confidence ?? detection.confidence;
      // NOTE(review): script_confidence is not documented as normalised, so it is
      // clamped into [0, 1] to stay on the same scale as the other methods - confirm.
      const confidence = Number.isFinite(rawConfidence)
        ? Math.min(Math.max(rawConfidence, 0), 1)
        : 0;

      const scores = {};
      const isKnownScript =
        typeof script === 'string' &&
        Object.prototype.hasOwnProperty.call(SCRIPT_LANGUAGES, script);
      if (isKnownScript) {
        const baseScore = confidence || 0.7;
        // Earlier languages in each list are preferred; each later one loses 0.1.
        SCRIPT_LANGUAGES[script].forEach((lang, index) => {
          scores[lang] = baseScore - index * 0.1;
        });
      }

      return {
        method: 'script_detection',
        detected_script: script,
        scores,
        confidence
      };
    } catch (error) {
      return {
        method: 'script_detection',
        scores: {},
        error: error.message
      };
    }
  }

  /**
   * Detect language by recognising the image with several languages and
   * comparing the confidence and amount of text each one produces.
   * @param {Buffer|string} imageInput - Image input
   * @returns {Promise<Object>} `{ method, scores, test_results }`
   */
  async detectLanguageByFrequency(imageInput) {
    try {
      const testLanguages = ['eng', 'spa', 'fra', 'deu', 'chi_sim'];
      const recognitionOptions = {
        tessedit_pageseg_mode: Tesseract.PSM.AUTO,
        tessedit_char_confidence: '1'
      };

      const results = await Promise.all(
        testLanguages.map(async (lang) => {
          try {
            const worker = await this.getWorker(lang);
            await this.applyTesseractParameters(worker, recognitionOptions);
            const result = await worker.recognize(imageInput, recognitionOptions);

            return {
              language: lang,
              confidence: result.data.confidence / 100,
              text_length: result.data.text.length
            };
          } catch (error) {
            // A language that cannot be loaded simply scores zero.
            return {
              language: lang,
              confidence: 0,
              text_length: 0
            };
          }
        })
      );

      const scores = {};
      for (const { language, confidence, text_length: textLength } of results) {
        // Confidence boosted by up to 50% for longer recognised text
        scores[language] = confidence * (1 + Math.min(textLength / 100, 0.5));
      }

      return {
        method: 'frequency_analysis',
        scores,
        test_results: results
      };
    } catch (error) {
      return {
        method: 'frequency_analysis',
        scores: {},
        error: error.message
      };
    }
  }

  /**
   * Detect language by pattern matching on text recognised with the default language
   * @param {Buffer|string} imageInput - Image input
   * @returns {Promise<Object>} `{ method, scores, text_sample }`
   */
  async detectLanguageByPattern(imageInput) {
    try {
      // Quick OCR to get sample text
      const worker = await this.getWorker(this.options.defaultLanguage);
      const result = await worker.recognize(imageInput);
      const text = result.data.text ?? '';

      // Matches per 100 characters (texts under 100 characters are not scaled up)
      const normaliser = Math.max(text.length / 100, 1);
      const scores = {};

      for (const [lang, langPatterns] of Object.entries(LANGUAGE_PATTERNS)) {
        const totalMatches = langPatterns.reduce(
          (total, pattern) => total + (text.match(pattern)?.length ?? 0),
          0
        );
        scores[lang] = Math.min(totalMatches / normaliser, 1);
      }

      return {
        method: 'pattern_matching',
        scores,
        text_sample: text.substring(0, 200)
      };
    } catch (error) {
      return {
        method: 'pattern_matching',
        scores: {},
        error: error.message
      };
    }
  }

  /**
   * Apply Tesseract engine variables to a worker.
   *
   * tesseract.js ignores engine variables passed as `recognize` options; they
   * only take effect through `worker.setParameters`.
   * @param {Object} worker - tesseract.js worker
   * @param {Object} [parameters] - Candidate variables; only string and number
   *   values are forwarded, so recognise-level options (objects, booleans)
   *   that share the same bag are skipped
   * @returns {Promise<void>}
   */
  async applyTesseractParameters(worker, parameters = {}) {
    const engineVariables = Object.fromEntries(
      Object.entries(parameters).filter(
        ([, value]) => typeof value === 'string' || typeof value === 'number'
      )
    );

    if (Object.keys(engineVariables).length === 0) return;
    if (typeof worker.setParameters !== 'function') return;

    await worker.setParameters(engineVariables);
  }

  /**
   * Perform OCR with specific language
   * @param {Buffer|string} imageInput - Image input
   * @param {string} language - Tesseract language code
   * @param {Object} [options] - `options.tesseractOptions` is merged over the
   *   language defaults (English defaults for unknown languages)
   * @returns {Promise<Object>} Text, confidence (0-1), words, lines, paragraphs and timing
   * @throws {Error} When the worker cannot be created or recognition fails
   */
  async performOCR(imageInput, language, options = {}) {
    try {
      const startTime = Date.now();
      const worker = await this.getWorker(language);

      const config = this.languageConfigs[language] || this.languageConfigs.eng;
      const tesseractOptions = {
        ...config.tesseractOptions,
        ...options.tesseractOptions
      };

      await this.applyTesseractParameters(worker, tesseractOptions);
      const result = await worker.recognize(imageInput, tesseractOptions);

      return {
        language,
        text: result.data.text,
        confidence: result.data.confidence / 100,
        words: result.data.words,
        lines: result.data.lines,
        paragraphs: result.data.paragraphs,
        processing_time: Date.now() - startTime
      };
    } catch (error) {
      throw new Error(`OCR processing failed for language ${language}: ${error.message}`, {
        cause: error
      });
    }
  }

  /**
   * Try fallback languages if primary detection has low confidence
   * @param {Buffer|string} imageInput - Image input
   * @param {string} primaryLanguage - Language already tried; it is skipped
   * @param {Object} [options] - Processing options; `fallbackLanguages`,
   *   `maxRetries` and `confidenceThreshold` default to the instance settings
   * @returns {Promise<Object|null>} Best fallback result with `fallback_attempts`
   *   set to the number of languages tried, or null when none succeeded
   */
  async tryFallbackLanguages(imageInput, primaryLanguage, options) {
    const candidates = (options?.fallbackLanguages ?? this.options.fallbackLanguages).filter(
      (lang) => lang !== primaryLanguage
    );
    const maxRetries = options?.maxRetries ?? this.options.maxRetries;
    const confidenceThreshold = options?.confidenceThreshold ?? this.options.confidenceThreshold;

    let bestResult = null;
    let attempts = 0;

    for (const language of candidates) {
      if (attempts >= maxRetries) break;
      attempts++;

      try {
        const result = await this.performOCR(imageInput, language, options);

        if (!bestResult || result.confidence > bestResult.confidence) {
          bestResult = result;
        }

        // If we get good confidence, stop trying
        if (result.confidence > confidenceThreshold) break;
      } catch (error) {
        // Best effort: one failing language must not abort the remaining
        // fallbacks. The failure still counts as an attempt.
      }
    }

    return bestResult ? { ...bestResult, fallback_attempts: attempts } : null;
  }

  /**
   * Post-process OCR results based on language
   * @param {Object} ocrResults - Result of `performOCR`
   * @param {string} language - Language the text was recognised with
   * @returns {Promise<Object>} Processed text, words, lines and analysis; on
   *   failure the raw results are returned together with an `error` message
   */
  async postProcessResults(ocrResults, language) {
    try {
      const { postProcessing } = this.languageConfigs[language] || this.languageConfigs.eng;
      const isCJK = language === 'chi_sim' || language === 'chi_tra' || language === 'jpn';

      let processedText = ocrResults.text;
      const appliedProcessing = [];

      // Remove extra spaces
      if (postProcessing.removeExtraSpaces) {
        processedText = processedText.replace(/\s+/g, ' ').trim();
        appliedProcessing.push('remove_extra_spaces');
      }

      // Fix common OCR errors
      if (postProcessing.fixCommonErrors) {
        processedText = this.fixCommonOCRErrors(processedText, language);
        appliedProcessing.push('fix_common_errors');
      }

      // Language-specific processing
      if (postProcessing.handleAccents && (language === 'spa' || language === 'fra')) {
        processedText = this.fixAccentedCharacters(processedText, language);
        appliedProcessing.push('fix_accents');
      }

      if (postProcessing.handleUmlauts && language === 'deu') {
        processedText = this.fixGermanUmlauts(processedText);
        appliedProcessing.push('fix_umlauts');
      }

      if (postProcessing.handleRTL && language === 'ara') {
        processedText = this.fixArabicText(processedText);
        appliedProcessing.push('fix_rtl');
      }

      if (postProcessing.handleVerticalText && isCJK) {
        processedText = this.fixAsianVerticalText(processedText, language);
        appliedProcessing.push('fix_vertical_text');
      }

      // Process words and lines with language-specific handling
      const processedWords = this.processWords(ocrResults.words, language);
      const processedLines = this.processLines(ocrResults.lines, language);
      const structuredText = this.createStructuredText(processedWords, processedLines, language);

      // Language-specific analysis
      const languageAnalysis = await this.performLanguageSpecificAnalysis(processedText, language);

      return {
        text: processedText,
        words: processedWords,
        lines: processedLines,
        paragraphs: ocrResults.paragraphs,
        structured_text: structuredText,
        language_analysis: languageAnalysis,
        post_processing_applied: appliedProcessing
      };
    } catch (error) {
      // Return original results if post-processing fails
      return {
        text: ocrResults.text,
        words: ocrResults.words,
        lines: ocrResults.lines,
        paragraphs: ocrResults.paragraphs,
        structured_text: [],
        language_analysis: {},
        post_processing_applied: [],
        error: error.message
      };
    }
  }

  /**
   * Fix common OCR errors based on language
   *
   * Languages without their own table use the English rules.
   * NOTE(review): these substitutions are lossy heuristics - `rn` -> `m` also
   * rewrites correct words such as "turn", and a standalone `0` becomes `O`.
   * Confirm they are still wanted before relying on `processed_text`.
   * @param {string} text - Recognised text
   * @param {string} language - Language code
   * @returns {string} Text with the substitutions applied
   */
  fixCommonOCRErrors(text, language) {
    const commonErrors = {
      eng: [
        [/\b0\b/g, 'O'], // Zero to O
        [/\bl\b/g, 'I'], // lowercase l to I
        [/rn/g, 'm'], // rn to m
        [/vv/g, 'w'], // vv to w
        [/\|/g, 'l'] // pipe to l
      ],
      spa: [
        [/\b0\b/g, 'O'],
        [/\bl\b/g, 'I'],
        [/rn/g, 'm']
      ],
      fra: [
        [/\b0\b/g, 'O'],
        [/\bl\b/g, 'I'],
        [/rn/g, 'm']
      ]
    };

    const rules = commonErrors[language] || commonErrors.eng;
    return rules.reduce(
      (current, [pattern, replacement]) => current.replace(pattern, replacement),
      text
    );
  }

  /**
   * Fix accented characters for Spanish and French
   *
   * Rebuilds letters that OCR split into a base letter plus a detached mark.
   * @param {string} text - Recognised text
   * @param {string} language - 'spa' or 'fra'; other languages are returned unchanged
   * @returns {string} Text with the accents recombined
   */
  fixAccentedCharacters(text, language) {
    const accentFixes = {
      spa: [
        [/a'/g, 'á'], [/e'/g, 'é'], [/i'/g, 'í'], [/o'/g, 'ó'], [/u'/g, 'ú'],
        [/A'/g, 'Á'], [/E'/g, 'É'], [/I'/g, 'Í'], [/O'/g, 'Ó'], [/U'/g, 'Ú'],
        [/n~/g, 'ñ'], [/N~/g, 'Ñ']
      ],
      fra: [
        // French only puts the acute accent on "e". Apostrophes after other
        // vowels are elisions ("qu'il") and must be left alone.
        [/e'/g, 'é'],
        [/a`/g, 'à'], [/e`/g, 'è'], [/u`/g, 'ù'],
        [/a\^/g, 'â'], [/e\^/g, 'ê'], [/i\^/g, 'î'], [/o\^/g, 'ô'], [/u\^/g, 'û'],
        // A cedilla only occurs before a, o or u; "avec, " keeps its comma.
        [/c,(?=[aou])/g, 'ç'], [/C,(?=[aou])/g, 'Ç']
      ]
    };

    const fixes = accentFixes[language] || [];
    return fixes.reduce(
      (current, [pattern, replacement]) => current.replace(pattern, replacement),
      text
    );
  }

  /**
   * Fix German umlauts
   *
   * Recombines a vowel with a detached diaeresis (spacing U+00A8 or combining
   * U+0308) into the precomposed umlaut. ASCII digraphs (ae, oe, ue, ss) are
   * deliberately left untouched: they are ordinary letter sequences in words
   * such as "neue", "aktuell" or "Wasser", and rewriting them corrupts text.
   * @param {string} text - Recognised text
   * @returns {string} NFC-normalised text with detached diaereses recombined
   */
  fixGermanUmlauts(text) {
    return text.replace(/([aouAOU])\u00A8/g, '$1\u0308').normalize('NFC');
  }

  /**
   * Fix Arabic text (RTL handling)
   * @param {string} text - Recognised text
   * @returns {string} Trimmed text
   */
  fixArabicText(text) {
    // Basic Arabic text processing
    // In a real implementation, this would handle proper RTL text direction
    return text.trim();
  }

  /**
   * Fix Asian vertical text
   * @param {string} text - Recognised text
   * @param {string} language - Language code (currently unused)
   * @returns {string} Text with runs of newlines collapsed, trimmed
   */
  fixAsianVerticalText(text, language) {
    // Basic processing for vertical text
    // In a real implementation, this would handle vertical text layout
    return text.replace(/\n+/g, '\n').trim();
  }

  /**
   * Process words with language-specific handling
   * @param {Object[]} [words] - Word objects from the OCR result
   * @param {string} language - Language code
   * @returns {Object[]} Copies with processed text, `language`, and confidence scaled to 0-1
   */
  processWords(words, language) {
    if (!words) return [];

    return words.map((word) => ({
      ...word,
      text: this.processWordText(word.text, language),
      language,
      confidence: word.confidence / 100
    }));
  }

  /**
   * Process word text based on language
   * @param {string} text - Word text
   * @param {string} language - Language code
   * @returns {string} Processed word text
   */
  processWordText(text, language) {
    switch (language) {
      case 'deu':
        // German compound word handling
        return this.processGermanCompounds(text);
      case 'chi_sim':
      case 'chi_tra':
        // Chinese word segmentation
        return this.processChineseSegmentation(text);
      default:
        return text;
    }
  }

  /**
   * Process German compound words
   * @param {string} text - Word text
   * @returns {string} The text unchanged (placeholder)
   */
  processGermanCompounds(text) {
    // Simplified compound word processing
    // In a real implementation, this would use proper German compound analysis
    return text;
  }

  /**
   * Process Chinese word segmentation
   * @param {string} text - Word text
   * @returns {string} The text unchanged (placeholder)
   */
  processChineseSegmentation(text) {
    // Simplified Chinese segmentation
    // In a real implementation, this would use proper Chinese word segmentation
    return text;
  }

  /**
   * Process lines with language-specific handling
   * @param {Object[]} [lines] - Line objects from the OCR result
   * @param {string} language - Language code
   * @returns {Object[]} Copies with processed text, `language`, and confidence scaled to 0-1
   */
  processLines(lines, language) {
    if (!lines) return [];

    return lines.map((line) => ({
      ...line,
      text: this.processLineText(line.text, language),
      language,
      confidence: line.confidence / 100
    }));
  }

  /**
   * Process line text based on language
   * @param {string} text - Line text
   * @param {string} language - Language code (currently unused)
   * @returns {string} The text unchanged (placeholder)
   */
  processLineText(text, language) {
    // Apply language-specific line processing
    return text;
  }

  /**
   * Create structured text elements, one per line
   * @param {Object[]} words - Processed words
   * @param {Object[]} lines - Processed lines
   * @param {string} language - Language code
   * @returns {Object[]} Elements with id, type, text, position, confidence, words and font info
   */
  createStructuredText(words, lines, language) {
    return lines.map((line, index) => {
      const { bbox } = line;

      // A word belongs to the line when it sits vertically inside the
      // line's box, with a 5-unit tolerance on each side.
      const lineWords = words.filter(
        (word) =>
          word.bbox &&
          bbox &&
          word.bbox.y0 >= bbox.y0 - 5 &&
          word.bbox.y1 <= bbox.y1 + 5
      );
      const estimatedSize = this.estimateFontSize(bbox);

      return {
        id: `line_${index}`,
        type: this.classifyTextType(line.text, language),
        text: line.text,
        position: {
          x: bbox?.x0 || 0,
          y: bbox?.y0 || 0,
          width: (bbox?.x1 || 0) - (bbox?.x0 || 0),
          height: (bbox?.y1 || 0) - (bbox?.y0 || 0)
        },
        confidence: line.confidence,
        language,
        words: lineWords,
        font_info: {
          estimated_size: estimatedSize,
          size_category: this.categorizeFontSize(estimatedSize)
        }
      };
    });
  }

  /**
   * Classify text type based on content and language
   * @param {string} text - Line text
   * @param {string} language - Language code; languages without their own
   *   patterns use the English ones
   * @returns {string} 'button', 'header', 'label' or 'paragraph'
   */
  classifyTextType(text, language) {
    // Language-specific text classification patterns
    const patterns = {
      header: {
        eng: /^[A-Z][A-Za-z\s]{2,50}$/,
        spa: /^[A-ZÁÉÍÓÚÑÜ][A-Za-záéíóúñü\s]{2,50}$/,
        fra: /^[A-ZÀÂÄÉÈÊËÏÎÔÙÛÜŸÇ][A-Za-zàâäéèêëïîôùûüÿç\s]{2,50}$/
      },
      button: {
        eng: /^(Click|Submit|Send|Save|Login|Register|Continue|Next|Back|Cancel|OK)$/i,
        spa: /^(Hacer clic|Enviar|Guardar|Iniciar sesión|Registrar|Continuar|Siguiente|Atrás|Cancelar)$/i,
        fra: /^(Cliquer|Soumettre|Envoyer|Sauvegarder|Connexion|S'inscrire|Continuer|Suivant|Retour|Annuler)$/i
      }
    };

    const trimmed = text.trim();

    // Buttons are tested first: a capitalised label such as "Submit" also
    // satisfies the broader header pattern and would otherwise never be a button.
    const buttonPattern = patterns.button[language] || patterns.button.eng;
    if (buttonPattern.test(trimmed)) {
      return 'button';
    }

    const headerPattern = patterns.header[language] || patterns.header.eng;
    if (headerPattern.test(trimmed)) {
      return 'header';
    }

    // Default classification by length
    return text.length < 50 ? 'label' : 'paragraph';
  }

  /**
   * Perform language-specific analysis
   * @param {string} text - Processed text
   * @param {string} language - Language code
   * @returns {Promise<Object>} Character, word and sentence counts plus
   *   `language_features` for Spanish, French, German and Chinese
   */
  async performLanguageSpecificAnalysis(text, language) {
    // Sentence terminators per language; unlisted languages use the English set.
    const cjkTerminators = /[。！？!?]+/g;
    const sentencePatterns = {
      eng: /[.!?]+/g,
      spa: /[.!?¿¡]+/g,
      fra: /[.!?]+/g,
      deu: /[.!?]+/g,
      chi_sim: cjkTerminators,
      chi_tra: cjkTerminators,
      jpn: cjkTerminators,
      ara: /[.!؟]+/g
    };

    const words = text.split(/\s+/).filter((word) => word.length > 0);
    const sentencePattern = sentencePatterns[language] || sentencePatterns.eng;
    const sentences = text.split(sentencePattern).filter((s) => s.trim().length > 0);

    const analysis = {
      language,
      character_count: text.length,
      word_count: words.length,
      sentence_count: sentences.length,
      language_features: {}
    };

    // Language-specific features
    switch (language) {
      case 'spa':
        analysis.language_features = {
          has_tildes: /[ñÑ]/.test(text),
          has_accents: /[áéíóúÁÉÍÓÚ]/.test(text),
          inverted_punctuation: /[¿¡]/.test(text)
        };
        break;
      case 'fra':
        analysis.language_features = {
          has_accents: /[àâäéèêëïîôùûüÿç]/.test(text),
          has_cedillas: /[çÇ]/.test(text)
        };
        break;
      case 'deu':
        analysis.language_features = {
          has_umlauts: /[äöüÄÖÜß]/.test(text),
          compound_words: this.detectGermanCompounds(text)
        };
        break;
      case 'chi_sim':
      case 'chi_tra':
        analysis.language_features = {
          // Empty text has no words; report 0 rather than 0/0 (NaN).
          character_density: words.length > 0 ? text.length / words.length : 0,
          has_numbers: /[一二三四五六七八九十]/.test(text)
        };
        break;
      default:
        break;
    }

    return analysis;
  }

  /**
   * Detect German compound words
   * @param {string} text - Text to inspect
   * @returns {number} Count of whitespace-separated tokens longer than 10 characters
   */
  detectGermanCompounds(text) {
    // Simplified compound detection
    return text.split(/\s+/).filter((word) => word.length > 10).length;
  }

  /**
   * Get or create Tesseract worker for language
   *
   * Concurrent requests for the same language share one creation. A failed
   * creation is not cached, so a later call retries.
   * @param {string} language - Tesseract language code
   * @returns {Promise<Object>} Initialised worker
   */
  async getWorker(language) {
    const ready = this.workers.get(language);
    if (ready) return ready;

    const inFlight = this.workerPromises.get(language);
    if (inFlight) return inFlight;

    const pending = (async () => {
      try {
        const worker = await this.createWorker(language);
        this.workers.set(language, worker);
        return worker;
      } finally {
        // Only remove our own entry; cleanup() may have cleared the map and a
        // newer creation for the same language may have been registered since.
        if (this.workerPromises.get(language) === pending) {
          this.workerPromises.delete(language);
        }
      }
    })();

    this.workerPromises.set(language, pending);
    return pending;
  }

  /**
   * Create new Tesseract worker
   * @param {string} language - Tesseract language code
   * @returns {Promise<Object>} Worker with the language loaded and initialised
   * @throws {Error} When the language cannot be loaded or initialised; the
   *   half-built worker is terminated first so it does not leak
   */
  async createWorker(language) {
    const worker = await Tesseract.createWorker();
    try {
      await worker.loadLanguage(language);
      await worker.initialize(language);
      return worker;
    } catch (error) {
      // The setup error is the one worth reporting; a failure while
      // terminating the broken worker would only mask it.
      await worker.terminate().catch(() => {});
      throw error;
    }
  }

  /**
   * Estimate font size from bounding box
   * @param {Object} [bbox] - Box with `y0` and `y1`
   * @returns {number} Box height clamped to 8-72, or 12 when there is no box
   */
  estimateFontSize(bbox) {
    if (!bbox) return 12;
    return Math.max(8, Math.min(72, bbox.y1 - bbox.y0));
  }

  /**
   * Categorize font size
   * @param {number} size - Estimated font size
   * @returns {string} 'small', 'medium', 'large' or 'extra_large'
   */
  categorizeFontSize(size) {
    if (size < 12) return 'small';
    if (size < 18) return 'medium';
    if (size < 24) return 'large';
    return 'extra_large';
  }

  /**
   * Cleanup workers
   *
   * Waits for worker creations that are still in flight so they are
   * terminated as well, then terminates every cached worker.
   * @returns {Promise<void>}
   */
  async cleanup() {
    await Promise.allSettled(Array.from(this.workerPromises.values()));

    const workers = Array.from(this.workers.values());
    // Reset the caches before terminating so a failing terminate() cannot
    // leave dead workers registered.
    this.workers.clear();
    this.workerPromises.clear();

    await Promise.all(workers.map((worker) => worker.terminate()));
  }
}

module.exports = MultiLanguageOCR;