UNPKG

pdf-ocr-cli

Version:

A CLI tool for OCR processing of PDF files using Mistral API with optional LLM verification

github.com/luandro/pdf-ocr

luandro/pdf-ocr

110 lines (105 loc) • 3.87 kB

JavaScript

"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.verifyContent = verifyContent;
const dotenv_1 = __importDefault(require("dotenv"));
const together_ai_1 = require("together-ai");
const constants_1 = require("./constants");
// Load environment variables (e.g. TOGETHER_API_KEY) via dotenv before any call
dotenv_1.default.config();
/**
 * Verifies and improves OCR text using Together.ai free LLM.
 *
 * This is a best-effort step: if the request fails for any reason, the
 * original text is returned unchanged rather than raising.
 *
 * @param text - The OCR text to verify and improve
 * @param options - Content verification options
 * @param options.verbose - Log progress and failures to the console (default: false)
 * @param options.timeout - Request timeout in milliseconds, handed to the Together client (default: 30000)
 * @param options.maxTokens - Sent as `max_tokens` (default: 1000)
 * @param options.temperature - Sent as `temperature` (default: 0.7)
 * @param options.topP - Sent as `top_p` (default: 0.9)
 * @returns The improved text; an empty string when `text` is empty or
 *   whitespace-only; the original `text` when the request fails or the
 *   response carries no content
 * @throws Error if the TOGETHER_API_KEY environment variable is not set
 */
async function verifyContent(text, options = {}) {
    // Set default options (`??` so that explicit 0 / false values are respected)
    const opts = {
        verbose: options.verbose ?? false,
        timeout: options.timeout ?? 30000,
        maxTokens: options.maxTokens ?? 1000,
        temperature: options.temperature ?? 0.7,
        topP: options.topP ?? 0.9,
    };
    // Check if API key is set. This runs before the empty-text shortcut, so a
    // missing key is reported even when there is nothing to verify.
    if (!process.env.TOGETHER_API_KEY) {
        throw new Error('TOGETHER_API_KEY environment variable is not set');
    }
    // If text is empty, return empty string without calling the API
    if (!text || text.trim().length === 0) {
        if (opts.verbose) {
            console.log('No text to verify');
        }
        return '';
    }
    try {
        // Initialize Together client. The timeout option was previously computed
        // but never used; pass it here so it actually bounds the request.
        // NOTE(review): the SDK may retry timed-out requests (see its
        // `maxRetries` option), so total wall time can exceed `timeout` - confirm.
        const together = new together_ai_1.Together({
            apiKey: process.env.TOGETHER_API_KEY,
            timeout: opts.timeout,
        });
        if (opts.verbose) {
            console.log('Verifying OCR text with Together.ai free LLM...');
        }
        // Create the prompt for the LLM
        const prompt = createPrompt(text);
        if (opts.verbose) {
            console.log('Sending prompt to Together.ai free LLM...');
        }
        // Call the Together.ai free model
        const response = await together.chat.completions.create({
            model: constants_1.VERIFICATION_MODEL,
            messages: [
                {
                    role: 'system',
                    content: constants_1.VERIFICATION_SYSTEM_PROMPT,
                },
                {
                    role: 'user',
                    content: prompt,
                },
            ],
            max_tokens: opts.maxTokens,
            temperature: opts.temperature,
            top_p: opts.topP,
        });
        // Extract the improved text from the response. `||` (not `??`) is
        // deliberate: an empty completion also falls back to the original text.
        const improvedText = response.choices[0]?.message?.content || text;
        if (opts.verbose) {
            console.log('Content verification complete');
            console.log(`Original text length: ${text.length}`);
            console.log(`Improved text length: ${improvedText.length}`);
        }
        return improvedText;
    }
    catch (error) {
        const errorMessage = error instanceof Error ? error.message : String(error);
        if (opts.verbose) {
            console.error(`Content verification failed: ${errorMessage}`);
        }
        // Deliberate best-effort: if verification fails, return the original text
        return text;
    }
}
/**
 * Creates a prompt for the LLM to verify and improve OCR text
 * @param text - The OCR text to verify and improve; interpolated verbatim
 * @returns Prompt for the LLM
 */
function createPrompt(text) {
    // NOTE(review): this template is reproduced exactly as it appears in this
    // copy of the file; its bullet list suggests the original had line breaks
    // that were collapsed - confirm against the upstream source.
    return ` I have some text that was extracted from a PDF using OCR. The OCR process may have introduced errors, such as: - Misrecognized characters - Broken words - Missing punctuation - Incorrect formatting - Garbled text Please fix any errors you find in the text while preserving the original meaning and structure. If you encounter text that seems completely nonsensical, try to make a reasonable guess based on context, but don't invent new content. Here is the OCR text: ${text} Please provide the corrected version of the text. `;
}