pdf-ocr-cli
Version:
A CLI tool for OCR processing of PDF files using Mistral API with optional LLM verification
110 lines (105 loc) • 3.87 kB
JavaScript
;
// CommonJS / ES-module interop shim: reuse a helper already attached to `this`
// when one exists, otherwise fall back to the local definition below.
var __importDefault = (this && this.__importDefault) || function (mod) {
    const alreadyEsModule = mod && mod.__esModule;
    if (alreadyEsModule) {
        // Flagged ES modules are handed back untouched.
        return mod;
    }
    // Anything else is wrapped so callers can always read `.default`.
    return { "default": mod };
};
// Flag this module's exports object with `__esModule: true`.
Object.defineProperty(exports, "__esModule", { value: true });
// Public API: verifyContent is a function declaration further down in this
// file, so it is hoisted and can be assigned here before its definition.
exports.verifyContent = verifyContent;
// dotenv goes through __importDefault, so it is accessed as `dotenv_1.default`.
const dotenv_1 = __importDefault(require("dotenv"));
const together_ai_1 = require("together-ai");
const constants_1 = require("./constants");
// Load environment variables at module load time, before any call to
// verifyContent reads process.env.TOGETHER_API_KEY.
dotenv_1.default.config();
/**
 * Verifies and improves OCR text using Together.ai free LLM.
 *
 * Verification is best-effort: if the API call fails for any reason the error
 * is swallowed (it is only logged when `verbose` is set) and the original
 * text is returned unchanged.
 *
 * @param text - The OCR text to verify and improve
 * @param options - Content verification options
 * @param options.verbose - Log progress and failures to the console (default: false)
 * @param options.timeout - Timeout in milliseconds handed to the Together client (default: 30000)
 * @param options.maxTokens - Value sent as `max_tokens` (default: 1000)
 * @param options.temperature - Value sent as `temperature` (default: 0.7)
 * @param options.topP - Value sent as `top_p` (default: 0.9)
 * @returns The improved text; '' when the input is empty or whitespace-only;
 *          the original text when the call fails or the response has no content
 * @throws Error if the TOGETHER_API_KEY environment variable is not set
 */
async function verifyContent(text, options = {}) {
    // Set default options (`??` keeps explicit falsy values such as temperature 0)
    const opts = {
        verbose: options.verbose ?? false,
        timeout: options.timeout ?? 30000,
        maxTokens: options.maxTokens ?? 1000,
        temperature: options.temperature ?? 0.7,
        topP: options.topP ?? 0.9,
    };
    // Check if API key is set. This runs before the empty-text shortcut, so a
    // missing key is reported even when there is nothing to verify.
    if (!process.env.TOGETHER_API_KEY) {
        throw new Error('TOGETHER_API_KEY environment variable is not set');
    }
    // If text is empty, return empty string
    if (!text || text.trim().length === 0) {
        if (opts.verbose) {
            console.log('No text to verify');
        }
        return '';
    }
    try {
        // Initialize Together client
        const together = new together_ai_1.Together({
            apiKey: process.env.TOGETHER_API_KEY,
            // The timeout option used to be computed and then ignored; hand it
            // to the client so the caller's value actually takes effect.
            // NOTE(review): the SDK may retry timed-out requests, so total wall
            // time could exceed this value - confirm against its retry settings.
            timeout: opts.timeout,
        });
        if (opts.verbose) {
            console.log('Verifying OCR text with Together.ai free LLM...');
        }
        // Create the prompt for the LLM
        const prompt = createPrompt(text);
        if (opts.verbose) {
            console.log('Sending prompt to Together.ai free LLM...');
        }
        // Call the Together.ai free model
        // NOTE(review): output is presumably capped at maxTokens, so long OCR
        // text may come back cut short - confirm and size maxTokens accordingly.
        const response = await together.chat.completions.create({
            model: constants_1.VERIFICATION_MODEL,
            messages: [
                {
                    role: 'system',
                    content: constants_1.VERIFICATION_SYSTEM_PROMPT,
                },
                {
                    role: 'user',
                    content: prompt,
                },
            ],
            max_tokens: opts.maxTokens,
            temperature: opts.temperature,
            top_p: opts.topP,
        });
        // Extract the improved text from the response; an empty or missing
        // completion falls back to the original text.
        const improvedText = response.choices[0]?.message?.content || text;
        if (opts.verbose) {
            console.log('Content verification complete');
            console.log(`Original text length: ${text.length}`);
            console.log(`Improved text length: ${improvedText.length}`);
        }
        return improvedText;
    }
    catch (error) {
        const errorMessage = error instanceof Error ? error.message : String(error);
        if (opts.verbose) {
            console.error(`Content verification failed: ${errorMessage}`);
        }
        // Deliberate best-effort: if verification fails, return the original text
        return text;
    }
}
/**
 * Builds the user message asking the LLM to correct OCR output.
 * The OCR text is embedded verbatim between a fixed instruction header and a
 * closing request; the result begins and ends with a newline.
 * @param text - The OCR text to verify and improve
 * @returns Prompt for the LLM
 */
function createPrompt(text) {
    const promptLines = [
        '',
        'I have some text that was extracted from a PDF using OCR. The OCR process may have introduced errors, such as:',
        '- Misrecognized characters',
        '- Broken words',
        '- Missing punctuation',
        '- Incorrect formatting',
        '- Garbled text',
        "Please fix any errors you find in the text while preserving the original meaning and structure. If you encounter text that seems completely nonsensical, try to make a reasonable guess based on context, but don't invent new content.",
        'Here is the OCR text:',
        // Interpolate through a template so the value is stringified the same
        // way as direct template embedding (join() would blank null/undefined).
        `${text}`,
        'Please provide the corrected version of the text.',
        '',
    ];
    return promptLines.join('\n');
}