llm-extract

Modular SDK for structured text extraction from documents using LLMs
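
The file below, chunk-worker-script.js, is the package's worker_threads script for processing a single text chunk: it reads the chunk and its global character offset from workerData, prompts an Azure OpenAI deployment for structured extractions, grounds each extraction back to a character interval in the source text, and posts the result (or an error) to the parent thread.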

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); const worker_threads_1 = require("worker_threads"); const azure_openai_1 = require("../providers/azure-openai"); if (worker_threads_1.parentPort) { const data = worker_threads_1.workerData; const { chunkText, globalOffset, options, llmConfig, workerId } = data; async function processChunk() { const startTime = Date.now(); try { if (!chunkText || typeof chunkText !== 'string') { throw new Error('Invalid chunk text provided to worker'); } if (!llmConfig || !llmConfig.apiKey || !llmConfig.endpoint) { throw new Error('Invalid LLM configuration provided to worker'); } const llmProvider = new azure_openai_1.AzureOpenAIProvider(llmConfig); const prompt = buildLangExtractPrompt(chunkText, options); const llmRequest = { prompt, systemPrompt: 'You are an expert at extracting structured information from text. Follow the instructions exactly and return valid JSON.', temperature: options.temperature || 0.1, maxTokens: options.maxTokens || 2000 }; const response = await llmProvider.generateCompletion(llmRequest); const extractions = parseExtractionsFromResponse(response.content); const groundedExtractions = addSourceGrounding(extractions, chunkText, globalOffset); const result = { chunkIndex: 0, // Will be set by main thread startOffset: globalOffset, text: chunkText, extractions: groundedExtractions, workerId, processingTimeMs: Date.now() - startTime }; worker_threads_1.parentPort.postMessage({ success: true, result }); } catch (error) { worker_threads_1.parentPort.postMessage({ success: false, error: `Worker ${workerId} error: ${error.message}`, workerId }); } } function buildLangExtractPrompt(chunkText, options) { let prompt = `${options.promptDescription}\n\n`; if (options.examples && options.examples.length > 0) { prompt += "Examples:\n"; for (const example of options.examples) { prompt += `Text: "${example.text}"\n`; prompt += `Extractions: ${JSON.stringify(example.extractions)}\n\n`; } } prompt += `Now extract from this text:\n"${chunkText}"\n\n`; prompt += 'Return a JSON array of extraction objects with the format: [{"extraction_class": "class_name", "extraction_text": "extracted_text"}]'; return prompt; } function parseExtractionsFromResponse(content) { try { const jsonMatch = content.match(/\[[\s\S]*\]/); if (!jsonMatch) { console.warn('No JSON array found in response'); return []; } const parsed = JSON.parse(jsonMatch[0]); return Array.isArray(parsed) ? parsed : []; } catch (error) { console.warn('Failed to parse extractions:', error); return []; } } function addSourceGrounding(extractions, chunkText, globalOffset) { if (!Array.isArray(extractions) || !chunkText) { return extractions || []; } return extractions.map(extraction => { if (!extraction || !extraction.extraction_text) { return extraction; } const interval = findTextInterval(extraction.extraction_text, chunkText, globalOffset); return { ...extraction, char_interval: interval, alignment_status: interval ? 'exact' : 'not_found' }; }); } function findTextInterval(searchText, sourceText, globalOffset) { if (!searchText || !sourceText || typeof searchText !== 'string' || typeof sourceText !== 'string') { return undefined; } const index = sourceText.toLowerCase().indexOf(searchText.toLowerCase()); if (index === -1) { return undefined; } const startChar = globalOffset + index; const endChar = startChar + searchText.length; return [startChar, endChar]; } processChunk(); } //# sourceMappingURL=chunk-worker-script.js.map