llm-extract
Version:
Modular SDK for structured text extraction from documents using LLMs
104 lines • 4.55 kB
JavaScript
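// Worker-thread entry point for chunk extraction. Expects workerData of the
// shape { chunkText, globalOffset, options, llmConfig, workerId } and replies
// via postMessage with { success: true, result } on success or
// { success: false, error, workerId } on failure.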
Object.defineProperty(exports, "__esModule", { value: true });
const worker_threads_1 = require("worker_threads");
const azure_openai_1 = require("../providers/azure-openai");
// Only run when executed as a worker thread (parentPort is null on the main thread).
if (worker_threads_1.parentPort) {
    const data = worker_threads_1.workerData;
    const { chunkText, globalOffset, options, llmConfig, workerId } = data;
    // Process a single chunk: validate inputs, call the LLM, parse the
    // extractions, ground them to source offsets, and post the result back.
    async function processChunk() {
        const startTime = Date.now();
        try {
            if (!chunkText || typeof chunkText !== 'string') {
                throw new Error('Invalid chunk text provided to worker');
            }
            if (!llmConfig || !llmConfig.apiKey || !llmConfig.endpoint) {
                throw new Error('Invalid LLM configuration provided to worker');
            }
            const llmProvider = new azure_openai_1.AzureOpenAIProvider(llmConfig);
            const prompt = buildLangExtractPrompt(chunkText, options);
            const llmRequest = {
                prompt,
                systemPrompt: 'You are an expert at extracting structured information from text. Follow the instructions exactly and return valid JSON.',
                // ?? instead of || so an explicit temperature of 0 is honored.
                temperature: options.temperature ?? 0.1,
                maxTokens: options.maxTokens || 2000
            };
            const response = await llmProvider.generateCompletion(llmRequest);
            const extractions = parseExtractionsFromResponse(response.content);
            const groundedExtractions = addSourceGrounding(extractions, chunkText, globalOffset);
            const result = {
                chunkIndex: 0, // Will be set by main thread
                startOffset: globalOffset,
                text: chunkText,
                extractions: groundedExtractions,
                workerId,
                processingTimeMs: Date.now() - startTime
            };
            worker_threads_1.parentPort.postMessage({ success: true, result });
        }
        catch (error) {
            worker_threads_1.parentPort.postMessage({
                success: false,
                error: `Worker ${workerId} error: ${error.message}`,
                workerId
            });
        }
    }
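    // A minimal sketch of the parent-thread side (hypothetical: the chunking
    // variables and file location are illustrative; only the workerData fields
    // and message shapes are taken from this script):
    //
    //   const { Worker } = require('worker_threads');
    //   const worker = new Worker(require.resolve('./chunk-worker-script'), {
    //       workerData: { chunkText, globalOffset, options, llmConfig, workerId }
    //   });
    //   worker.on('message', (msg) => {
    //       if (msg.success) {
    //           msg.result.chunkIndex = chunkIndex; // assigned by the main thread
    //       } else {
    //           console.error(msg.error);
    //       }
    //   });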
    // Build a few-shot extraction prompt: task description, optional examples,
    // the chunk to process, and the required JSON output format.
    function buildLangExtractPrompt(chunkText, options) {
        let prompt = `${options.promptDescription}\n\n`;
        if (options.examples && options.examples.length > 0) {
            prompt += "Examples:\n";
            for (const example of options.examples) {
                prompt += `Text: "${example.text}"\n`;
                prompt += `Extractions: ${JSON.stringify(example.extractions)}\n\n`;
            }
        }
        prompt += `Now extract from this text:\n"${chunkText}"\n\n`;
        prompt += 'Return a JSON array of extraction objects with the format: [{"extraction_class": "class_name", "extraction_text": "extracted_text"}]';
        return prompt;
    }
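    // Rendered prompt for a hypothetical promptDescription "Extract person
    // names." with one example:
    //
    //   Extract person names.
    //
    //   Examples:
    //   Text: "Ada Lovelace wrote the first program."
    //   Extractions: [{"extraction_class":"person","extraction_text":"Ada Lovelace"}]
    //
    //   Now extract from this text:
    //   "<chunk text here>"
    //
    //   Return a JSON array of extraction objects with the format: [{"extraction_class": "class_name", "extraction_text": "extracted_text"}]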
    // Pull the bracketed span out of the model response and parse it as JSON.
    // The greedy regex matches from the first '[' to the last ']', which
    // tolerates prose around the array. Returns [] on any failure.
    function parseExtractionsFromResponse(content) {
        try {
            const jsonMatch = content.match(/\[[\s\S]*\]/);
            if (!jsonMatch) {
                console.warn('No JSON array found in response');
                return [];
            }
            const parsed = JSON.parse(jsonMatch[0]);
            return Array.isArray(parsed) ? parsed : [];
        }
        catch (error) {
            console.warn('Failed to parse extractions:', error);
            return [];
        }
    }
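    // Behavior on hypothetical responses:
    //   parseExtractionsFromResponse('Sure! [{"extraction_class": "person", "extraction_text": "Ada"}]')
    //   // => [{ extraction_class: 'person', extraction_text: 'Ada' }]
    //   parseExtractionsFromResponse('No entities found.')
    //   // => [] (warns "No JSON array found in response")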
    // Attach source grounding to each extraction: locate the extracted text in
    // the chunk and record its absolute character interval in the document.
    function addSourceGrounding(extractions, chunkText, globalOffset) {
        if (!Array.isArray(extractions) || !chunkText) {
            return extractions || [];
        }
        return extractions.map(extraction => {
            if (!extraction || !extraction.extraction_text) {
                return extraction;
            }
            const interval = findTextInterval(extraction.extraction_text, chunkText, globalOffset);
            return {
                ...extraction,
                char_interval: interval,
                alignment_status: interval ? 'exact' : 'not_found'
            };
        });
    }
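    // For a chunk that starts at document offset 500 (hypothetical values):
    //   addSourceGrounding([{ extraction_class: 'person', extraction_text: 'Ada' }], 'Ada wrote code.', 500)
    //   // => [{ extraction_class: 'person', extraction_text: 'Ada',
    //   //       char_interval: [500, 503], alignment_status: 'exact' }]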
    // Case-insensitively locate searchText within sourceText and return its
    // absolute character interval, shifted by the chunk's global offset;
    // undefined if absent.
    function findTextInterval(searchText, sourceText, globalOffset) {
        if (!searchText || !sourceText || typeof searchText !== 'string' || typeof sourceText !== 'string') {
            return undefined;
        }
        const index = sourceText.toLowerCase().indexOf(searchText.toLowerCase());
        if (index === -1) {
            return undefined;
        }
        const startChar = globalOffset + index;
        const endChar = startChar + searchText.length;
        return [startChar, endChar];
    }
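    // Matching is case-insensitive; intervals are half-open [start, end)
    // character positions in the full document (hypothetical values):
    //   findTextInterval('ada lovelace', 'Ada Lovelace wrote code.', 100)
    //   // => [100, 112]
    //   findTextInterval('Babbage', 'Ada Lovelace wrote code.', 100)
    //   // => undefined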
    processChunk();
}
//# sourceMappingURL=chunk-worker-script.js.map