UNPKG

@thecodingwhale/cv-processor

Version:

A CV processor that extracts structured data from PDF resumes, written in TypeScript

252 lines (247 loc) 9.92 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.createOpenAIProvider = createOpenAIProvider;
const child_process_1 = require("child_process");
const openai_1 = require("openai");
const util_1 = require("util");
// NOTE(review): execAsync is never used in this module — candidate for removal
// along with the child_process/util requires once confirmed unused elsewhere.
const execAsync = (0, util_1.promisify)(child_process_1.exec);
/**
 * Creates an OpenAI provider with a functional implementation.
 *
 * @param {Object} config - Provider configuration.
 * @param {string} config.apiKey - OpenAI API key.
 * @param {string} [config.model] - Model name (defaults to 'gpt-4o' at call time).
 * @param {number} [config.temperature] - Sampling temperature (0 is valid; defaults to 0.2).
 * @param {number} [config.maxTokens] - Max completion tokens (defaults to 4096).
 * @returns {{processText: Function, extractStructuredData: Function, getModelInfo: Function}}
 */
function createOpenAIProvider(config) {
    // Initialize the OpenAI client
    const openai = new openai_1.OpenAI({
        apiKey: config.apiKey,
    });
    /**
     * Calculate token usage and estimated cost (USD).
     * Pricing as of May 2024 — subject to change.
     *
     * @param {number} promptTokens
     * @param {number} completionTokens
     * @returns {{promptTokens: number, completionTokens: number, totalTokens: number, estimatedCost: number}}
     */
    function calculateTokenUsage(promptTokens, completionTokens) {
        const totalTokens = promptTokens + completionTokens;
        let estimatedCost = 0;
        // BUGFIX: tolerate an undefined model the same way the API calls do
        // (they all fall back to 'gpt-4o'); previously this line threw.
        const model = (config.model || 'gpt-4o').toLowerCase();
        if (model.includes('gpt-4') || model.includes('gpt4') || model.includes('4o')) {
            // GPT-4 and GPT-4o pricing tiers
            if (model.includes('32k')) {
                // GPT-4 32k context
                estimatedCost = (promptTokens / 1000) * 0.06 + (completionTokens / 1000) * 0.12;
            }
            else if (model.includes('4o')) {
                // GPT-4o
                // BUGFIX: was `model.includes('o')`, which matched any model name
                // containing an 'o' (e.g. "gpt-4-turbo") and mispriced it as GPT-4o.
                estimatedCost = (totalTokens / 1000) * 0.015;
            }
            else {
                // Standard GPT-4
                estimatedCost = (promptTokens / 1000) * 0.03 + (completionTokens / 1000) * 0.06;
            }
        }
        else if (model.includes('gpt-3.5') || model.includes('gpt3')) {
            // GPT-3.5 Turbo
            estimatedCost = (totalTokens / 1000) * 0.0015;
        }
        return {
            promptTokens,
            completionTokens,
            totalTokens,
            estimatedCost,
        };
    }
    /**
     * Repair malformed JSON by appending missing closing braces/brackets.
     * Heuristic only: counts delimiters without regard to string contents.
     *
     * @param {string} jsonStr - Possibly malformed JSON text.
     * @returns {string} The (possibly) repaired JSON text.
     */
    function repairJSON(jsonStr) {
        let repairedJson = jsonStr.trim();
        // Fix common JSON errors
        // 1. Missing closing brackets/braces
        const openBraces = (repairedJson.match(/\{/g) || []).length;
        const closeBraces = (repairedJson.match(/\}/g) || []).length;
        const openBrackets = (repairedJson.match(/\[/g) || []).length;
        const closeBrackets = (repairedJson.match(/\]/g) || []).length;
        // Add missing closing braces
        for (let i = 0; i < openBraces - closeBraces; i++) {
            repairedJson += '}';
        }
        // Add missing closing brackets
        for (let i = 0; i < openBrackets - closeBrackets; i++) {
            repairedJson += ']';
        }
        return repairedJson;
    }
    /**
     * Retry the extraction with explicit instructions to get valid JSON.
     * Token usage from the retry is folded into `originalTokenUsage`.
     *
     * @param {string} text - Source text to extract from.
     * @param {Object} dataSchema - JSON schema the response must conform to.
     * @param {string} instructions - Original extraction instructions.
     * @param {Object} [originalTokenUsage] - Usage accumulated by the failed attempt.
     * @returns {Promise<Object>} Parsed data plus combined `tokenUsage`.
     * @throws Rethrows any OpenAI API error.
     */
    async function retryWithValidJSON(text, dataSchema, instructions, originalTokenUsage) {
        try {
            console.log('Retrying with explicit JSON format instructions');
            const retryPrompt = `
${instructions}

IMPORTANT: Your response MUST be valid JSON that strictly conforms to this schema:
${JSON.stringify(dataSchema, null, 2)}

Do not include any non-JSON text in your response. Do not use markdown code blocks. Return only the JSON object.
`;
            const completion = await openai.chat.completions.create({
                model: config.model || 'gpt-4o',
                temperature: 0.1, // Lower temperature for more deterministic output
                // BUGFIX: `??` instead of `||` so an explicit 0 is not silently replaced.
                max_tokens: config.maxTokens ?? 4096,
                response_format: { type: 'json_object' },
                messages: [
                    {
                        role: 'system',
                        content: retryPrompt,
                    },
                    {
                        role: 'user',
                        content: text,
                    },
                ],
            });
            const responseText = completion.choices[0]?.message?.content || '{}';
            // Calculate token usage
            const retryTokenUsage = calculateTokenUsage(completion.usage?.prompt_tokens || 0, completion.usage?.completion_tokens || 0);
            // Combine original and retry token usage
            const combinedTokenUsage = {
                promptTokens: (originalTokenUsage?.promptTokens || 0) + retryTokenUsage.promptTokens,
                completionTokens: (originalTokenUsage?.completionTokens || 0) + retryTokenUsage.completionTokens,
                totalTokens: (originalTokenUsage?.totalTokens || 0) + retryTokenUsage.totalTokens,
                estimatedCost: (originalTokenUsage?.estimatedCost || 0) + retryTokenUsage.estimatedCost,
            };
            try {
                const parsedResponse = JSON.parse(responseText);
                return {
                    ...parsedResponse,
                    tokenUsage: combinedTokenUsage,
                };
            }
            catch (jsonError) {
                console.error('Error parsing JSON from retry attempt:', jsonError);
                // Return a minimal valid object as fallback
                return {
                    tokenUsage: combinedTokenUsage,
                };
            }
        }
        catch (error) {
            console.error('Error in retry attempt:', error);
            throw error;
        }
    }
    // Return the API implementation
    return {
        /**
         * Process text using the OpenAI API.
         *
         * @param {string} text - User content.
         * @param {string} prompt - System prompt.
         * @returns {Promise<{text: string, tokenUsage: Object}>}
         * @throws Rethrows any OpenAI API error.
         */
        async processText(text, prompt) {
            try {
                const completion = await openai.chat.completions.create({
                    model: config.model || 'gpt-4o',
                    // BUGFIX: `??` instead of `||` — a deliberate temperature of 0
                    // (fully deterministic) was previously overridden to 0.2.
                    temperature: config.temperature ?? 0.2,
                    max_tokens: config.maxTokens ?? 4096,
                    messages: [
                        {
                            role: 'system',
                            content: prompt,
                        },
                        {
                            role: 'user',
                            content: text,
                        },
                    ],
                });
                // Calculate token usage and estimated cost
                const tokenUsage = calculateTokenUsage(completion.usage?.prompt_tokens || 0, completion.usage?.completion_tokens || 0);
                return {
                    text: completion.choices[0]?.message?.content || '',
                    tokenUsage,
                };
            }
            catch (error) {
                console.error('Error processing text with OpenAI:', error);
                throw error;
            }
        },
        /**
         * Extract structured data from text according to a JSON schema.
         * Falls back to JSON repair, then to an explicit-format retry,
         * if the model's response fails to parse.
         *
         * @param {string} text - Source text to extract from.
         * @param {Object} dataSchema - JSON schema the response must conform to.
         * @param {string} instructions - Extraction instructions.
         * @returns {Promise<Object>} Parsed data plus `tokenUsage`.
         * @throws Rethrows any OpenAI API error.
         */
        async extractStructuredData(text, dataSchema, instructions) {
            try {
                const prompt = `
${instructions}

Extract information from the following text according to this JSON schema:
${JSON.stringify(dataSchema, null, 2)}

Your response should be valid JSON that matches this schema.
`;
                const completion = await openai.chat.completions.create({
                    model: config.model || 'gpt-4o',
                    // BUGFIX: `??` instead of `||` so temperature 0 is honored.
                    temperature: config.temperature ?? 0.2,
                    max_tokens: config.maxTokens ?? 4096,
                    response_format: { type: 'json_object' },
                    messages: [
                        {
                            role: 'system',
                            content: prompt,
                        },
                        {
                            role: 'user',
                            content: text,
                        },
                    ],
                });
                const responseText = completion.choices[0]?.message?.content || '{}';
                // Calculate token usage and estimated cost
                const tokenUsage = calculateTokenUsage(completion.usage?.prompt_tokens || 0, completion.usage?.completion_tokens || 0);
                try {
                    // Try to parse the JSON response
                    const parsedResponse = JSON.parse(responseText);
                    return {
                        ...parsedResponse,
                        tokenUsage,
                    };
                }
                catch (jsonError) {
                    console.error('Error parsing JSON from OpenAI response:', jsonError);
                    // Try to repair the JSON
                    const repairedJson = repairJSON(responseText);
                    try {
                        // Try to parse the repaired JSON
                        const parsedResponse = JSON.parse(repairedJson);
                        console.log('Successfully repaired JSON response');
                        return {
                            ...parsedResponse,
                            tokenUsage,
                        };
                    }
                    catch (repairError) {
                        console.error('Failed to repair JSON:', repairError);
                        // If all else fails, retry with a more explicit instruction
                        return retryWithValidJSON(text, dataSchema, instructions, tokenUsage);
                    }
                }
            }
            catch (error) {
                console.error('Error extracting structured data with OpenAI:', error);
                throw error;
            }
        },
        /**
         * Get provider/model identification.
         *
         * @returns {{provider: string, model: string}}
         */
        getModelInfo() {
            return {
                provider: 'openai',
                model: config.model || 'gpt-4o',
            };
        },
    };
}