/*
 * @thecodingwhale/cv-processor
 * CV Processor to extract structured data from PDF resumes using TypeScript.
 * (Compiled JavaScript output; 252 lines, 9.92 kB.)
 */
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.createOpenAIProvider = createOpenAIProvider;
const child_process_1 = require("child_process");
const openai_1 = require("openai");
const util_1 = require("util");
const execAsync = (0, util_1.promisify)(child_process_1.exec);
/**
* Creates an OpenAI provider with functional implementation
*/
/**
 * Creates an OpenAI-backed provider with a functional implementation.
 *
 * @param {Object} config - Provider configuration.
 * @param {string} config.apiKey - OpenAI API key.
 * @param {string} [config.model] - Model name (defaults to 'gpt-4o').
 * @param {number} [config.temperature] - Sampling temperature (defaults to 0.2;
 *   an explicit 0 is honored).
 * @param {number} [config.maxTokens] - Max completion tokens (defaults to 4096).
 * @returns {{processText: Function, extractStructuredData: Function, getModelInfo: Function}}
 */
function createOpenAIProvider(config) {
    // Initialize the OpenAI client
    const openai = new openai_1.OpenAI({
        apiKey: config.apiKey,
    });
    // Model used by every API call when config.model is not supplied.
    const DEFAULT_MODEL = 'gpt-4o';
    /**
     * Calculate token usage and estimated cost for one completion.
     *
     * Pricing as of May 2024 — subject to change. Unknown models get an
     * estimated cost of 0.
     *
     * @param {number} promptTokens - Tokens consumed by the prompt.
     * @param {number} completionTokens - Tokens produced by the completion.
     * @returns {{promptTokens: number, completionTokens: number, totalTokens: number, estimatedCost: number}}
     */
    function calculateTokenUsage(promptTokens, completionTokens) {
        const totalTokens = promptTokens + completionTokens;
        let estimatedCost = 0;
        // Use the same fallback as the API calls: previously this read
        // config.model directly and threw when the model was unset.
        const model = (config.model || DEFAULT_MODEL).toLowerCase();
        if (model.includes('gpt-4') ||
            model.includes('gpt4') ||
            model.includes('4o')) {
            // GPT-4 and GPT-4o pricing tiers
            if (model.includes('32k')) {
                // GPT-4 32k context
                estimatedCost =
                    (promptTokens / 1000) * 0.06 + (completionTokens / 1000) * 0.12;
            }
            else if (model.includes('4o')) {
                // GPT-4o. NOTE: this check must be '4o', not 'o' — a bare 'o'
                // also matches 'gpt-4-turbo' and would misprice turbo models.
                estimatedCost = (totalTokens / 1000) * 0.015;
            }
            else {
                // Standard GPT-4 (including turbo variants)
                estimatedCost =
                    (promptTokens / 1000) * 0.03 + (completionTokens / 1000) * 0.06;
            }
        }
        else if (model.includes('gpt-3.5') || model.includes('gpt3')) {
            // GPT-3.5 Turbo
            estimatedCost = (totalTokens / 1000) * 0.0015;
        }
        return {
            promptTokens,
            completionTokens,
            totalTokens,
            estimatedCost,
        };
    }
    /**
     * Best-effort repair of malformed JSON returned by the model.
     *
     * Strips markdown code fences and appends missing closing braces/brackets.
     * The bracket counting is naive (it does not account for braces inside
     * string literals), so the result is not guaranteed to parse.
     *
     * @param {string} jsonStr - The raw, possibly malformed JSON text.
     * @returns {string} The repaired candidate JSON string.
     */
    function repairJSON(jsonStr) {
        let repairedJson = jsonStr.trim();
        // 1. Strip markdown code fences (```json ... ```), a common failure
        //    mode even when the model is asked for raw JSON.
        const fenceMatch = repairedJson.match(/^```(?:json)?\s*([\s\S]*?)\s*```$/);
        if (fenceMatch) {
            repairedJson = fenceMatch[1];
        }
        // 2. Balance missing closing brackets/braces.
        const openBraces = (repairedJson.match(/\{/g) || []).length;
        const closeBraces = (repairedJson.match(/\}/g) || []).length;
        const openBrackets = (repairedJson.match(/\[/g) || []).length;
        const closeBrackets = (repairedJson.match(/\]/g) || []).length;
        // Add missing closing braces
        for (let i = 0; i < openBraces - closeBraces; i++) {
            repairedJson += '}';
        }
        // Add missing closing brackets
        for (let i = 0; i < openBrackets - closeBrackets; i++) {
            repairedJson += ']';
        }
        return repairedJson;
    }
    /**
     * Retry the extraction with explicit instructions to emit valid JSON.
     *
     * Token usage from the retry is accumulated onto the original attempt's
     * usage so callers see the true combined cost.
     *
     * @param {string} text - The source text to extract from.
     * @param {Object} dataSchema - JSON schema the response must conform to.
     * @param {string} instructions - Original system instructions.
     * @param {Object} [originalTokenUsage] - Usage from the failed attempt.
     * @returns {Promise<Object>} Parsed data plus combined `tokenUsage`.
     * @throws Rethrows any API error from the retry call.
     */
    async function retryWithValidJSON(text, dataSchema, instructions, originalTokenUsage) {
        try {
            console.log('Retrying with explicit JSON format instructions');
            const retryPrompt = `
${instructions}
IMPORTANT: Your response MUST be valid JSON that strictly conforms to this schema:
${JSON.stringify(dataSchema, null, 2)}
Do not include any non-JSON text in your response.
Do not use markdown code blocks.
Return only the JSON object.
`;
            const completion = await openai.chat.completions.create({
                model: config.model || DEFAULT_MODEL,
                temperature: 0.1, // Lower temperature for more deterministic output
                max_tokens: config.maxTokens || 4096,
                response_format: { type: 'json_object' },
                messages: [
                    {
                        role: 'system',
                        content: retryPrompt,
                    },
                    {
                        role: 'user',
                        content: text,
                    },
                ],
            });
            const responseText = completion.choices[0]?.message?.content || '{}';
            // Calculate token usage for the retry alone
            const retryTokenUsage = calculateTokenUsage(completion.usage?.prompt_tokens || 0, completion.usage?.completion_tokens || 0);
            // Combine original and retry token usage
            const combinedTokenUsage = {
                promptTokens: (originalTokenUsage?.promptTokens || 0) +
                    retryTokenUsage.promptTokens,
                completionTokens: (originalTokenUsage?.completionTokens || 0) +
                    retryTokenUsage.completionTokens,
                totalTokens: (originalTokenUsage?.totalTokens || 0) + retryTokenUsage.totalTokens,
                estimatedCost: (originalTokenUsage?.estimatedCost || 0) +
                    retryTokenUsage.estimatedCost,
            };
            try {
                const parsedResponse = JSON.parse(responseText);
                return {
                    ...parsedResponse,
                    tokenUsage: combinedTokenUsage,
                };
            }
            catch (jsonError) {
                console.error('Error parsing JSON from retry attempt:', jsonError);
                // Return a minimal valid object as fallback so callers still
                // receive the accumulated usage/cost.
                return {
                    tokenUsage: combinedTokenUsage,
                };
            }
        }
        catch (error) {
            console.error('Error in retry attempt:', error);
            throw error;
        }
    }
    // Return the API implementation
    return {
        /**
         * Process text using the OpenAI chat completions API.
         *
         * @param {string} text - User content to process.
         * @param {string} prompt - System prompt.
         * @returns {Promise<{text: string, tokenUsage: Object}>}
         * @throws Rethrows any API error after logging it.
         */
        async processText(text, prompt) {
            try {
                const completion = await openai.chat.completions.create({
                    model: config.model || DEFAULT_MODEL,
                    // `??` (not `||`) so an explicit temperature of 0 is honored
                    temperature: config.temperature ?? 0.2,
                    max_tokens: config.maxTokens || 4096,
                    messages: [
                        {
                            role: 'system',
                            content: prompt,
                        },
                        {
                            role: 'user',
                            content: text,
                        },
                    ],
                });
                // Calculate token usage and estimated cost
                const tokenUsage = calculateTokenUsage(completion.usage?.prompt_tokens || 0, completion.usage?.completion_tokens || 0);
                return {
                    text: completion.choices[0]?.message?.content || '',
                    tokenUsage,
                };
            }
            catch (error) {
                console.error('Error processing text with OpenAI:', error);
                throw error;
            }
        },
        /**
         * Extract structured data from text according to a JSON schema.
         *
         * Falls back through: direct parse → repaired JSON parse → one retry
         * with stricter instructions (which may itself return only usage).
         *
         * @param {string} text - Source text to extract from.
         * @param {Object} dataSchema - JSON schema for the expected data.
         * @param {string} instructions - System instructions for extraction.
         * @returns {Promise<Object>} Parsed data plus `tokenUsage`.
         * @throws Rethrows any API error after logging it.
         */
        async extractStructuredData(text, dataSchema, instructions) {
            try {
                const prompt = `
${instructions}
Extract information from the following text according to this JSON schema:
${JSON.stringify(dataSchema, null, 2)}
Your response should be valid JSON that matches this schema.
`;
                const completion = await openai.chat.completions.create({
                    model: config.model || DEFAULT_MODEL,
                    // `??` (not `||`) so an explicit temperature of 0 is honored
                    temperature: config.temperature ?? 0.2,
                    max_tokens: config.maxTokens || 4096,
                    response_format: { type: 'json_object' },
                    messages: [
                        {
                            role: 'system',
                            content: prompt,
                        },
                        {
                            role: 'user',
                            content: text,
                        },
                    ],
                });
                const responseText = completion.choices[0]?.message?.content || '{}';
                // Calculate token usage and estimated cost
                const tokenUsage = calculateTokenUsage(completion.usage?.prompt_tokens || 0, completion.usage?.completion_tokens || 0);
                try {
                    // Try to parse the JSON response
                    const parsedResponse = JSON.parse(responseText);
                    return {
                        ...parsedResponse,
                        tokenUsage,
                    };
                }
                catch (jsonError) {
                    console.error('Error parsing JSON from OpenAI response:', jsonError);
                    // Try to repair the JSON
                    const repairedJson = repairJSON(responseText);
                    try {
                        // Try to parse the repaired JSON
                        const parsedResponse = JSON.parse(repairedJson);
                        console.log('Successfully repaired JSON response');
                        return {
                            ...parsedResponse,
                            tokenUsage,
                        };
                    }
                    catch (repairError) {
                        console.error('Failed to repair JSON:', repairError);
                        // If all else fails, retry with a more explicit instruction
                        return retryWithValidJSON(text, dataSchema, instructions, tokenUsage);
                    }
                }
            }
            catch (error) {
                console.error('Error extracting structured data with OpenAI:', error);
                throw error;
            }
        },
        /**
         * Get provider/model information.
         *
         * @returns {{provider: string, model: string}}
         */
        getModelInfo() {
            return {
                provider: 'openai',
                model: config.model || DEFAULT_MODEL,
            };
        },
    };
}