UNPKG

@thecodingwhale/cv-processor

Version:

CV Processor to extract structured data from PDF resumes using TypeScript

232 lines (226 loc) 9.71 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.AzureOpenAIProvider = void 0; const jsonrepair_1 = require("jsonrepair"); const openai_1 = require("openai"); const data_1 = require("../utils/data"); const AZURE_OPENAI_PRICING = { 'gpt-4': { input: 0.03, output: 0.06 }, 'gpt-4-turbo': { input: 0.01, output: 0.03 }, 'gpt-4o': { input: 0.0025, output: 0.01 }, 'gpt-4.1': { input: 0.002, output: 0.008 }, 'gpt-4.1-mini': { input: 0.0006, output: 0.0024 }, 'gpt-4.1-nano': { input: 0.0001, output: 0.0004 }, 'gpt-3.5-turbo': { input: 0.002, output: 0.006 }, // Add more models as needed default: { input: 0.002, output: 0.008 }, // Default fallback pricing }; // O series models that require special parameter handling (no temperature) const O_SERIES_MODELS = ['o1', 'o1-mini', 'o3', 'o3-mini', 'o4-mini']; class AzureOpenAIProvider { constructor(config) { this.config = config; // Make sure we have a deployment name if (!config.deploymentName) { console.warn(`[AzureOpenAIProvider] No deploymentName provided, using model name "${config.model}" as the deployment name`); } const deploymentName = config.model ? config.model : config.deploymentName; console.log(`[AzureOpenAIProvider] Using deployment: ${deploymentName}`); // Initialize Azure OpenAI client according to documentation this.client = new openai_1.AzureOpenAI({ apiKey: config.apiKey, endpoint: config.endpoint, apiVersion: config.apiVersion || '2024-04-01-preview', deployment: deploymentName, }); } /** * Calculate estimated cost based on token usage and model */ calculateCost(promptTokens, completionTokens, model) { // First try to match by specific model name let pricing = AZURE_OPENAI_PRICING[model]; // If not found, try to match by partial model name if (!pricing) { const matchingKey = Object.keys(AZURE_OPENAI_PRICING).find((key) => model.toLowerCase().includes(key.toLowerCase())); pricing = matchingKey ? AZURE_OPENAI_PRICING[matchingKey] : AZURE_OPENAI_PRICING['default']; } const inputCost = (promptTokens / 1000) * pricing.input; const outputCost = (completionTokens / 1000) * pricing.output; return inputCost + outputCost; } /** * Estimate token count based on text content */ estimateTokenCount(text) { // Simple estimation: ~4 characters per token for English text return Math.ceil(text.length / 4); } async extractStructuredDataFromImages(imageUrls, dataSchema, instructions) { try { const prompt = ` ${instructions} Extract information from the following document according to this JSON schema: ${JSON.stringify(dataSchema, null, 2)} Your response should be valid JSON that matches this schema. `; let completion; // Create messages with the images const messages = [ { role: 'system', content: prompt, }, { role: 'user', content: [ { type: 'text', text: 'Please analyze this document:', }, ...imageUrls.map((imageUrl) => ({ type: 'image_url', image_url: { url: imageUrl, }, })), ], }, ]; const model = this.config.model || ''; const isOSeriesModel = O_SERIES_MODELS.some((m) => model.includes(m)); // Create request parameters for vision model const requestParams = { messages: messages, model: 'gpt-4.1', // Required by OpenAI SDK but ignored by Azure }; // Only add temperature for non-O series models if (!isOSeriesModel) { requestParams.temperature = this.config.temperature || 0; } completion = await this.client.chat.completions.create(requestParams); const responseText = completion.choices[0]?.message?.content || '{}'; // Extract token usage information const promptTokens = completion.usage?.prompt_tokens || this.estimateTokenCount(prompt + JSON.stringify(imageUrls)); const completionTokens = completion.usage?.completion_tokens || this.estimateTokenCount(responseText); const totalTokens = completion.usage?.total_tokens || promptTokens + completionTokens; // Calculate estimated cost const modelName = this.config.deploymentName || this.config.model || 'gpt-4.1'; const estimatedCost = this.calculateCost(promptTokens, completionTokens, modelName); // Create token usage object const tokenUsage = { promptTokens, completionTokens, totalTokens, estimatedCost, }; try { let fixedJson; try { fixedJson = (0, jsonrepair_1.jsonrepair)(responseText); } catch (err) { try { fixedJson = (0, jsonrepair_1.jsonrepair)(responseText); } catch (err) { console.error('❌ Could not repair JSON:', err); throw new Error(`AI returned invalid JSON: ${err}`); } } const parsedJson = JSON.parse(fixedJson); return { ...(0, data_1.replaceUUIDv4Placeholders)(parsedJson), tokenUsage, }; } catch (jsonError) { console.error('Error parsing JSON from OpenAI response:', jsonError); throw jsonError; } } catch (error) { console.error('Error extracting structured data with Azure OpenAI:', error); throw error; } } async extractStructuredDataFromText(texts, dataSchema, instructions, categories) { try { const prompt = ` ${instructions} Extract information from the following text according to this JSON schema: ${JSON.stringify(dataSchema, null, 2)} Your response should be valid JSON that matches this schema. Text content: ${texts.join('\n\n')} `; const model = this.config.model || ''; const isOSeriesModel = O_SERIES_MODELS.some((m) => model.includes(m)); // Create request parameters const requestParams = { model: this.config.model, // Required by OpenAI SDK but ignored by Azure messages: [ { role: 'system', content: prompt, }, ], }; // Only add temperature for non-O series models if (!isOSeriesModel) { requestParams.temperature = this.config.temperature || 0; } const completion = await this.client.chat.completions.create(requestParams); const responseText = completion.choices[0]?.message?.content || '{}'; // Extract token usage information const promptTokens = completion.usage?.prompt_tokens || this.estimateTokenCount(prompt); const completionTokens = completion.usage?.completion_tokens || this.estimateTokenCount(responseText); const totalTokens = completion.usage?.total_tokens || promptTokens + completionTokens; // Calculate estimated cost const modelName = this.config.deploymentName || this.config.model || 'gpt-4.1'; const estimatedCost = this.calculateCost(promptTokens, completionTokens, modelName); // Create token usage object const tokenUsage = { promptTokens, completionTokens, totalTokens, estimatedCost, }; try { let fixedJson; try { fixedJson = (0, jsonrepair_1.jsonrepair)(responseText); } catch (err) { console.error('❌ Could not repair JSON:', err); throw new Error(`AI returned invalid JSON: ${err}`); } const parsedJson = JSON.parse(fixedJson); return { ...(0, data_1.replaceUUIDv4Placeholders)(parsedJson), tokenUsage, }; } catch (jsonError) { console.error('Error parsing JSON from Azure OpenAI response:', jsonError); throw jsonError; } } catch (error) { console.error('Error extracting structured data with Azure OpenAI:', error); throw error; } } getModelInfo() { return { provider: 'azure', model: this.config.deploymentName || this.config.model, }; } } exports.AzureOpenAIProvider = AzureOpenAIProvider;