// @thecodingwhale/cv-processor
// CV Processor to extract structured data from PDF resumes (compiled TypeScript → JavaScript)
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.AzureOpenAIProvider = void 0;
const jsonrepair_1 = require("jsonrepair");
const openai_1 = require("openai");
const data_1 = require("../utils/data");
// Pricing table in USD per 1,000 tokens, keyed by model name.
// Frozen so shared module-level pricing cannot be mutated at runtime.
// NOTE(review): rates drift over time — verify against the current Azure OpenAI price list.
const AZURE_OPENAI_PRICING = Object.freeze({
    'gpt-4': Object.freeze({ input: 0.03, output: 0.06 }),
    'gpt-4-turbo': Object.freeze({ input: 0.01, output: 0.03 }),
    'gpt-4o': Object.freeze({ input: 0.0025, output: 0.01 }),
    'gpt-4.1': Object.freeze({ input: 0.002, output: 0.008 }),
    'gpt-4.1-mini': Object.freeze({ input: 0.0006, output: 0.0024 }),
    'gpt-4.1-nano': Object.freeze({ input: 0.0001, output: 0.0004 }),
    'gpt-3.5-turbo': Object.freeze({ input: 0.002, output: 0.006 }),
    // Add more models as needed
    default: Object.freeze({ input: 0.002, output: 0.008 }), // Default fallback pricing
});
// O-series reasoning models require special parameter handling (the request
// must not include `temperature`). Frozen to prevent accidental mutation.
const O_SERIES_MODELS = Object.freeze(['o1', 'o1-mini', 'o3', 'o3-mini', 'o4-mini']);
class AzureOpenAIProvider {
    /**
     * LLM provider backed by Azure OpenAI. Extracts structured JSON from
     * resume images or text according to a caller-supplied JSON schema.
     *
     * @param {object} config - Provider settings: `apiKey`, `endpoint`,
     *   `model`, and optionally `deploymentName`, `apiVersion`, `temperature`.
     */
    constructor(config) {
        this.config = config;
        // Make sure we have a deployment name
        if (!config.deploymentName) {
            console.warn(`[AzureOpenAIProvider] No deploymentName provided, using model name "${config.model}" as the deployment name`);
        }
        // FIX: the original expression preferred config.model over
        // config.deploymentName, contradicting the warning above and silently
        // ignoring an explicitly configured deployment name. The deployment
        // name now takes precedence, with the model name as the fallback.
        const deploymentName = config.deploymentName ? config.deploymentName : config.model;
        console.log(`[AzureOpenAIProvider] Using deployment: ${deploymentName}`);
        // Initialize Azure OpenAI client according to SDK documentation.
        this.client = new openai_1.AzureOpenAI({
            apiKey: config.apiKey,
            endpoint: config.endpoint,
            apiVersion: config.apiVersion || '2024-04-01-preview',
            deployment: deploymentName,
        });
    }
    /**
     * Calculate estimated cost (USD) based on token usage and model.
     * Pricing table values are per 1,000 tokens, hence the /1000 below.
     *
     * @param {number} promptTokens - Input token count.
     * @param {number} completionTokens - Output token count.
     * @param {string} model - Model/deployment name used for pricing lookup.
     * @returns {number} Estimated cost in USD.
     */
    calculateCost(promptTokens, completionTokens, model) {
        // Exact match first; then a case-insensitive substring match;
        // finally the catch-all 'default' entry.
        let pricing = AZURE_OPENAI_PRICING[model];
        if (!pricing) {
            const matchingKey = Object.keys(AZURE_OPENAI_PRICING).find((key) => model.toLowerCase().includes(key.toLowerCase()));
            pricing = matchingKey
                ? AZURE_OPENAI_PRICING[matchingKey]
                : AZURE_OPENAI_PRICING['default'];
        }
        const inputCost = (promptTokens / 1000) * pricing.input;
        const outputCost = (completionTokens / 1000) * pricing.output;
        return inputCost + outputCost;
    }
    /**
     * Rough token estimate: ~4 characters per token for English text.
     * Used only when the API response omits usage counts.
     *
     * @param {string} text
     * @returns {number} Estimated token count.
     */
    estimateTokenCount(text) {
        return Math.ceil(text.length / 4);
    }
    /**
     * Repair (via jsonrepair) and parse the model's JSON reply, replace
     * UUID placeholders, and attach the token-usage summary.
     *
     * @param {string} responseText - Raw text returned by the model.
     * @param {object} tokenUsage - Usage summary from buildTokenUsage().
     * @returns {object} Parsed result with `tokenUsage` attached.
     * @throws {Error} When the reply cannot be repaired into valid JSON.
     */
    parseModelResponse(responseText, tokenUsage) {
        let fixedJson;
        try {
            fixedJson = (0, jsonrepair_1.jsonrepair)(responseText);
        }
        catch (err) {
            // FIX: the original retried the exact same jsonrepair call with the
            // same input inside this catch, which could never succeed; the
            // pointless retry was removed.
            console.error('❌ Could not repair JSON:', err);
            throw new Error(`AI returned invalid JSON: ${err}`);
        }
        const parsedJson = JSON.parse(fixedJson);
        return {
            ...(0, data_1.replaceUUIDv4Placeholders)(parsedJson),
            tokenUsage,
        };
    }
    /**
     * Build the token-usage summary for a completion, falling back to local
     * estimates only when the API omitted the usage block.
     *
     * @param {object} completion - Chat completion response.
     * @param {string} promptText - Text used to estimate prompt tokens.
     * @param {string} responseText - Text used to estimate completion tokens.
     * @returns {{promptTokens: number, completionTokens: number, totalTokens: number, estimatedCost: number}}
     */
    buildTokenUsage(completion, promptText, responseText) {
        // FIX: use ?? instead of || so a legitimate 0 token count reported by
        // the API is not silently replaced by a local estimate.
        const promptTokens = completion.usage?.prompt_tokens ??
            this.estimateTokenCount(promptText);
        const completionTokens = completion.usage?.completion_tokens ??
            this.estimateTokenCount(responseText);
        const totalTokens = completion.usage?.total_tokens ?? promptTokens + completionTokens;
        const modelName = this.config.deploymentName || this.config.model || 'gpt-4.1';
        const estimatedCost = this.calculateCost(promptTokens, completionTokens, modelName);
        return { promptTokens, completionTokens, totalTokens, estimatedCost };
    }
    /**
     * Extract structured data from one or more document images using a
     * vision-capable chat completion.
     *
     * @param {string[]} imageUrls - Image URLs (or data URLs) to analyze.
     * @param {object} dataSchema - JSON schema the reply must match.
     * @param {string} instructions - System-prompt instructions.
     * @returns {Promise<object>} Parsed result with `tokenUsage` attached.
     * @throws Propagates API errors and invalid-JSON errors.
     */
    async extractStructuredDataFromImages(imageUrls, dataSchema, instructions) {
        try {
            const prompt = `
${instructions}
Extract information from the following document according to this JSON schema:
${JSON.stringify(dataSchema, null, 2)}
Your response should be valid JSON that matches this schema.
`;
            // Create messages with the images attached to the user turn.
            const messages = [
                {
                    role: 'system',
                    content: prompt,
                },
                {
                    role: 'user',
                    content: [
                        {
                            type: 'text',
                            text: 'Please analyze this document:',
                        },
                        ...imageUrls.map((imageUrl) => ({
                            type: 'image_url',
                            image_url: {
                                url: imageUrl,
                            },
                        })),
                    ],
                },
            ];
            const model = this.config.model || '';
            const isOSeriesModel = O_SERIES_MODELS.some((m) => model.includes(m));
            const requestParams = {
                messages,
                // FIX: previously hard-coded 'gpt-4.1'; now consistent with the
                // text variant. Required by the OpenAI SDK but Azure routes by
                // deployment, so this value is advisory.
                model: this.config.model || 'gpt-4.1',
            };
            // O-series reasoning models reject the temperature parameter.
            if (!isOSeriesModel) {
                requestParams.temperature = this.config.temperature ?? 0;
            }
            const completion = await this.client.chat.completions.create(requestParams);
            const responseText = completion.choices[0]?.message?.content || '{}';
            const tokenUsage = this.buildTokenUsage(completion, prompt + JSON.stringify(imageUrls), responseText);
            try {
                return this.parseModelResponse(responseText, tokenUsage);
            }
            catch (jsonError) {
                // Message normalized to match the text variant ("Azure OpenAI").
                console.error('Error parsing JSON from Azure OpenAI response:', jsonError);
                throw jsonError;
            }
        }
        catch (error) {
            console.error('Error extracting structured data with Azure OpenAI:', error);
            throw error;
        }
    }
    /**
     * Extract structured data from plain-text content.
     *
     * @param {string[]} texts - Text chunks, joined with blank lines.
     * @param {object} dataSchema - JSON schema the reply must match.
     * @param {string} instructions - System-prompt instructions.
     * @param {*} [categories] - Currently unused; kept for interface
     *   compatibility with callers.
     * @returns {Promise<object>} Parsed result with `tokenUsage` attached.
     * @throws Propagates API errors and invalid-JSON errors.
     */
    async extractStructuredDataFromText(texts, dataSchema, instructions, categories) {
        try {
            const prompt = `
${instructions}
Extract information from the following text according to this JSON schema:
${JSON.stringify(dataSchema, null, 2)}
Your response should be valid JSON that matches this schema.
Text content:
${texts.join('\n\n')}
`;
            const model = this.config.model || '';
            const isOSeriesModel = O_SERIES_MODELS.some((m) => model.includes(m));
            const requestParams = {
                model: this.config.model, // Required by OpenAI SDK but ignored by Azure
                messages: [
                    {
                        role: 'system',
                        content: prompt,
                    },
                ],
            };
            // O-series reasoning models reject the temperature parameter.
            if (!isOSeriesModel) {
                requestParams.temperature = this.config.temperature ?? 0;
            }
            const completion = await this.client.chat.completions.create(requestParams);
            const responseText = completion.choices[0]?.message?.content || '{}';
            const tokenUsage = this.buildTokenUsage(completion, prompt, responseText);
            try {
                return this.parseModelResponse(responseText, tokenUsage);
            }
            catch (jsonError) {
                console.error('Error parsing JSON from Azure OpenAI response:', jsonError);
                throw jsonError;
            }
        }
        catch (error) {
            console.error('Error extracting structured data with Azure OpenAI:', error);
            throw error;
        }
    }
    /**
     * @returns {{provider: string, model: string}} Provider identifier and
     *   the deployment (preferred) or model name.
     */
    getModelInfo() {
        return {
            provider: 'azure',
            model: this.config.deploymentName || this.config.model,
        };
    }
}
// CommonJS export of the provider class (compiled TypeScript module output).
exports.AzureOpenAIProvider = AzureOpenAIProvider;