@thecodingwhale/cv-processor
Version:
CV Processor to extract structured data from PDF resumes using TypeScript
269 lines (235 loc) • 7.98 kB
text/typescript
import { jsonrepair } from 'jsonrepair'
import { OpenAI } from 'openai'
import { AIModelConfig, AIProvider, TokenUsageInfo } from '../types/AIProvider'
import { replaceUUIDv4Placeholders } from '../utils/data'
/**
* Pricing information for OpenAI models (USD per 1K tokens)
*/
interface ModelPricing {
input: number
output: number
}
const OPENAI_PRICING: Record<string, ModelPricing> = {
'gpt-4o-2024-11-20': { input: 0.0025, output: 0.01 },
'gpt-4o-2024-08-06': { input: 0.0025, output: 0.01 },
'gpt-4o-2024-05-13': { input: 0.0025, output: 0.01 },
'gpt-4o': { input: 0.0025, output: 0.01 },
'gpt-4o-mini': { input: 0.00015, output: 0.0006 },
'gpt-4.5-preview': { input: 0.075, output: 0.15 },
'gpt-4.1': { input: 0.002, output: 0.008 },
'gpt-4.1-mini': { input: 0.0004, output: 0.0016 },
'gpt-4.1-nano': { input: 0.0001, output: 0.0004 },
'gpt-4-turbo': { input: 0.01, output: 0.03 },
'gpt-4': { input: 0.03, output: 0.06 },
'gpt-3.5-turbo': { input: 0.0005, output: 0.0015 },
o3: { input: 0.01, output: 0.04 },
'o3-mini': { input: 0.0011, output: 0.0044 },
'o4-mini': { input: 0.0011, output: 0.0044 },
o1: { input: 0.015, output: 0.06 },
'o1-mini': { input: 0.0011, output: 0.0044 },
// Default
default: { input: 0.0025, output: 0.01 }, // Default fallback pricing
}
// O series models that require special parameter handling
const O_SERIES_MODELS = ['o1', 'o1-mini', 'o3', 'o3-mini', 'o4-mini']
export class OpenAIProvider implements AIProvider {
private openai: OpenAI
private config: AIModelConfig
constructor(config: AIModelConfig) {
this.config = config
this.openai = new OpenAI({
apiKey: config.apiKey,
})
}
/**
* Calculate estimated cost based on token usage and model
*/
private calculateCost(
promptTokens: number,
completionTokens: number,
model: string
): number {
const pricing = OPENAI_PRICING[model] || OPENAI_PRICING['default']
const inputCost = (promptTokens / 1000) * pricing.input
const outputCost = (completionTokens / 1000) * pricing.output
return inputCost + outputCost
}
async extractStructuredDataFromImages<T>(
imageUrls: string[],
dataSchema: object,
instructions: string
): Promise<T & { tokenUsage?: TokenUsageInfo }> {
try {
const prompt = `
${instructions}
Extract information from the following text according to this JSON schema:
${JSON.stringify(dataSchema, null, 2)}
Your response should be valid JSON that matches this schema.
`
const model = this.config.model || 'gpt-4o'
const isOSeriesModel = O_SERIES_MODELS.includes(model)
// Create the request parameters based on the model
const requestParams: any = {
model: model,
response_format: { type: 'json_object' },
messages: [
{
role: 'system',
content: prompt,
},
{
role: 'user',
content: [
{
type: 'text' as const,
text: 'Please analyze this document:',
},
...imageUrls.map((imageUrl) => ({
type: 'image_url' as const,
image_url: {
url: imageUrl,
},
})),
],
},
],
}
// Add appropriate parameters based on model series
if (isOSeriesModel) {
requestParams.max_completion_tokens = this.config.maxTokens || 4096
} else {
requestParams.temperature = this.config.temperature || 0
requestParams.max_tokens = this.config.maxTokens || 4096
}
const completion = await this.openai.chat.completions.create(
requestParams
)
const responseText = completion.choices[0]?.message?.content || '{}'
// Extract token usage information
const promptTokens = completion.usage?.prompt_tokens || 0
const completionTokens = completion.usage?.completion_tokens || 0
const totalTokens = completion.usage?.total_tokens || 0
// Calculate estimated cost
const estimatedCost = this.calculateCost(
promptTokens,
completionTokens,
model
)
// Create token usage object
const tokenUsage: TokenUsageInfo = {
promptTokens,
completionTokens,
totalTokens,
estimatedCost,
}
try {
let fixedJson
try {
fixedJson = jsonrepair(responseText)
} catch (err) {
try {
fixedJson = jsonrepair(responseText)
} catch (err) {
console.error('❌ Could not repair JSON:', err)
throw new Error(`AI returned invalid JSON: ${err}`)
}
}
const parsedJson = JSON.parse(fixedJson)
return {
...replaceUUIDv4Placeholders(parsedJson),
tokenUsage,
}
} catch (jsonError) {
console.error('Error parsing JSON from OpenAI response:', jsonError)
throw jsonError
}
} catch (error) {
console.error('Error extracting structured data with OpenAI:', error)
throw error
}
}
async extractStructuredDataFromText<T>(
texts: string[],
dataSchema: object,
instructions: string,
categories?: object[]
): Promise<T & { tokenUsage?: TokenUsageInfo }> {
try {
const prompt = `
${instructions}
Extract information from the following text according to this JSON schema:
${JSON.stringify(dataSchema, null, 2)}
Your response should be valid JSON that matches this schema.
Text content:
${texts.join('\n\n')}
`
const model = this.config.model || 'gpt-4o'
const isOSeriesModel = O_SERIES_MODELS.includes(model)
// Create the request parameters based on the model
const requestParams: any = {
model: model,
response_format: { type: 'json_object' },
messages: [
{
role: 'system',
content: prompt,
},
],
}
// Add appropriate parameters based on model series
if (isOSeriesModel) {
requestParams.max_completion_tokens = this.config.maxTokens || 4096
} else {
requestParams.temperature = this.config.temperature || 0
requestParams.max_tokens = this.config.maxTokens || 4096
}
const completion = await this.openai.chat.completions.create(
requestParams
)
const responseText = completion.choices[0]?.message?.content || '{}'
// Extract token usage information
const promptTokens = completion.usage?.prompt_tokens || 0
const completionTokens = completion.usage?.completion_tokens || 0
const totalTokens = completion.usage?.total_tokens || 0
// Calculate estimated cost
const estimatedCost = this.calculateCost(
promptTokens,
completionTokens,
model
)
// Create token usage object
const tokenUsage: TokenUsageInfo = {
promptTokens,
completionTokens,
totalTokens,
estimatedCost,
}
try {
let fixedJson
try {
fixedJson = jsonrepair(responseText)
} catch (err) {
console.error('❌ Could not repair JSON:', err)
throw new Error(`AI returned invalid JSON: ${err}`)
}
const parsedJson = JSON.parse(fixedJson)
return {
...replaceUUIDv4Placeholders(parsedJson),
tokenUsage,
}
} catch (jsonError) {
console.error('Error parsing JSON from OpenAI response:', jsonError)
throw jsonError
}
} catch (error) {
console.error('Error extracting structured data with OpenAI:', error)
throw error
}
}
getModelInfo(): { provider: string; model: string } {
return {
provider: 'openai',
model: this.config.model || 'gpt-4o',
}
}
}