@thecodingwhale/cv-processor
Version:
CV Processor to extract structured data from PDF resumes using TypeScript
543 lines (472 loc) • 16.4 kB
text/typescript
import {
BedrockRuntimeClient,
ConverseCommand,
} from '@aws-sdk/client-bedrock-runtime'
import { jsonrepair } from 'jsonrepair'
import { AIModelConfig, AIProvider, TokenUsageInfo } from '../types/AIProvider'
import { replaceUUIDv4Placeholders } from '../utils/data'
import { transformCredits } from '../utils/filtering'
export interface AWSBedrockConfig extends Omit<AIModelConfig, 'apiKey'> {
region?: string
accessKeyId?: string
secretAccessKey?: string
apiKey?: string
}
/**
* Pricing information for AWS Bedrock models (USD per 1K tokens)
*/
interface ModelPricing {
input: number
output: number
}
const AWS_BEDROCK_PRICING: Record<string, ModelPricing> = {
// Amazon models
'apac.amazon.nova-micro-v1:0': { input: 0.000035, output: 0.00014 },
'apac.amazon.nova-lite-v1:0': { input: 0.00006, output: 0.00024 },
'apac.amazon.nova-v1:0': { input: 0.0008, output: 0.0032 },
'apac.amazon.nova-v1-premium:0': { input: 0.001, output: 0.004 },
'apac.amazon.nova-premier-v1:0': { input: 0.0025, output: 0.0125 },
// Default
default: { input: 0.00006, output: 0.00024 },
}
export class AWSBedrockProvider implements AIProvider {
private client: BedrockRuntimeClient
private config: AWSBedrockConfig
constructor(config: AWSBedrockConfig) {
this.config = config
console.log(
`[AWSBedrockProvider] Initializing with model: ${
config.model || 'apac.amazon.nova-micro-v1:0'
}`
)
// Allow more flexible authentication methods
const credentials: any = {}
// Option 1: Use accessKeyId and secretAccessKey from config if provided
if (config.accessKeyId && config.secretAccessKey) {
console.log('[AWSBedrockProvider] Using credentials from config')
credentials.accessKeyId = config.accessKeyId
credentials.secretAccessKey = config.secretAccessKey
}
// Option 2: Use environment variables if available
else if (
process.env.AWS_ACCESS_KEY_ID &&
process.env.AWS_SECRET_ACCESS_KEY
) {
console.log(
'[AWSBedrockProvider] Using credentials from environment variables'
)
credentials.accessKeyId = process.env.AWS_ACCESS_KEY_ID
credentials.secretAccessKey = process.env.AWS_SECRET_ACCESS_KEY
}
// Option 3: Use general apiKey if provided (for compatibility with the common interface)
else if (config.apiKey) {
console.log('[AWSBedrockProvider] Using apiKey as accessKeyId')
credentials.accessKeyId = config.apiKey
// Try to use AWS_SECRET_ACCESS_KEY from environment if available
credentials.secretAccessKey = process.env.AWS_SECRET_ACCESS_KEY || ''
}
// Option 4: Let AWS SDK handle credentials from ~/.aws/credentials
else {
console.log(
'[AWSBedrockProvider] No explicit credentials provided, using default AWS profile'
)
}
const awsConfig = {
region: config.region || process.env.AWS_REGION || 'us-east-1',
credentials:
Object.keys(credentials).length > 0 ? credentials : undefined,
}
this.client = new BedrockRuntimeClient(awsConfig)
}
/**
* Calculate estimated cost based on token usage and model
*/
private calculateCost(
promptTokens: number,
completionTokens: number,
model: string
): number {
// First try to match by specific model name
let pricing = AWS_BEDROCK_PRICING[model]
// If not found, try to match by partial model name
if (!pricing) {
const matchingKey = Object.keys(AWS_BEDROCK_PRICING).find((key) =>
model.toLowerCase().includes(key.toLowerCase())
)
pricing = matchingKey
? AWS_BEDROCK_PRICING[matchingKey]
: AWS_BEDROCK_PRICING['default']
}
const inputCost = (promptTokens / 1000) * pricing.input
const outputCost = (completionTokens / 1000) * pricing.output
return inputCost + outputCost
}
/**
* Estimate token count based on text content
*/
private estimateTokenCount(text: string): number {
// Simple estimation: ~4 characters per token for English text
return Math.ceil(text.length / 4)
}
/**
* Extract all searchable terms from categories (names + keywords)
*/
private extractCategoryTerms(categories?: object[]): string[] {
if (!categories || categories.length === 0) {
return []
}
const terms: string[] = []
for (const category of categories) {
const cat = category as any
// Add category name if it exists
if (cat.name && typeof cat.name === 'string') {
terms.push(cat.name.toLowerCase())
}
// Add keywords if they exist
if (cat.keywords && Array.isArray(cat.keywords)) {
for (const keyword of cat.keywords) {
if (typeof keyword === 'string') {
terms.push(keyword.toLowerCase())
}
}
}
}
return terms
}
/**
* Check if text contains any category-related terms (case-insensitive, substring matching)
*/
private textMatchesCategories(
text: string,
categoryTerms: string[]
): boolean {
if (categoryTerms.length === 0) {
return true // If no categories provided, include all texts
}
const lowerText = text.toLowerCase()
return categoryTerms.some((term) => lowerText.includes(term))
}
/**
* Filter texts to only include those that match category terms
*/
private filterTextsByCategories(
texts: string[],
categories?: object[]
): string[] {
const categoryTerms = this.extractCategoryTerms(categories)
if (categoryTerms.length === 0) {
console.log(
'[AWSBedrockProvider] No category terms found, including all texts'
)
return texts
}
console.log(
`[AWSBedrockProvider] Filtering texts using ${categoryTerms.length} category terms`
)
console.log(
'[AWSBedrockProvider] Category terms:',
categoryTerms.join(', ')
)
const filteredTexts = texts.filter((text) =>
this.textMatchesCategories(text, categoryTerms)
)
console.log(
`[AWSBedrockProvider] Filtered ${texts.length} texts down to ${filteredTexts.length} matching texts`
)
return filteredTexts
}
async extractStructuredDataFromImages<T>(
imageUrls: string[],
dataSchema: object,
instructions: string
): Promise<T & { tokenUsage?: TokenUsageInfo }> {
try {
console.log(`[AWSBedrockProvider] Processing ${imageUrls.length} images`)
console.log(
`[AWSBedrockProvider] Extracting structured data with AWS Bedrock`
)
const modelId = this.config.model || 'apac.amazon.nova-lite-v1:0'
const prompt = `
${instructions}
Extract information from the following images according to this JSON schema:
${JSON.stringify(dataSchema, null, 2)}
Your response should be valid JSON that matches this schema.
IMPORTANT: Return ONLY the JSON object, with no additional text or markdown formatting.
`
// Create content array with the first item being the text prompt
const content: any[] = [{ text: prompt }]
// Add image blocks for each valid image URL
for (const imageUrl of imageUrls) {
try {
if (!imageUrl.startsWith('data:')) {
console.warn(
`[AWSBedrockProvider] Invalid image URL format: ${imageUrl.substring(
0,
20
)}...`
)
continue
}
// Extract MIME type and base64 content
const match = imageUrl.match(/^data:image\/([a-zA-Z]+);base64,(.*)$/)
if (!match) {
console.warn(`[AWSBedrockProvider] Could not parse image data URL`)
continue
}
const format = match[1].toLowerCase()
const base64Data = match[2]
// Convert base64 to binary (Uint8Array)
const binaryData = Buffer.from(base64Data, 'base64')
// Ensure format is one of the supported formats by the API
const apiFormat = format === 'jpg' ? 'jpeg' : format
if (!['png', 'jpeg', 'gif', 'webp'].includes(apiFormat)) {
console.warn(
`[AWSBedrockProvider] Unsupported image format: ${format}`
)
continue
}
console.log(
`[AWSBedrockProvider] Adding image (${apiFormat}, ${binaryData.length} bytes)`
)
// Add image block to content array using type assertion
content.push({
image: {
format: apiFormat as 'png' | 'jpeg' | 'gif' | 'webp',
source: {
bytes: binaryData,
},
},
})
} catch (err) {
console.error(`[AWSBedrockProvider] Error processing image: ${err}`)
}
}
// Create the command using the Converse API format
const command = new ConverseCommand({
modelId: modelId,
messages: [
{
role: 'user',
content: content,
},
],
inferenceConfig: {
maxTokens: this.config.maxTokens || 4096,
temperature: this.config.temperature || 0,
topP: 0.9,
},
})
console.log(`[AWSBedrockProvider] Sending request to model ${modelId}`)
const response = await this.client.send(command)
// Extract text from the response
let responseText = ''
if (
response.output &&
response.output.message &&
response.output.message.content
) {
for (const content of response.output.message.content) {
if (content.text) {
responseText += content.text
}
}
} else {
console.warn(
'[AWSBedrockProvider] Unexpected response structure:',
response
)
responseText = JSON.stringify(response)
}
console.log(
`[AWSBedrockProvider] Response received (length: ${responseText.length})`
)
console.log(
'[AWSBedrockProvider] Response preview:',
responseText.substring(0, 200)
)
// AWS Bedrock doesn't always provide token usage in a standard way
// Use estimation for token usage
const promptTokens = this.estimateTokenCount(
prompt + JSON.stringify(imageUrls)
)
const completionTokens = this.estimateTokenCount(responseText)
const totalTokens = promptTokens + completionTokens
// Calculate estimated cost
const estimatedCost = this.calculateCost(
promptTokens,
completionTokens,
modelId
)
// Create token usage object
const tokenUsage: TokenUsageInfo = {
promptTokens,
completionTokens,
totalTokens,
estimatedCost,
}
try {
let fixedJson
try {
fixedJson = jsonrepair(responseText)
} catch (err) {
try {
fixedJson = jsonrepair(responseText)
} catch (err) {
console.error('❌ Could not repair JSON:', err)
throw new Error(`AI returned invalid JSON: ${err}`)
}
}
const parsedJson = JSON.parse(fixedJson)
return {
...replaceUUIDv4Placeholders(parsedJson),
tokenUsage,
}
} catch (jsonError) {
console.error(
'[AWSBedrockProvider] Error parsing JSON response:',
jsonError
)
console.error('[AWSBedrockProvider] Raw response:', responseText)
throw new Error('Failed to parse AI response as JSON')
}
} catch (error) {
console.error('Error extracting structured data with AWS Bedrock:', error)
throw error
}
}
async extractStructuredDataFromText<T>(
texts: string[],
dataSchema: object,
instructions: string,
categories?: object[]
): Promise<T & { tokenUsage?: TokenUsageInfo }> {
try {
// Pre-cleanup: Filter texts to match expected categories
const filteredTexts = this.filterTextsByCategories(texts, categories)
// Early exit if no texts match categories
if (filteredTexts.length === 0) {
console.warn(
'[AWSBedrockProvider] No texts match the expected categories after filtering'
)
// Return empty result with zero token usage
const emptyResult = {} as T & {
tokenUsage?: TokenUsageInfo
resume: []
resume_show_years: false
empty_result: true
}
emptyResult.tokenUsage = {
promptTokens: 0,
completionTokens: 0,
totalTokens: 0,
estimatedCost: 0,
}
emptyResult.resume = []
emptyResult.resume_show_years = false
emptyResult.empty_result = true
return emptyResult
}
const modelId = this.config.model || 'apac.amazon.nova-lite-v1:0'
const prompt = `
Pre defined categories that will be used to pull the category_id from:
${JSON.stringify(categories, null, 2)}
${instructions}
Extract information from the following text according to this JSON schema:
${JSON.stringify(dataSchema, null, 2)}
Your response should be valid JSON that matches this schema.
IMPORTANT: Return ONLY the JSON object, with no additional text or markdown formatting.
Text content:
${filteredTexts.join('\n\n')}
`
// Create the command using the Converse API format
const command = new ConverseCommand({
modelId: modelId,
messages: [
{
role: 'user',
content: [{ text: prompt }],
},
],
inferenceConfig: {
maxTokens: this.config.maxTokens || 4096,
temperature: this.config.temperature || 0,
topP: 0.9,
},
})
console.log(`[AWSBedrockProvider] Sending request to model ${modelId}`)
const response = await this.client.send(command)
// Extract text from the response
let responseText = ''
if (
response.output &&
response.output.message &&
response.output.message.content
) {
for (const content of response.output.message.content) {
if (content.text) {
responseText += content.text
}
}
} else {
console.warn(
'[AWSBedrockProvider] Unexpected response structure:',
response
)
responseText = JSON.stringify(response)
}
// AWS Bedrock doesn't always provide token usage in a standard way
// Use estimation for token usage
const promptTokens = this.estimateTokenCount(prompt)
const completionTokens = this.estimateTokenCount(responseText)
const totalTokens = promptTokens + completionTokens
// Calculate estimated cost
const estimatedCost = this.calculateCost(
promptTokens,
completionTokens,
modelId
)
// Create token usage object
const tokenUsage: TokenUsageInfo = {
promptTokens,
completionTokens,
totalTokens,
estimatedCost,
}
try {
let fixedJson
try {
fixedJson = jsonrepair(responseText)
} catch (err) {
console.error('❌ Could not repair JSON:', err)
throw new Error(`AI returned invalid JSON: ${err}`)
}
let parsedJson = JSON.parse(fixedJson)
console.log(`[AWSBedrockProvider] old parsedJson: ${parsedJson}`)
if (parsedJson.credits) {
parsedJson = transformCredits(parsedJson)
}
console.log(`[AWSBedrockProvider] new parsedJson: ${parsedJson}`)
return {
...replaceUUIDv4Placeholders(parsedJson),
tokenUsage,
}
} catch (jsonError) {
console.error(
'[AWSBedrockProvider] Error parsing JSON response:',
jsonError
)
console.error('[AWSBedrockProvider] Raw response:', responseText)
throw new Error('Failed to parse AI response as JSON')
}
} catch (error) {
console.error('Error extracting structured data with AWS Bedrock:', error)
throw error
}
}
getModelInfo(): { provider: string; model: string } {
return {
provider: 'aws',
model: this.config.model || 'apac.amazon.nova-lite-v1:0',
}
}
}