UNPKG

@thecodingwhale/cv-processor

Version:

CV Processor to extract structured data from PDF resumes using TypeScript

382 lines (375 loc) 17.4 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.AWSBedrockProvider = void 0;
const client_bedrock_runtime_1 = require("@aws-sdk/client-bedrock-runtime");
const jsonrepair_1 = require("jsonrepair");
const data_1 = require("../utils/data");
const filtering_1 = require("../utils/filtering");

/**
 * Pricing table keyed by Bedrock model id. `calculateCost` divides token
 * counts by 1000 before multiplying, so each figure is a price per 1000
 * tokens (currency is not stated in this file — presumably USD; confirm).
 */
const AWS_BEDROCK_PRICING = {
    // Amazon models
    'apac.amazon.nova-micro-v1:0': { input: 0.000035, output: 0.00014 },
    'apac.amazon.nova-lite-v1:0': { input: 0.00006, output: 0.00024 },
    'apac.amazon.nova-v1:0': { input: 0.0008, output: 0.0032 },
    'apac.amazon.nova-v1-premium:0': { input: 0.001, output: 0.004 },
    'apac.amazon.nova-premier-v1:0': { input: 0.0025, output: 0.0125 },
    // Default
    default: { input: 0.00006, output: 0.00024 },
};

/** Model used whenever `config.model` is not set. */
const DEFAULT_MODEL_ID = 'apac.amazon.nova-lite-v1:0';

/** Matches `data:image/<format>;base64,<payload>` and captures format and payload. */
const IMAGE_DATA_URL_PATTERN = /^data:image\/([a-zA-Z]+);base64,(.*)$/;

/** Image formats forwarded to the Converse API by this provider. */
const SUPPORTED_IMAGE_FORMATS = ['png', 'jpeg', 'gif', 'webp'];

/**
 * Concatenate every text part of a Converse response message.
 * Falls back to the serialized response when the expected structure is absent.
 */
function collectResponseText(response) {
    const parts = response.output && response.output.message && response.output.message.content;
    if (!parts) {
        console.warn('[AWSBedrockProvider] Unexpected response structure:', response);
        return JSON.stringify(response);
    }
    let text = '';
    for (const part of parts) {
        if (part.text) {
            text += part.text;
        }
    }
    return text;
}

/**
 * Build the token usage record for one request.
 * Uses the token counts reported in `response.usage` when both are numbers;
 * otherwise estimates them from the prompt and response text.
 * `getPromptText` is a thunk so the fallback text is only built when needed.
 */
function resolveTokenUsage(provider, response, getPromptText, responseText, modelId) {
    const reported = response && response.usage;
    const hasReported = Boolean(reported) &&
        typeof reported.inputTokens === 'number' &&
        typeof reported.outputTokens === 'number';
    const promptTokens = hasReported
        ? reported.inputTokens
        : provider.estimateTokenCount(getPromptText());
    const completionTokens = hasReported
        ? reported.outputTokens
        : provider.estimateTokenCount(responseText);
    const totalTokens = promptTokens + completionTokens;
    const estimatedCost = provider.calculateCost(promptTokens, completionTokens, modelId);
    return {
        promptTokens,
        completionTokens,
        totalTokens,
        estimatedCost,
    };
}

/** Run `jsonrepair` on the model output, converting a failure into a descriptive Error. */
function repairModelJson(responseText) {
    try {
        return (0, jsonrepair_1.jsonrepair)(responseText);
    }
    catch (err) {
        console.error('❌ Could not repair JSON:', err);
        throw new Error(`AI returned invalid JSON: ${err}`);
    }
}

/**
 * Provider that sends extraction prompts to AWS Bedrock through the Converse
 * API and returns the parsed JSON answer together with token usage.
 */
class AWSBedrockProvider {
    /**
     * @param {object} config - Provider settings. Fields read in this file:
     *   `model`, `region`, `accessKeyId`, `secretAccessKey`, `apiKey`,
     *   `maxTokens`, `temperature`.
     */
    constructor(config) {
        this.config = config;
        console.log(`[AWSBedrockProvider] Initializing with model: ${config.model || DEFAULT_MODEL_ID}`);
        // Allow more flexible authentication methods
        const credentials = {};
        // Option 1: Use accessKeyId and secretAccessKey from config if provided
        if (config.accessKeyId && config.secretAccessKey) {
            console.log('[AWSBedrockProvider] Using credentials from config');
            credentials.accessKeyId = config.accessKeyId;
            credentials.secretAccessKey = config.secretAccessKey;
        }
        // Option 2: Use environment variables if available
        else if (process.env.AWS_ACCESS_KEY_ID && process.env.AWS_SECRET_ACCESS_KEY) {
            console.log('[AWSBedrockProvider] Using credentials from environment variables');
            credentials.accessKeyId = process.env.AWS_ACCESS_KEY_ID;
            credentials.secretAccessKey = process.env.AWS_SECRET_ACCESS_KEY;
            // Temporary credentials are only valid together with their session token.
            if (process.env.AWS_SESSION_TOKEN) {
                credentials.sessionToken = process.env.AWS_SESSION_TOKEN;
            }
        }
        // Option 3: Use general apiKey if provided (for compatibility with the common interface)
        else if (config.apiKey) {
            console.log('[AWSBedrockProvider] Using apiKey as accessKeyId');
            credentials.accessKeyId = config.apiKey;
            // Try to use AWS_SECRET_ACCESS_KEY from environment if available
            credentials.secretAccessKey = process.env.AWS_SECRET_ACCESS_KEY || '';
            if (!credentials.secretAccessKey) {
                console.warn('[AWSBedrockProvider] AWS_SECRET_ACCESS_KEY is not set; secretAccessKey is empty');
            }
        }
        // Option 4: Let AWS SDK handle credentials from ~/.aws/credentials
        else {
            console.log('[AWSBedrockProvider] No explicit credentials provided, using default AWS profile');
        }
        const awsConfig = {
            region: config.region || process.env.AWS_REGION || 'us-east-1',
            // Passing undefined leaves credential resolution to the SDK.
            credentials: Object.keys(credentials).length > 0 ? credentials : undefined,
        };
        this.client = new client_bedrock_runtime_1.BedrockRuntimeClient(awsConfig);
    }

    /**
     * Calculate estimated cost based on token usage and model.
     * Looks the model up by exact id, then by substring match against the
     * pricing keys, and finally falls back to the `default` entry.
     * @param {number} promptTokens
     * @param {number} completionTokens
     * @param {string} model
     * @returns {number} input cost plus output cost
     */
    calculateCost(promptTokens, completionTokens, model) {
        // Own-property check so ids such as "toString" never resolve to Object.prototype members.
        let pricing = Object.prototype.hasOwnProperty.call(AWS_BEDROCK_PRICING, model)
            ? AWS_BEDROCK_PRICING[model]
            : undefined;
        if (!pricing) {
            const loweredModel = model.toLowerCase();
            const matchingKey = Object.keys(AWS_BEDROCK_PRICING).find((key) => loweredModel.includes(key.toLowerCase()));
            pricing = matchingKey
                ? AWS_BEDROCK_PRICING[matchingKey]
                : AWS_BEDROCK_PRICING['default'];
        }
        const inputCost = (promptTokens / 1000) * pricing.input;
        const outputCost = (completionTokens / 1000) * pricing.output;
        return inputCost + outputCost;
    }

    /**
     * Estimate token count based on text content.
     * @param {string} text
     * @returns {number} text length divided by four, rounded up
     */
    estimateTokenCount(text) {
        // Simple estimation: ~4 characters per token for English text
        return Math.ceil(text.length / 4);
    }

    /**
     * Extract all searchable terms from categories (names + keywords).
     * Terms are lower-cased; blank strings are skipped because an empty term
     * is a substring of every text and would disable filtering.
     * @param {Array<object>} categories
     * @returns {string[]}
     */
    extractCategoryTerms(categories) {
        if (!categories || categories.length === 0) {
            return [];
        }
        const isUsableTerm = (value) => typeof value === 'string' && value.trim() !== '';
        const terms = [];
        for (const category of categories) {
            if (isUsableTerm(category.name)) {
                terms.push(category.name.toLowerCase());
            }
            if (Array.isArray(category.keywords)) {
                for (const keyword of category.keywords) {
                    if (isUsableTerm(keyword)) {
                        terms.push(keyword.toLowerCase());
                    }
                }
            }
        }
        return terms;
    }

    /**
     * Check if text contains any category-related terms (case-insensitive, substring matching).
     * @param {string} text
     * @param {string[]} categoryTerms - lower-cased terms
     * @returns {boolean} true when any term occurs in the text, or when there are no terms
     */
    textMatchesCategories(text, categoryTerms) {
        if (categoryTerms.length === 0) {
            return true; // If no categories provided, include all texts
        }
        const lowerText = text.toLowerCase();
        return categoryTerms.some((term) => lowerText.includes(term));
    }

    /**
     * Filter texts to only include those that match category terms.
     * @param {string[]} texts
     * @param {Array<object>} categories
     * @returns {string[]} the input array itself when no terms exist, otherwise a filtered copy
     */
    filterTextsByCategories(texts, categories) {
        const categoryTerms = this.extractCategoryTerms(categories);
        if (categoryTerms.length === 0) {
            console.log('[AWSBedrockProvider] No category terms found, including all texts');
            return texts;
        }
        console.log(`[AWSBedrockProvider] Filtering texts using ${categoryTerms.length} category terms`);
        console.log('[AWSBedrockProvider] Category terms:', categoryTerms.join(', '));
        const filteredTexts = texts.filter((text) => this.textMatchesCategories(text, categoryTerms));
        console.log(`[AWSBedrockProvider] Filtered ${texts.length} texts down to ${filteredTexts.length} matching texts`);
        return filteredTexts;
    }

    /**
     * Send a prompt plus base64 data-URL images to the model and parse its JSON answer.
     * Images that are not `data:` URLs, cannot be parsed, or use an unsupported
     * format are skipped with a warning.
     * @param {string[]} imageUrls - `data:image/<format>;base64,...` URLs
     * @param {object} dataSchema - schema embedded in the prompt
     * @param {string} instructions - text placed at the start of the prompt
     * @returns {Promise<object>} parsed result (passed through `replaceUUIDv4Placeholders`) plus `tokenUsage`
     * @throws {Error} when the request fails or the answer cannot be parsed as JSON
     */
    async extractStructuredDataFromImages(imageUrls, dataSchema, instructions) {
        try {
            console.log(`[AWSBedrockProvider] Processing ${imageUrls.length} images`);
            console.log(`[AWSBedrockProvider] Extracting structured data with AWS Bedrock`);
            const modelId = this.config.model || DEFAULT_MODEL_ID;
            const prompt = `
        ${instructions}

        Extract information from the following images according to this JSON schema:
        ${JSON.stringify(dataSchema, null, 2)}

        Your response should be valid JSON that matches this schema.
        IMPORTANT: Return ONLY the JSON object, with no additional text or markdown formatting.
      `;
            // Create content array with the first item being the text prompt
            const content = [{ text: prompt }];
            // Add image blocks for each valid image URL
            for (const imageUrl of imageUrls) {
                try {
                    if (!imageUrl.startsWith('data:')) {
                        console.warn(`[AWSBedrockProvider] Invalid image URL format: ${imageUrl.substring(0, 20)}...`);
                        continue;
                    }
                    // Extract MIME type and base64 content
                    const match = imageUrl.match(IMAGE_DATA_URL_PATTERN);
                    if (!match) {
                        console.warn(`[AWSBedrockProvider] Could not parse image data URL`);
                        continue;
                    }
                    const format = match[1].toLowerCase();
                    // Convert base64 to binary
                    const binaryData = Buffer.from(match[2], 'base64');
                    // Ensure format is one of the supported formats by the API
                    const apiFormat = format === 'jpg' ? 'jpeg' : format;
                    if (!SUPPORTED_IMAGE_FORMATS.includes(apiFormat)) {
                        console.warn(`[AWSBedrockProvider] Unsupported image format: ${format}`);
                        continue;
                    }
                    console.log(`[AWSBedrockProvider] Adding image (${apiFormat}, ${binaryData.length} bytes)`);
                    content.push({
                        image: {
                            format: apiFormat,
                            source: {
                                bytes: binaryData,
                            },
                        },
                    });
                }
                catch (err) {
                    // One bad image must not abort the whole request.
                    console.error(`[AWSBedrockProvider] Error processing image: ${err}`);
                }
            }
            // Create the command using the Converse API format
            const command = new client_bedrock_runtime_1.ConverseCommand({
                modelId: modelId,
                messages: [
                    {
                        role: 'user',
                        content: content,
                    },
                ],
                inferenceConfig: {
                    maxTokens: this.config.maxTokens || 4096,
                    temperature: this.config.temperature || 0,
                    topP: 0.9,
                },
            });
            console.log(`[AWSBedrockProvider] Sending request to model ${modelId}`);
            const response = await this.client.send(command);
            const responseText = collectResponseText(response);
            console.log(`[AWSBedrockProvider] Response received (length: ${responseText.length})`);
            console.log('[AWSBedrockProvider] Response preview:', responseText.substring(0, 200));
            // Prefer the usage reported by the service; the fallback estimate
            // counts the data-URL characters and is therefore only a rough figure.
            const tokenUsage = resolveTokenUsage(this, response, () => prompt + JSON.stringify(imageUrls), responseText, modelId);
            try {
                const parsedJson = JSON.parse(repairModelJson(responseText));
                return {
                    ...(0, data_1.replaceUUIDv4Placeholders)(parsedJson),
                    tokenUsage,
                };
            }
            catch (jsonError) {
                console.error('[AWSBedrockProvider] Error parsing JSON response:', jsonError);
                console.error('[AWSBedrockProvider] Raw response:', responseText);
                throw new Error('Failed to parse AI response as JSON', { cause: jsonError });
            }
        }
        catch (error) {
            console.error('Error extracting structured data with AWS Bedrock:', error);
            throw error;
        }
    }

    /**
     * Send category-filtered texts to the model and parse its JSON answer.
     * When no text matches the categories, no request is made and an empty
     * result with zero token usage is returned.
     * @param {string[]} texts
     * @param {object} dataSchema - schema embedded in the prompt
     * @param {string} instructions - text embedded in the prompt
     * @param {Array<object>} categories - used for pre-filtering and embedded in the prompt
     * @returns {Promise<object>} parsed result plus `tokenUsage`
     * @throws {Error} when the request fails or the answer cannot be parsed as JSON
     */
    async extractStructuredDataFromText(texts, dataSchema, instructions, categories) {
        try {
            // Pre-cleanup: Filter texts to match expected categories
            const filteredTexts = this.filterTextsByCategories(texts, categories);
            // Early exit if no texts match categories
            if (filteredTexts.length === 0) {
                console.warn('[AWSBedrockProvider] No texts match the expected categories after filtering');
                // Return empty result with zero token usage
                return {
                    tokenUsage: {
                        promptTokens: 0,
                        completionTokens: 0,
                        totalTokens: 0,
                        estimatedCost: 0,
                    },
                    resume: [],
                    resume_show_years: false,
                    empty_result: true,
                };
            }
            const modelId = this.config.model || DEFAULT_MODEL_ID;
            const prompt = `
        Pre defined categories that will be used to pull the category_id from:
        ${JSON.stringify(categories, null, 2)}

        ${instructions}

        Extract information from the following text according to this JSON schema:
        ${JSON.stringify(dataSchema, null, 2)}

        Your response should be valid JSON that matches this schema.
        IMPORTANT: Return ONLY the JSON object, with no additional text or markdown formatting.

        Text content:
        ${filteredTexts.join('\n\n')}
      `;
            // Create the command using the Converse API format
            const command = new client_bedrock_runtime_1.ConverseCommand({
                modelId: modelId,
                messages: [
                    {
                        role: 'user',
                        content: [{ text: prompt }],
                    },
                ],
                inferenceConfig: {
                    maxTokens: this.config.maxTokens || 4096,
                    temperature: this.config.temperature || 0,
                    topP: 0.9,
                },
            });
            console.log(`[AWSBedrockProvider] Sending request to model ${modelId}`);
            const response = await this.client.send(command);
            const responseText = collectResponseText(response);
            // Prefer the usage reported by the service; estimate only as a fallback.
            const tokenUsage = resolveTokenUsage(this, response, () => prompt, responseText, modelId);
            try {
                let parsedJson = JSON.parse(repairModelJson(responseText));
                // Serialize explicitly: interpolating an object prints "[object Object]".
                console.log(`[AWSBedrockProvider] old parsedJson: ${JSON.stringify(parsedJson)}`);
                if (parsedJson.credits) {
                    parsedJson = (0, filtering_1.transformCredits)(parsedJson);
                }
                console.log(`[AWSBedrockProvider] new parsedJson: ${JSON.stringify(parsedJson)}`);
                return {
                    ...(0, data_1.replaceUUIDv4Placeholders)(parsedJson),
                    tokenUsage,
                };
            }
            catch (jsonError) {
                console.error('[AWSBedrockProvider] Error parsing JSON response:', jsonError);
                console.error('[AWSBedrockProvider] Raw response:', responseText);
                throw new Error('Failed to parse AI response as JSON', { cause: jsonError });
            }
        }
        catch (error) {
            console.error('Error extracting structured data with AWS Bedrock:', error);
            throw error;
        }
    }

    /**
     * Describe the provider and the model it will use.
     * @returns {{provider: string, model: string}}
     */
    getModelInfo() {
        return {
            provider: 'aws',
            model: this.config.model || DEFAULT_MODEL_ID,
        };
    }
}
exports.AWSBedrockProvider = AWSBedrockProvider;