// @thecodingwhale/cv-processor
// Version:
// CV Processor to extract structured data from PDF resumes using TypeScript
// 487 lines (486 loc) • 21.6 kB
// JavaScript
;
// TypeScript interop helper: exposes property `k` of module `m` on object `o`
// under the name `k2` (defaults to `k`). A previously installed helper on
// `this` wins; otherwise an implementation is chosen once, depending on
// whether the runtime provides Object.create.
var __createBinding = (this && this.__createBinding) || (function () {
    if (!Object.create) {
        // Legacy runtime: plain value copy, no live binding.
        return function (o, m, k, k2) {
            var legacyName = k2 === undefined ? k : k2;
            o[legacyName] = m[k];
        };
    }
    return function (o, m, k, k2) {
        var exportName = k2 === undefined ? k : k2;
        var desc = Object.getOwnPropertyDescriptor(m, k);
        // The source descriptor is reused only when it is an accessor on a
        // module flagged __esModule, or a data property that is neither
        // writable nor configurable. Everything else gets a forwarding getter
        // so later changes on `m` stay visible through `o`.
        var reuseDescriptor;
        if (!desc) {
            reuseDescriptor = false;
        }
        else if ("get" in desc) {
            reuseDescriptor = Boolean(m.__esModule);
        }
        else {
            reuseDescriptor = !desc.writable && !desc.configurable;
        }
        if (!reuseDescriptor) {
            desc = { enumerable: true, get: function () { return m[k]; } };
        }
        Object.defineProperty(o, exportName, desc);
    };
})();
// TypeScript interop helper: stores `v` as the "default" member of namespace
// object `o`. A previously installed helper on `this` wins; otherwise the
// implementation is picked once based on whether Object.create exists.
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    // Modern path: enumerable "default" defined through a property descriptor.
    var defineDefault = function (o, v) {
        Object.defineProperty(o, "default", { enumerable: true, value: v });
    };
    // Legacy path: ordinary assignment.
    var assignDefault = function (o, v) {
        o["default"] = v;
    };
    return Object.create ? defineDefault : assignDefault;
})();
// TypeScript interop helper emulating `import * as ns from "mod"` for
// CommonJS modules. Modules flagged __esModule are returned untouched; any
// other value is wrapped in a fresh object that re-exposes every own property
// except "default" and carries the module itself as `default`.
var __importStar = (this && this.__importStar) || (function () {
    // Resolves the key-listing strategy on first use, then replaces itself
    // with the resolved function for subsequent calls.
    var listOwnKeys = function (target) {
        listOwnKeys = Object.getOwnPropertyNames || function (obj) {
            var names = [];
            for (var name in obj) {
                if (Object.prototype.hasOwnProperty.call(obj, name)) {
                    names[names.length] = name;
                }
            }
            return names;
        };
        return listOwnKeys(target);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            var keys = listOwnKeys(mod);
            for (var idx = 0; idx < keys.length; idx++) {
                if (keys[idx] === "default") {
                    continue;
                }
                __createBinding(namespace, mod, keys[idx]);
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
// Flag this CommonJS module with __esModule (the marker the interop helpers above check).
Object.defineProperty(exports, "__esModule", { value: true });
// Placeholder so the export name exists early; the class itself is assigned at the end of the file.
exports.AICVProcessor = void 0;
// External packages and Node built-ins, loaded as namespace objects through __importStar.
const cheerio = __importStar(require("cheerio"));
const fs = __importStar(require("fs"));
const path = __importStar(require("path"));
// Provides `chromium`, used to launch a headless browser for URL fetching.
const playwright_1 = require("playwright");
// Project-local modules: ConversionType values, emptiness scoring, and report generation.
const AIProvider_1 = require("./types/AIProvider");
const EmptinessPercentageCalculator_1 = require("./utils/EmptinessPercentageCalculator");
const reportGenerator_1 = require("./utils/reportGenerator");
/**
 * AI-powered CV processor.
 *
 * Sends CV text (supplied by the caller, or scraped from a URL) to an AI
 * provider together with a JSON schema and instruction text, then decorates
 * the provider's result with run metadata (timing, model info, token usage).
 */
class AICVProcessor {
    /**
     * Initialize the AI CV processor.
     *
     * @param {object} aiProvider - Object exposing `extractStructuredDataFromText`
     *   and `getModelInfo` (both are called by this class).
     * @param {object} [options]
     * @param {boolean} [options.verbose=false] - Emit diagnostic logs.
     * @param {string} [options.instructionsPath] - Instruction file; defaults
     *   to `instructions.txt` in the current working directory.
     * @param {number} [options.expectedTotalFields] - Forwarded to the
     *   emptiness calculator in `processCv`.
     * @param {Array} [options.categories=[]] - Forwarded to the AI provider.
     */
    constructor(aiProvider, options = {}) {
        this.aiProvider = aiProvider;
        this.verbose = options.verbose || false;
        this.instructionsPath =
            options.instructionsPath || path.join(process.cwd(), 'instructions.txt');
        this.expectedTotalFields = options.expectedTotalFields;
        this.categories = options.categories || [];
        if (this.verbose) {
            console.log('AI CV Processor initialized');
            console.log(`Using instructions from: ${this.instructionsPath}`);
            if (this.expectedTotalFields) {
                console.log(`Expected total fields: ${this.expectedTotalFields}`);
            }
        }
    }
    /**
     * Validate that a string parses as an absolute http(s) URL.
     *
     * @param {string} url
     * @returns {boolean} `true` only for parseable `http:` / `https:` URLs.
     */
    isValidUrl(url) {
        try {
            const urlObj = new URL(url);
            return urlObj.protocol === 'http:' || urlObj.protocol === 'https:';
        }
        catch {
            return false;
        }
    }
    /**
     * Load instruction text from `this.instructionsPath`.
     *
     * @returns {Promise<string|null>} File contents, or `null` when the file
     *   is missing or cannot be read (the reason is logged).
     */
    async loadInstructions() {
        try {
            if (fs.existsSync(this.instructionsPath)) {
                const instructions = await fs.promises.readFile(this.instructionsPath, 'utf8');
                if (this.verbose) {
                    console.log(`Successfully loaded instructions from ${this.instructionsPath}`);
                }
                return instructions;
            }
            else {
                console.warn(`Instructions file not found: ${this.instructionsPath}`);
                return null;
            }
        }
        catch (error) {
            console.error(`Error loading instructions file: ${error}`);
            return null;
        }
    }
    /**
     * Estimate a token count from text length.
     * Fallback used when the AI provider reports no token usage.
     *
     * @param {string} text
     * @returns {number} `ceil(text.length / 4)`.
     */
    estimateTokenCount(text) {
        // Simple estimation: ~4 characters per token for English text
        return Math.ceil(text.length / 4);
    }
    /**
     * Build the JSON schema handed to the AI provider.
     * A fresh object is returned on every call so callers cannot share state.
     *
     * @private
     * @returns {object} Schema describing `{ credits: Array<object> }`.
     */
    buildDataSchema() {
        return {
            type: 'object',
            properties: {
                credits: {
                    type: 'array',
                    items: {
                        type: 'object',
                        properties: {
                            projectTitle: { type: 'string' },
                            type: { type: 'string' }, // e.g., 'Film', 'TV', 'Commercial', 'Theatre'
                            role: { type: 'string' },
                            productionCompany: { type: 'string' },
                            director: { type: 'string' },
                            year: { type: 'string' },
                            location: { type: 'string' },
                            link: { type: 'string' }, // optional trailer or scene
                        },
                    },
                },
            },
        };
    }
    /**
     * Normalize whitespace in extracted page text while keeping line breaks.
     *
     * Runs of horizontal whitespace become one space; blank lines and
     * whitespace around line breaks collapse to a single `\n`. (Collapsing
     * every `\s+` run first would also swallow the newlines and make any
     * later blank-line pass a no-op.)
     *
     * @private
     * @param {string} text
     * @returns {string} Trimmed, normalized text.
     */
    cleanExtractedText(text) {
        return text
            .replace(/[^\S\n]+/g, ' ') // any whitespace run without a newline -> one space
            .replace(/ ?\n[ \n]*/g, '\n') // drop blank lines and padding around newlines
            .trim();
    }
    /**
     * Extract candidate text from an HTML document using Cheerio.
     *
     * Non-content elements are removed, then text is gathered per element
     * type. If that yields fewer than 100 characters, the whole body text is
     * used instead.
     *
     * @private
     * @param {string} html
     * @returns {string} Raw (not yet whitespace-normalized) text.
     */
    extractTextFromHtml(html) {
        const $ = cheerio.load(html);
        if (this.verbose) {
            console.log(`HTML snippet: ${html.substring(0, 1000)}...`);
        }
        // Remove script, style, and other non-content elements
        $('script, style, noscript, iframe, nav, header, footer, aside, form').remove();
        const allText = $('body').text().trim();
        if (this.verbose) {
            console.log(`All body text (first 500 chars): ${allText.substring(0, 500)}...`);
        }
        const textElements = [
            'p',
            'div',
            'span',
            'h1',
            'h2',
            'h3',
            'h4',
            'h5',
            'h6',
            'li',
            'td',
            'th',
            'article',
            'section',
            'main',
            'content',
        ];
        // NOTE(review): `.text()` appears to include descendant text, so nested
        // matches (e.g. a <p> inside a <div>) are appended once per matching
        // ancestor, which likely inflates the prompt. Kept as-is so extraction
        // output stays stable - confirm before changing.
        let extractedText = '';
        for (const selector of textElements) {
            $(selector).each((_, element) => {
                const text = $(element).text().trim();
                if (text && text.length > 10) {
                    extractedText += text + '\n';
                }
            });
        }
        // If specific element extraction fails, use all body text
        if (!extractedText || extractedText.length < 100) {
            extractedText = allText;
            if (this.verbose) {
                console.log('Using all body text as specific element extraction yielded insufficient content');
            }
        }
        return extractedText;
    }
    /**
     * Attach run metadata and token usage to the provider result (in place).
     *
     * Provider-reported usage (`cvData.tokenUsage`) is preferred; otherwise
     * counts are estimated from `promptText` and the serialized result.
     *
     * @private
     * @param {object} cvData - Result object returned by the AI provider.
     * @param {number} processingTime - Elapsed seconds.
     * @param {*} conversionType - Value recorded as `metadata.conversionType`.
     * @param {string} promptText - Text used for the input-token estimate.
     */
    attachRunMetadata(cvData, processingTime, conversionType, promptText) {
        cvData.metadata = {
            processedDate: new Date().toISOString(),
            processingTime,
            conversionType,
            ...this.aiProvider.getModelInfo(),
        };
        if (cvData.tokenUsage) {
            cvData.metadata.tokenUsage = {
                inputTokens: cvData.tokenUsage.promptTokens,
                outputTokens: cvData.tokenUsage.completionTokens,
                totalTokens: cvData.tokenUsage.totalTokens,
                estimatedCost: cvData.tokenUsage.estimatedCost,
            };
            if (this.verbose) {
                console.log(`[AICVProcessor] Token usage:`, cvData.metadata.tokenUsage);
            }
            return;
        }
        // Estimate tokens if not provided by the AI provider
        const estimatedInputTokens = this.estimateTokenCount(promptText);
        const estimatedOutputTokens = this.estimateTokenCount(JSON.stringify(cvData));
        cvData.metadata.tokenUsage = {
            inputTokens: estimatedInputTokens,
            outputTokens: estimatedOutputTokens,
            totalTokens: estimatedInputTokens + estimatedOutputTokens,
        };
        if (this.verbose) {
            console.log(`[AICVProcessor] Estimated token usage:`, cvData.metadata.tokenUsage);
        }
    }
    /**
     * Fetch a web page, extract its text, and run AI extraction on it.
     *
     * @param {string} url - Absolute http(s) URL.
     * @returns {Promise<object>} Provider result with `metadata` attached.
     * @throws {Error} On an invalid URL, fetch failure, fewer than 20
     *   characters of extracted text, missing instructions, or provider
     *   failure. Errors are logged and rethrown.
     */
    async processUrlToTexts(url) {
        console.log(`Processing CV from URL: ${url}`);
        const startTime = Date.now();
        try {
            if (!this.isValidUrl(url)) {
                throw new Error(`Invalid URL provided: ${url}`);
            }
            if (this.verbose) {
                console.log(`Fetching content from URL: ${url}`);
            }
            // Use Playwright to fetch content with 5-second wait for dynamic content
            const html = await this.fetchUrlWithPlaywright(url);
            if (this.verbose) {
                console.log(`Fetched ${html.length} characters of HTML content`);
            }
            const cleanedText = this.cleanExtractedText(this.extractTextFromHtml(html));
            if (this.verbose) {
                console.log(`Extracted ${cleanedText.length} characters of clean text`);
                console.log(`Clean text preview (first 500 chars): ${cleanedText.substring(0, 500)}...`);
            }
            if (!cleanedText || cleanedText.length < 20) {
                throw new Error('Insufficient text content extracted from URL');
            }
            const dataSchema = this.buildDataSchema();
            const instructions = await this.loadInstructions();
            if (!instructions) {
                throw new Error('No instructions found');
            }
            const cvData = await this.aiProvider.extractStructuredDataFromText([cleanedText], dataSchema, instructions, this.categories);
            const processingTime = (Date.now() - startTime) / 1000;
            console.log(`[AICVProcessor] URL processing completed in ${processingTime.toFixed(2)} seconds`);
            this.attachRunMetadata(cvData, processingTime, AIProvider_1.ConversionType.UrlToTexts, instructions + cleanedText);
            return cvData;
        }
        catch (error) {
            console.error(`Error processing URL: ${error instanceof Error ? error.message : String(error)}`);
            throw error;
        }
    }
    /**
     * Close a Playwright page/browser without letting a close failure escape.
     * Cleanup problems are logged so they are not silently lost.
     *
     * @private
     * @param {{close: Function}|null} resource
     * @param {string} label - Used in the warning message.
     */
    async closeQuietly(resource, label) {
        if (!resource) {
            return;
        }
        try {
            await resource.close();
        }
        catch (closeError) {
            console.warn(`Failed to close Playwright ${label}: ${closeError instanceof Error ? closeError.message : String(closeError)}`);
        }
    }
    /**
     * Fetch URL content using Playwright with a 5-second wait for dynamic content.
     *
     * @param {string} url
     * @returns {Promise<string>} Serialized HTML of the loaded page.
     * @throws {Error} `Playwright failed: ...`, with the original error as `cause`.
     */
    async fetchUrlWithPlaywright(url) {
        let browser = null;
        let page = null;
        try {
            if (this.verbose) {
                console.log('Launching Playwright browser...');
            }
            browser = await playwright_1.chromium.launch({
                headless: true,
                args: ['--no-sandbox', '--disable-setuid-sandbox'],
            });
            page = await browser.newPage();
            // Send a desktop-browser User-Agent header with every request from this page
            await page.setExtraHTTPHeaders({
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            });
            if (this.verbose) {
                console.log('Navigating to URL...');
            }
            await page.goto(url, {
                waitUntil: 'domcontentloaded',
                timeout: 60000,
            });
            if (this.verbose) {
                console.log('Page loaded, waiting 5 seconds for dynamic content...');
            }
            // Fixed 5-second pause so client-side rendering can finish
            await page.waitForTimeout(5000);
            // Best-effort check that a <body> exists; a timeout here is not fatal
            try {
                await page.waitForSelector('body', { timeout: 5000 });
                if (this.verbose) {
                    console.log('Content detected on page');
                }
            }
            catch (selectorError) {
                if (this.verbose) {
                    console.log('Proceeding with current content...');
                }
            }
            const html = await page.content();
            if (this.verbose) {
                console.log(`Successfully fetched ${html.length} characters using Playwright`);
            }
            return html;
        }
        catch (error) {
            // Keep the original error reachable for debugging
            throw new Error(`Playwright failed: ${error instanceof Error ? error.message : String(error)}`, { cause: error });
        }
        finally {
            // Close independently: a failing page.close() must not skip
            // browser.close() (process leak) or mask the fetch outcome.
            await this.closeQuietly(page, 'page');
            await this.closeQuietly(browser, 'browser');
        }
    }
    /**
     * Extract structured CV data from already-converted text using AI.
     *
     * URL inputs are not handled here; call `processUrlToTexts` for those.
     *
     * @param {Array} texts - Passed to the provider as-is (presumably the text
     *   chunks of the CV - confirm against the caller).
     * @param {*} [conversionType=ConversionType.PdfToImages] - Only recorded in
     *   `metadata.conversionType`; it does not change how `texts` is processed.
     * @returns {Promise<object>} Provider result with `metadata` (including
     *   `emptinessPercentage`) attached.
     * @throws {Error} When instructions are missing or the provider fails.
     *   Errors are logged and rethrown.
     */
    async processCv(texts, conversionType = AIProvider_1.ConversionType.PdfToImages) {
        const startTime = Date.now();
        try {
            const dataSchema = this.buildDataSchema();
            const instructions = await this.loadInstructions();
            if (!instructions) {
                throw new Error('No instructions found');
            }
            const cvData = await this.aiProvider.extractStructuredDataFromText(texts, dataSchema, instructions, this.categories);
            const processingTime = (Date.now() - startTime) / 1000;
            console.log(`[AICVProcessor] Processing completed in ${processingTime.toFixed(2)} seconds`);
            // Metadata is attached before the emptiness evaluation below
            this.attachRunMetadata(cvData, processingTime, conversionType, instructions + JSON.stringify(texts));
            const emptinessResult = EmptinessPercentageCalculator_1.EmptinessPercentageCalculator.calculateEmptinessPercentage(cvData, this.expectedTotalFields);
            cvData.metadata.emptinessPercentage = emptinessResult;
            // Standard log message for the score (not conditional on verbose)
            console.log(`[AICVProcessor] Emptiness Percentage score: ${emptinessResult.percentage}%`);
            if (this.verbose) {
                console.log(`[AICVProcessor] Emptiness percentage: ${emptinessResult.percentage}%`);
                console.log(`[AICVProcessor] Total fields: ${emptinessResult.totalFields}, Non-empty fields: ${emptinessResult.nonEmptyFields}`);
                if (emptinessResult.expectedTotalFields) {
                    console.log(`[AICVProcessor] Expected emptiness percentage: ${emptinessResult.expectedPercentage}% (based on ${emptinessResult.expectedTotalFields} expected fields)`);
                }
            }
            return cvData;
        }
        catch (error) {
            console.error(`Error processing CV: ${error}`);
            throw error;
        }
    }
    /**
     * Make a value safe to embed in a single file-name segment by replacing
     * path separators (model ids such as `org/model` would otherwise point
     * into a non-existent subdirectory).
     *
     * @private
     * @param {*} value
     * @returns {string}
     */
    toFileNameSegment(value) {
        return String(value).replace(/[\\/]/g, '-');
    }
    /**
     * Save CV data as pretty-printed JSON and trigger report generation.
     *
     * The file is written to `<dir>/<base>_<YYYY-MM-DD>/<provider>_<model>[_<secs>s]<ext>`,
     * where `<dir>`, `<base>` and `<ext>` come from `outputPath`. Report
     * generation runs in the background; its failure is logged, not thrown.
     *
     * @param {object} cvData - Data to serialize; `metadata` is optional.
     * @param {string} outputPath - Template path supplying directory, base name
     *   and extension.
     * @throws {Error} When the directory or file cannot be written.
     */
    saveToJson(cvData, outputPath) {
        try {
            const dateStamp = new Date().toISOString().split('T')[0];
            const providerName = this.toFileNameSegment(cvData.metadata?.provider || 'unknown');
            const modelName = this.toFileNameSegment(cvData.metadata?.model || 'unknown');
            // isFinite keeps a legitimate 0 and skips non-numeric values
            const elapsedSeconds = cvData.metadata?.processingTime;
            const processingTime = Number.isFinite(elapsedSeconds)
                ? `_${elapsedSeconds.toFixed(2)}s`
                : '';
            const outputDir = path.dirname(outputPath);
            const outputExt = path.extname(outputPath);
            const outputBaseName = path.basename(outputPath, outputExt);
            // One result directory per base name and calendar day (UTC)
            const resultDir = path.join(outputDir, `${outputBaseName}_${dateStamp}`);
            fs.mkdirSync(resultDir, { recursive: true });
            const finalOutputPath = path.join(resultDir, `${providerName}_${modelName}${processingTime}${outputExt}`);
            fs.writeFileSync(finalOutputPath, JSON.stringify(cvData, null, 2));
            console.log(`Results saved to ${finalOutputPath}`);
            // Fire-and-forget: the report is regenerated for the whole directory
            reportGenerator_1.ReportGenerator.generateAndSaveReport(resultDir, this.verbose)
                .then(() => {
                    console.log(`Report generated for ${resultDir}`);
                })
                .catch((error) => {
                    console.error(`Error generating report: ${error}`);
                });
        }
        catch (error) {
            console.error(`Error saving JSON file: ${error}`);
            throw error;
        }
    }
}
// Publish the class on the CommonJS exports object (replaces the earlier `void 0` placeholder).
exports.AICVProcessor = AICVProcessor;