@thecodingwhale/cv-processor
Version: 1.0.0
CV Processor to extract structured data from PDF resumes using TypeScript
195 lines (169 loc) • 6.18 kB
text/typescript
/**
* CV Processor CLI - Extract structured data from CV/resume PDFs
*
* Usage:
* npx cv-processor-ts input.pdf
*
* Output:
* Creates a JSON file with the same name (input.json) containing the extracted CV data
*/
import { Command } from 'commander'
import * as dotenv from 'dotenv'
import * as fs from 'fs'
import * as path from 'path'
import { AICVProcessor } from './AICVProcessor'
import { AIProviderFactory, AIProviderType } from './ai/AIProviderFactory'
import registerCreateCsvCommand from './cli/createCsv'
import registerMergeReportsCommand from './cli/mergeReports'
import { CVData } from './types'
import { AIProvider, ConversionType } from './types/AIProvider'
import { getAIConfig } from './utils/aiConfig'
import pdfParse = require('pdf-parse')
/**
 * Convert a PDF file to an array of per-page text strings using pdf-parse.
 *
 * pdf-parse returns the whole document as a single string; pages are
 * recovered heuristically by splitting on lines that contain only a page
 * number, so the split is best-effort rather than exact.
 *
 * @param pdfPath Path to the PDF file on disk
 * @returns Array of non-empty text chunks, one per detected page
 * @throws Re-throws any read/parse error after logging it with context
 */
export async function convertPdfToTexts(pdfPath: string): Promise<string[]> {
  try {
    // Read asynchronously: the original used fs.readFileSync, which blocks
    // the event loop inside an async function (noticeable on large PDFs).
    const dataBuffer = await fs.promises.readFile(pdfPath)
    // Parse the PDF into a single text blob
    const data = await pdfParse(dataBuffer)
    // pdf-parse does not expose page boundaries directly; split on
    // standalone page numbers (e.g. "\n 1 \n") as a heuristic and drop
    // whitespace-only fragments.
    const pages = data.text
      .split(/\n\s\d+\s\n/)
      .filter((page: string) => page.trim().length > 0)
    return pages
  } catch (error) {
    console.error('Error converting PDF to text:', error)
    throw error
  }
}
// Pull variables from a local .env file into process.env before any
// configuration code reads them.
dotenv.config()

// Root CLI command; subcommands are attached to this instance below.
const program = new Command()
program.name('cv-processor-ts')
program.description('Extract structured data from CV/resume PDF')
program.version('1.0.0')
// Register the 'process' subcommand. It is marked as the default command so
// that `cv-processor-ts input.pdf` (without the explicit 'process' keyword)
// works, matching the usage shown in the file header and the backward-
// compatibility note below — previously the comment claimed this but the
// command was never actually registered as default.
program
  .command('process', { isDefault: true })
  .description('Process a CV/resume PDF file or URL')
  .argument('<input>', 'Path to the CV/resume PDF file or URL to process')
  .option(
    '-o, --output <file>',
    'Output JSON file (defaults to input filename with .json extension)'
  )
  .option('-v, --verbose', 'Verbose output')
  .option(
    '--use-ai [provider]',
    'Use AI for processing (gemini, openai, azure, grok, aws)'
  )
  .option('--ai-model <model>', 'AI model to use (default depends on provider)')
  .option(
    '--accuracy-calculator [type]',
    'Type of accuracy calculator to use (traditional, null-based)',
    'traditional'
  )
  .option(
    '--conversion-type <type>',
    'Type of conversion to use (pdftoimages, pdftotexts, urltotexts)',
    'pdftoimages'
  )
  .option(
    '--instructions-path <path>',
    'Path to the instructions file (defaults to instructions.txt in project root)'
  )
  .option(
    '--expected-total-fields <number>',
    'Expected total number of fields for emptiness percentage calculation',
    parseInt
  )
  .action(async (input, options) => {
    try {
      // URL inputs skip the local-file existence check
      const isUrl = input.startsWith('http://') || input.startsWith('https://')
      if (!isUrl && !fs.existsSync(input)) {
        console.error(`Error: Input file not found: ${input}`)
        process.exit(1)
      }

      // Default output name: input basename with a .json extension, or a
      // timestamped name for URL inputs.
      const outputFile =
        options.output ||
        (isUrl
          ? `url-${Date.now()}.json`
          : `${path.basename(input, path.extname(input))}.json`)

      const startTime = new Date()
      console.log(`Starting CV processing at ${startTime.toISOString()}`)

      // NOTE(review): when --use-ai is omitted, providerType is undefined and
      // is passed straight through — presumably getAIConfig/createProvider
      // reject or default it; confirm and consider failing fast here.
      const providerType = options.useAi as AIProviderType
      console.log(`Using AI processing with provider: ${providerType}`)

      // Resolve provider configuration and build the processor
      const aiConfig = getAIConfig(providerType, options.aiModel)
      const aiProvider = AIProviderFactory.createProvider(
        providerType,
        aiConfig
      )
      const processor = new AICVProcessor(aiProvider, {
        verbose: options.verbose,
        instructionsPath:
          options.instructionsPath ||
          path.join(process.cwd(), 'instructions.txt'),
        expectedTotalFields: options.expectedTotalFields,
      })

      // Map the CLI flag onto the ConversionType enum (default: images)
      const conversionType =
        options.conversionType === 'pdftotexts'
          ? ConversionType.PdfToTexts
          : options.conversionType === 'urltotexts'
          ? ConversionType.UrlToTexts
          : ConversionType.PdfToImages
      console.log(`Using conversion type: ${conversionType}`)

      const cvData = await processor.processCv(input, conversionType)
      processor.saveToJson(cvData, outputFile)

      const processingTime = (new Date().getTime() - startTime.getTime()) / 1000
      console.log(
        `CV processing completed in ${processingTime.toFixed(2)} seconds`
      )
    } catch (error) {
      console.error(`Error processing CV: ${error}`)
      process.exit(1)
    }
  })
// Register the merge-reports subcommand (implemented in cli/mergeReports.ts)
registerMergeReportsCommand(program)
// Register the create-csv subcommand (implemented in cli/createCsv.ts)
registerCreateCsvCommand(program)
// Parse argv and dispatch to the matching subcommand's action handler
program.parse(process.argv)
// Fallback: with no CLI arguments at all, print help and exit.
// NOTE(review): this runs after parse(); commander may already have printed
// help or an error and exited for an empty argv — confirm this branch is
// actually reachable.
if (process.argv.length <= 2) {
  program.help()
}
/**
* Process a CV PDF and extract structured information using AI
* @param pdfPath Path to the PDF file
* @param aiProvider AI provider to use for processing
* @param options Processing options
* @param conversionType Type of conversion to use (default: PdfToTexts)
* @returns Promise resolving to structured CV data
*/
export async function processCv(
pdfPath: string,
aiProvider: AIProvider,
options: { verbose?: boolean; instructionsPath?: string } = {},
conversionType: ConversionType = ConversionType.PdfToTexts
): Promise<CVData> {
const processor = new AICVProcessor(aiProvider, options)
const texts = await convertPdfToTexts(pdfPath)
return processor.processCv(texts, conversionType)
}
export { AICVProcessor, ConversionType }
export type { AIProvider, CVData }