@thecodingwhale/cv-processor

Version:

CV Processor to extract structured data from PDF resumes using TypeScript

91 lines (78 loc) • 2.71 kB

text/typescript

import { execFile } from 'child_process'
import * as fs from 'fs'
import * as os from 'os'
import * as path from 'path'
import pdfParse from 'pdf-parse'
import { promisify } from 'util'

// execFile hands the argument array straight to the program without going
// through a shell, so characters such as quotes, `$` or backticks in a file
// path cannot break the command line or inject additional commands.
const execFileAsync = promisify(execFile)

/**
 * Converts a PDF file to base64-encoded PNG images using pdftoppm.
 * Requires poppler-utils to be installed (the `pdftoppm` binary must be on PATH).
 *
 * Pages are rendered at 200 DPI into a temporary directory, read back into
 * memory, and the temporary directory is removed before the function settles,
 * whether it succeeds or fails.
 *
 * @param pdfPath - The file path of the PDF to convert
 * @returns A promise that resolves to an array of `data:image/png;base64,...`
 *   URLs, ordered by the sorted names of the generated image files
 * @throws Rethrows any error raised while running pdftoppm or reading its output
 */
async function convertPdfToImages(pdfPath: string): Promise<string[]> {
  console.log(`[convertPdfToImages] Creating temp directory for PDF images`)
  const tempDir = await fs.promises.mkdtemp(
    path.join(os.tmpdir(), 'pdf-images-')
  )
  console.log(`[convertPdfToImages] Temp directory created: ${tempDir}`)

  try {
    // Resolving to an absolute path keeps a relative path that starts with
    // '-' from being interpreted by pdftoppm as a command-line option.
    const args = [
      '-png',
      '-r',
      '200',
      path.resolve(pdfPath),
      path.join(tempDir, 'page'),
    ]
    console.log(
      `[convertPdfToImages] Executing command: pdftoppm ${args.join(' ')}`
    )
    await execFileAsync('pdftoppm', args)

    const entries = await fs.promises.readdir(tempDir)
    const files = entries.filter((file) => file.endsWith('.png'))
    console.log(
      `[convertPdfToImages] Found ${files.length} image files: ${files.join(
        ', '
      )}`
    )

    // Plain lexicographic sort of the full paths.
    // NOTE(review): page order relies on pdftoppm zero-padding the page
    // number in the output file names - confirm for the poppler version in use.
    const sortedFiles = files.map((file) => path.join(tempDir, file)).sort()
    console.log(
      `[convertPdfToImages] Sorted file paths: ${sortedFiles.join(', ')}`
    )

    // Read sequentially so the per-image log lines appear in page order.
    const imageUrls: string[] = []
    for (const file of sortedFiles) {
      const data = await fs.promises.readFile(file)
      const base64 = data.toString('base64')
      console.log(
        `[convertPdfToImages] Converted image ${file}, size: ${base64.length} chars`
      )
      imageUrls.push(`data:image/png;base64,${base64}`)
    }

    console.log(
      `[convertPdfToImages] Returning ${imageUrls.length} base64 image URLs`
    )
    return imageUrls
  } catch (error) {
    console.error('[convertPdfToImages] Error converting PDF to images:', error)
    throw error
  } finally {
    // The images are already held in memory as base64 strings, so the
    // rendered files are no longer needed. A cleanup failure is logged
    // rather than thrown so it cannot mask the result or the original error.
    try {
      await fs.promises.rm(tempDir, { recursive: true, force: true })
    } catch (cleanupError) {
      console.error(
        `[convertPdfToImages] Failed to remove temp directory ${tempDir}:`,
        cleanupError
      )
    }
  }
}

/**
 * Convert PDF to text using pdf-parse.
 *
 * pdf-parse returns the text of the whole document as a single string, so the
 * page boundaries are approximated: the text is split wherever a line contains
 * nothing but digits (optionally surrounded by whitespace), which is treated
 * as a printed page number. Documents without such lines come back as a single
 * chunk, and a digits-only line inside the content also causes a split.
 *
 * @param pdfPath Path to the PDF file
 * @returns Array of non-empty text chunks, one per detected page
 * @throws Rethrows any error raised while reading or parsing the file
 */
export async function convertPdfToTexts(pdfPath: string): Promise<string[]> {
  try {
    // Read the PDF file without blocking the event loop
    const dataBuffer = await fs.promises.readFile(pdfPath)

    // Parse the PDF
    const data = await pdfParse(dataBuffer)

    // Split the text on digits-only lines and drop whitespace-only chunks
    const pages = data.text
      .split(/\n\s*\d+\s*\n/)
      .filter((page: string) => page.trim().length > 0)

    return pages
  } catch (error) {
    console.error('Error converting PDF to text:', error)
    throw error
  }
}

export { convertPdfToImages }