@thecodingwhale/cv-processor
Version:
CV Processor to extract structured data from PDF resumes using TypeScript
109 lines (108 loc) • 4.66 kB
JavaScript
;
/**
 * Re-export helper: exposes property `k` of module `m` on object `o` under the
 * name `k2` (which defaults to `k`).
 *
 * When property descriptors are available, the original descriptor is reused
 * only if it is an accessor on an `__esModule` object or a data property that
 * is neither writable nor configurable; in every other case a getter that
 * reads `m[k]` on each access is installed so the binding stays live.
 * Without `Object.create`, the current value is simply copied.
 */
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (o, m, k, k2) {
        var exportedName = k2 === undefined ? k : k2;
        var descriptor = Object.getOwnPropertyDescriptor(m, k);
        var needsLiveGetter;
        if (!descriptor) {
            // Property is missing on the source: still bind lazily.
            needsLiveGetter = true;
        }
        else if ("get" in descriptor) {
            // Accessors are trusted as-is only on real ES-module namespaces.
            needsLiveGetter = !m.__esModule;
        }
        else {
            // A mutable data property could change later, so read through.
            needsLiveGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsLiveGetter) {
            descriptor = {
                enumerable: true,
                get: function () {
                    return m[k];
                }
            };
        }
        Object.defineProperty(o, exportedName, descriptor);
    }
    : function (o, m, k, k2) {
        o[k2 === undefined ? k : k2] = m[k];
    });
/**
 * Attaches `v` to `o` as its `default` export.
 *
 * Uses an enumerable, read-only property definition when `Object.create` is
 * available, and a plain assignment otherwise.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (!Object.create) {
        return function (o, v) {
            o["default"] = v;
        };
    }
    return function (o, v) {
        Object.defineProperty(o, "default", { value: v, enumerable: true });
    };
})();
/**
 * Namespace-import helper (`import * as x`).
 *
 * Objects already flagged `__esModule` are returned untouched. Anything else
 * is wrapped in a fresh object that re-exports every own property except
 * "default" via __createBinding and exposes the original value as `default`
 * via __setModuleDefault.
 */
var __importStar = (this && this.__importStar) || (function () {
    // Picks the key-listing strategy on first use, then replaces itself with it.
    var listOwnKeys = function (o) {
        listOwnKeys = Object.getOwnPropertyNames || function (obj) {
            var names = [];
            for (var key in obj) {
                if (Object.prototype.hasOwnProperty.call(obj, key)) {
                    names[names.length] = key;
                }
            }
            return names;
        };
        return listOwnKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var result = {};
        if (mod != null) {
            var keys = listOwnKeys(mod);
            for (var i = 0; i < keys.length; i++) {
                if (keys[i] === "default") {
                    continue;
                }
                __createBinding(result, mod, keys[i]);
            }
        }
        __setModuleDefault(result, mod);
        return result;
    };
})();
/**
 * Default-import helper: returns `mod` unchanged when it is flagged
 * `__esModule`, otherwise wraps it as `{ default: mod }`.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
// Flag this CommonJS module with `__esModule`, the marker the interop helpers
// above check before deciding whether to wrap an imported module.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare the export as undefined; the class is assigned at the end of the file.
exports.TextExtractor = void 0;
// Node built-ins, loaded as namespace imports.
const fs = __importStar(require("fs"));
const path = __importStar(require("path"));
// pdf-parse is loaded as a default import and invoked through `pdf_parse_1.default`.
const pdf_parse_1 = __importDefault(require("pdf-parse"));
// tesseract.js is loaded as a namespace import; `Tesseract.recognize` performs the OCR.
const Tesseract = __importStar(require("tesseract.js"));
/**
 * Class for extracting text from PDF documents.
 *
 * The PDF is parsed with pdf-parse first; when that yields too little text,
 * or fails, an OCR fallback based on Tesseract.js is attempted.
 *
 * Extraction failures are never thrown to the caller: they are logged and
 * reported through the returned string, which then starts with "Error:".
 * Callers that need to distinguish real text from a failure must check for
 * that prefix.
 */
class TextExtractor {
    /**
     * Extract text from a PDF file, with OCR fallback if needed.
     *
     * @param {string} pdfPath - Path of the PDF file to read.
     * @param {number} [minTextLength=100] - Parsed text (after trimming) must
     *   be longer than this many characters to be accepted; otherwise the OCR
     *   fallback is used. The default assumes a CV has at least 100 chars.
     * @returns {Promise<string>} The parsed text, or whatever
     *   extractTextWithOCR returns when parsing fails or yields too little.
     */
    async extractTextFromPDF(pdfPath, minTextLength = 100) {
        console.log(`Extracting text from PDF: ${pdfPath}`);
        try {
            // Read asynchronously: a synchronous read inside an async method
            // would block the event loop for the duration of the file read.
            const dataBuffer = await fs.promises.readFile(pdfPath);
            // Parse the PDF
            const pdfData = await (0, pdf_parse_1.default)(dataBuffer);
            const text = pdfData.text;
            // Check if we got meaningful text (more than just whitespace)
            if (text.trim().length > minTextLength) {
                console.log('Successfully extracted text from PDF');
                return text;
            }
            // If not much text was extracted, try OCR
            console.log('Not enough text extracted, trying OCR...');
        }
        catch (error) {
            console.error(`Error extracting text from PDF: ${error}`);
            console.log('Falling back to OCR due to error');
        }
        // Single fallback exit shared by the "too little text" and error paths.
        return this.extractTextWithOCR(pdfPath);
    }
    /**
     * Extract text using OCR with Tesseract.js.
     *
     * Note: This is a simplified placeholder. It does not convert the PDF
     * itself; it expects an image named `<pdf base name>_page_1.png` to exist
     * already, resolved relative to the current working directory (the PDF's
     * own directory is not used), and it only processes that first page.
     *
     * For a production implementation, you would:
     * 1. Convert PDF pages to images using a library like pdf2pic or pdf-poppler
     * 2. Process each image with Tesseract
     * 3. Combine the results
     *
     * @param {string} pdfPath - Path of the PDF; only its base name is used.
     * @returns {Promise<string>} The recognized text, or a string starting
     *   with "Error:" when the image is missing or OCR fails.
     */
    async extractTextWithOCR(pdfPath) {
        console.log('Starting OCR processing...');
        try {
            const pdfName = path.basename(pdfPath, path.extname(pdfPath));
            const imagePath = `${pdfName}_page_1.png`;
            // Check if the image exists (in a real implementation, you'd generate this)
            if (!fs.existsSync(imagePath)) {
                console.warn(`Image ${imagePath} not found for OCR. Would need PDF to image conversion first.`);
                return 'Error: PDF to image conversion required for OCR.';
            }
            // Perform OCR on the image
            const { data } = await Tesseract.recognize(imagePath, 'eng');
            console.log('OCR processing completed');
            return data.text;
        }
        catch (error) {
            // Also reached when pdfPath is not a string (the path helpers throw).
            console.error(`Error extracting text with OCR: ${error}`);
            return 'Error: Could not extract text from PDF.';
        }
    }
}
// Publish the class as this module's named export `TextExtractor`.
exports.TextExtractor = TextExtractor;