// Package: @thecodingwhale/cv-processor
// Version: (not recorded)
// Description: CV Processor to extract structured data from PDF resumes using TypeScript.
// File listing: 140 lines (136 loc) • 4.82 kB
// Language: JavaScript
;
/**
 * Re-export helper: exposes `source[key]` on `target` under `alias`
 * (or under `key` when no alias is given).
 *
 * When property descriptors are available, the property is defined as an
 * enumerable getter that reads `source[key]` on each access, unless the source
 * already exposes an accessor and is flagged `__esModule`, or exposes a data
 * property that is neither writable nor configurable; in those cases the
 * source's own descriptor is reused. Without `Object.create`, the current
 * value is simply copied.
 */
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, key, alias) {
        const exportedAs = alias === undefined ? key : alias;
        let descriptor = Object.getOwnPropertyDescriptor(source, key);
        let wrapInGetter;
        if (!descriptor) {
            wrapInGetter = true;
        }
        else if ("get" in descriptor) {
            wrapInGetter = !source.__esModule;
        }
        else {
            wrapInGetter = descriptor.writable || descriptor.configurable;
        }
        if (wrapInGetter) {
            descriptor = { enumerable: true, get: function () { return source[key]; } };
        }
        Object.defineProperty(target, exportedAs, descriptor);
    }
    : function (target, source, key, alias) {
        const exportedAs = alias === undefined ? key : alias;
        target[exportedAs] = source[key];
    });
/**
 * Attaches `value` to `namespace` as its `default` member.
 *
 * With `Object.create` available the member is defined as an enumerable
 * property through a descriptor that sets only `enumerable` and `value`;
 * otherwise it is plainly assigned.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (namespace, value) {
        const descriptor = { enumerable: true, value: value };
        Object.defineProperty(namespace, "default", descriptor);
    }
    : function (namespace, value) {
        namespace["default"] = value;
    });
/**
 * Namespace-import helper.
 *
 * Modules flagged `__esModule` are returned untouched. Anything else is
 * wrapped in a fresh object: every own property except "default" is bound via
 * `__createBinding`, and the original module is attached as `default` via
 * `__setModuleDefault`. A null/undefined module yields an object carrying only
 * `default`.
 */
var __importStar = (this && this.__importStar) || (function () {
    // Picks the key-listing strategy on first call, then replaces itself with it.
    var listOwnKeys = function (subject) {
        listOwnKeys = Object.getOwnPropertyNames || function (obj) {
            var names = [];
            for (var name in obj) {
                if (Object.prototype.hasOwnProperty.call(obj, name)) {
                    names[names.length] = name;
                }
            }
            return names;
        };
        return listOwnKeys(subject);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            var keys = listOwnKeys(mod);
            for (var idx = 0; idx < keys.length; idx++) {
                var key = keys[idx];
                if (key === "default") {
                    continue;
                }
                __createBinding(namespace, mod, key);
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
/**
 * Default-import helper: modules flagged `__esModule` are returned as-is;
 * any other value is wrapped as `{ default: value }`.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
// Flag this module as `__esModule`; the interop helpers above return such modules unwrapped.
Object.defineProperty(exports, "__esModule", { value: true });
// Initialise the export slot to undefined; the class itself is assigned at the end of the file.
exports.AITextExtractor = void 0;
const fs = __importStar(require("fs"));
const pdf_parse_1 = __importDefault(require("pdf-parse"));
const uuid_1 = require("uuid");
/**
 * Walks `obj` recursively and swaps every property whose value is exactly the
 * string 'UUIDv4' for a freshly generated v4 UUID.
 *
 * The input is modified in place and also returned. Non-object inputs
 * (including null) are returned unchanged. Arrays are traversed like any
 * other object.
 *
 * @param {*} obj - Value to scan.
 * @returns {*} The same `obj` reference that was passed in.
 */
function replaceUUIDv4Placeholders(obj) {
    if (obj === null || typeof obj !== 'object') {
        return obj;
    }
    for (const prop in obj) {
        const current = obj[prop];
        if (current === 'UUIDv4') {
            obj[prop] = (0, uuid_1.v4)();
            continue;
        }
        if (typeof current === 'object') {
            // typeof null is 'object' too; the guard at the top of the recursive call absorbs it.
            replaceUUIDv4Placeholders(current);
        }
    }
    return obj;
}
/**
 * Extracts text from PDF files, builds the prompt that asks an AI model to
 * structure an actor's resume, and keeps a running total of token usage.
 */
class AITextExtractor {
    /**
     * Initialize the AI Text Extractor with zeroed usage counters.
     *
     * @param {*} aiProvider - Stored on the instance as `aiProvider`; no method
     *   of this class reads it.
     */
    constructor(aiProvider) {
        this.tokenUsage = {
            promptTokens: 0,
            completionTokens: 0,
            totalTokens: 0,
            estimatedCost: 0,
        };
        this.aiProvider = aiProvider;
    }
    /**
     * Read a PDF from disk, parse it, and return its cleaned text.
     *
     * @param {string} pdfPath - Path of the PDF file to read.
     * @returns {Promise<string>} Parsed text after `cleanText` has been applied.
     * @throws {Error} If the file does not exist; any read or parse error is
     *   logged with `console.error` and rethrown unchanged.
     */
    async extractTextFromPDF(pdfPath) {
        try {
            // Check if file exists
            if (!fs.existsSync(pdfPath)) {
                throw new Error(`PDF file does not exist: ${pdfPath}`);
            }
            const buffer = fs.readFileSync(pdfPath);
            const parsed = await (0, pdf_parse_1.default)(buffer);
            return this.cleanText(parsed.text);
        }
        catch (error) {
            console.error(`Error in extractTextFromPDF: ${error}`);
            throw error;
        }
    }
    /**
     * Strip sections that should not reach the model, then trim.
     *
     * Removes the first span that starts at "PROFILE" and ends just before the
     * first newline followed by four digits, then removes everything from the
     * first "NOTES" to the end of the text. Both matches are case-insensitive.
     *
     * NOTE(review): neither pattern is anchored to a line start or word
     * boundary, so they also match inside longer words (e.g. "Footnotes").
     *
     * @param {string} text - Raw text.
     * @returns {string} Cleaned, trimmed text.
     */
    cleanText(text) {
        return text
            .replace(/PROFILE[\s\S]*?(?=\n\d{4})/i, '')
            .replace(/NOTES[\s\S]*$/i, '')
            .trim();
    }
    /**
     * Build the extraction prompt: fixed instructions, the allowed category
     * list and the JSON schema, followed by the resume text.
     *
     * @param {string} text - Resume text appended verbatim at the end of the prompt.
     * @returns {string} The complete prompt.
     */
    buildPrompt(text) {
        return `
You are an AI data extractor for an actor's resume system. Only use these official categories:
["Commercial","Film","Television","Theatre","Print / Fashion","Training","Voice","Stunt","Corporate","MC/Presenting","Extras","Other"]
Map synonyms logically. Use "Other" if uncertain.
Schema:
{ "resume": [{ "category": "<Category>", "category_id": "<UUIDv4>", "credits": [{ "id": "<UUIDv4>", "year": "YYYY", "title": "<Title>", "role": "<Role>", "director": "<Director>", "attached_media": [] }] }], "resume_show_years": true }
Ignore profile, skills, notes.
Resume text:
${text}`;
    }
    /**
     * Add token usage from a response to the running total.
     *
     * Missing or falsy fields count as 0. A null/undefined `usage` adds
     * nothing instead of throwing.
     *
     * @param {{promptTokens?: number, completionTokens?: number, totalTokens?: number, estimatedCost?: number}|null|undefined} usage
     *   Usage figures to accumulate.
     * @returns {void}
     */
    addTokenUsage(usage) {
        if (usage == null) {
            return;
        }
        const totals = this.tokenUsage;
        totals.promptTokens += usage.promptTokens || 0;
        totals.completionTokens += usage.completionTokens || 0;
        totals.totalTokens += usage.totalTokens || 0;
        totals.estimatedCost = (totals.estimatedCost || 0) + (usage.estimatedCost || 0);
    }
    /**
     * Get token usage statistics.
     *
     * @returns {{promptTokens: number, completionTokens: number, totalTokens: number, estimatedCost: number}}
     *   A shallow copy, so mutating the result does not affect the running total.
     */
    getTokenUsage() {
        return { ...this.tokenUsage };
    }
    /**
     * Reset token usage statistics to zero.
     *
     * @returns {void}
     */
    resetTokenUsage() {
        this.tokenUsage = {
            promptTokens: 0,
            completionTokens: 0,
            totalTokens: 0,
            estimatedCost: 0,
        };
    }
}
// Publish the class as the module's named CommonJS export.
exports.AITextExtractor = AITextExtractor;