UNPKG

@thecodingwhale/cv-processor

Version:

CV Processor to extract structured data from PDF resumes using TypeScript

140 lines (136 loc) 4.82 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.AITextExtractor = void 0;

const fs = require("fs");
const { v4: uuidv4 } = require("uuid");
// pdf-parse may ship as plain CJS or as transpiled ESM; unwrap the default
// export either way (this replaces the generated __importDefault helper).
const pdfParseModule = require("pdf-parse");
const pdfParse = pdfParseModule && pdfParseModule.__esModule ? pdfParseModule.default : pdfParseModule;

/**
 * Recursively walk `obj` and replace every string value equal to the literal
 * placeholder "UUIDv4" with a freshly generated UUID v4.
 *
 * Mutates `obj` in place and also returns it for convenience. Arrays are
 * traversed too, since `typeof [] === 'object'`. Non-object inputs are
 * returned untouched.
 *
 * NOTE(review): this helper is defined but not referenced anywhere in this
 * file and is not exported — presumably used by callers of the AI response
 * post-processing elsewhere; confirm before removing.
 *
 * @param {*} obj - Value to scan; typically the parsed AI JSON response.
 * @returns {*} The same `obj`, with placeholders replaced.
 */
function replaceUUIDv4Placeholders(obj) {
    if (typeof obj === 'object' && obj !== null) {
        for (const key in obj) {
            // `=== 'UUIDv4'` already implies the value is a string, so the
            // original's extra typeof check was redundant.
            if (obj[key] === 'UUIDv4') {
                obj[key] = uuidv4();
            } else if (typeof obj[key] === 'object') {
                replaceUUIDv4Placeholders(obj[key]);
            }
        }
    }
    return obj;
}

/**
 * Class for extracting text from PDFs using AI models.
 *
 * Tracks cumulative token usage across AI calls; the `aiProvider` passed to
 * the constructor is stored but not invoked in this file.
 */
class AITextExtractor {
    /**
     * Initialize the AI Text Extractor.
     *
     * @param {object} aiProvider - AI provider handle, stored for later use.
     */
    constructor(aiProvider) {
        this.tokenUsage = {
            promptTokens: 0,
            completionTokens: 0,
            totalTokens: 0,
            estimatedCost: 0,
        };
        this.aiProvider = aiProvider;
    }

    /**
     * Extract and clean the text content of a PDF file.
     *
     * @param {string} pdfPath - Filesystem path to the PDF.
     * @returns {Promise<string>} Cleaned text (see {@link cleanText}).
     * @throws {Error} If the file does not exist or pdf-parse fails; the
     *   error is logged and rethrown unchanged.
     */
    async extractTextFromPDF(pdfPath) {
        try {
            if (!fs.existsSync(pdfPath)) {
                throw new Error(`PDF file does not exist: ${pdfPath}`);
            }
            // Non-blocking read: the original used readFileSync inside an
            // async method, which stalls the event loop on large files.
            const buffer = await fs.promises.readFile(pdfPath);
            const parsed = await pdfParse(buffer);
            return this.cleanText(parsed.text);
        }
        catch (error) {
            console.error(`Error in extractTextFromPDF: ${error}`);
            throw error;
        }
    }

    /**
     * Strip sections irrelevant to credit extraction.
     *
     * Removes a leading "PROFILE" section (everything up to the first line
     * starting with a 4-digit year) and a trailing "NOTES" section, then
     * trims surrounding whitespace.
     *
     * @param {string} text - Raw text from pdf-parse.
     * @returns {string} Cleaned resume text.
     */
    cleanText(text) {
        return text
            .replace(/PROFILE[\s\S]*?(?=\n\d{4})/i, '')
            .replace(/NOTES[\s\S]*$/i, '')
            .trim();
    }

    /**
     * Build the extraction prompt sent to the AI model.
     *
     * The prompt text is runtime data consumed by the model and is kept
     * byte-for-byte identical to the original.
     *
     * @param {string} text - Cleaned resume text.
     * @returns {string} Full prompt including the embedded resume text.
     */
    buildPrompt(text) {
        return ` You are an AI data extractor for an actor's resume system. Only use these official categories: ["Commercial","Film","Television","Theatre","Print / Fashion","Training","Voice","Stunt","Corporate","MC/Presenting","Extras","Other"] Map synonyms logically. Use "Other" if uncertain. Schema: { "resume": [{ "category": "<Category>", "category_id": "<UUIDv4>", "credits": [{ "id": "<UUIDv4>", "year": "YYYY", "title": "<Title>", "role": "<Role>", "director": "<Director>", "attached_media": [] }] }], "resume_show_years": true } Ignore profile, skills, notes. 
Resume text: ${text}`;
    }

    /**
     * Add token usage from a response to the running total.
     *
     * Missing fields on `usage` count as 0.
     *
     * @param {{promptTokens?: number, completionTokens?: number, totalTokens?: number, estimatedCost?: number}} usage
     */
    addTokenUsage(usage) {
        this.tokenUsage.promptTokens += usage.promptTokens || 0;
        this.tokenUsage.completionTokens += usage.completionTokens || 0;
        this.tokenUsage.totalTokens += usage.totalTokens || 0;
        // Consistent with the fields above; estimatedCost is always
        // initialized to 0 by the constructor / resetTokenUsage, so the
        // original's `(x || 0) +` guard was unnecessary.
        this.tokenUsage.estimatedCost += usage.estimatedCost || 0;
    }

    /**
     * Get a snapshot of the accumulated token usage statistics.
     *
     * @returns {{promptTokens: number, completionTokens: number, totalTokens: number, estimatedCost: number}}
     *   A shallow copy; callers cannot mutate internal state through it.
     */
    getTokenUsage() {
        return { ...this.tokenUsage };
    }

    /**
     * Reset all token usage statistics to zero.
     */
    resetTokenUsage() {
        this.tokenUsage = {
            promptTokens: 0,
            completionTokens: 0,
            totalTokens: 0,
            estimatedCost: 0,
        };
    }
}
exports.AITextExtractor = AITextExtractor;