/*
 * Package: @thecodingwhale/cv-processor
 * Version: (not recorded)
 * Description: CV Processor to extract structured data from PDF resumes using TypeScript.
 * File: compiled JavaScript, 137 lines (132 loc), 5.12 kB
 */
;
// Compiler-emitted re-export helper: exposes property `key` of `source` on
// `target` under `alias` (or under `key` when no alias is supplied).
var __createBinding = (this && this.__createBinding) || (Object.create ? (function (target, source, key, alias) {
    var exportName = alias === undefined ? key : alias;
    var descriptor = Object.getOwnPropertyDescriptor(source, key);
    // The source descriptor is reused only when it is an accessor on an
    // ES-module object, or a data property that is neither writable nor
    // configurable. Everything else gets a forwarding getter so the
    // re-export always reflects the current value on `source`.
    var keepDescriptor = false;
    if (descriptor) {
        keepDescriptor = "get" in descriptor
            ? Boolean(source.__esModule)
            : !(descriptor.writable || descriptor.configurable);
    }
    if (!keepDescriptor) {
        descriptor = { enumerable: true, get: function () { return source[key]; } };
    }
    Object.defineProperty(target, exportName, descriptor);
}) : (function (target, source, key, alias) {
    // Fallback for engines without Object.create: plain value copy.
    target[alias === undefined ? key : alias] = source[key];
}));
// Compiler-emitted helper: stores `value` as the `default` member of the
// namespace object `ns`. When Object.create exists the property is defined
// as enumerable (and, by defineProperty defaults, read-only); otherwise it
// is a plain assignment.
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        return function (ns, value) {
            Object.defineProperty(ns, "default", { enumerable: true, value: value });
        };
    }
    return function (ns, value) {
        ns["default"] = value;
    };
})();
// Compiler-emitted helper backing `import * as x from "..."`.
// ES-module objects (flagged with __esModule) are returned untouched. Any
// other value is wrapped in a fresh namespace object: every own property
// except "default" is re-exported via __createBinding, and the original
// value itself is stored under `default` via __setModuleDefault.
var __importStar = (this && this.__importStar) || (function () {
    var listOwnKeys = Object.getOwnPropertyNames || function (obj) {
        var names = [];
        for (var name in obj) {
            if (Object.prototype.hasOwnProperty.call(obj, name)) {
                names.push(name);
            }
        }
        return names;
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            listOwnKeys(mod).forEach(function (name) {
                if (name !== "default") {
                    __createBinding(namespace, mod, name);
                }
            });
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
// Flag this CommonJS module as ES-module output; __importStar (defined above)
// returns objects carrying this flag unchanged instead of wrapping them.
Object.defineProperty(exports, "__esModule", { value: true });
// Declare the export slot up front; it holds undefined until it is assigned.
exports.AICVProcessorX = void 0;
// Node's file-system module, wrapped in a namespace object by __importStar.
const fs = __importStar(require("fs"));
// NOTE(review): __importStar stores a non-ES module's export value under
// `.default`, so if pdf-parse exports a bare function it is reachable as
// pdfParse.default rather than as a callable pdfParse — confirm against the
// installed pdf-parse version.
const pdfParse = __importStar(require("pdf-parse"));
/**
 * Turns a PDF resume into structured credit data: the PDF text is extracted
 * locally, obvious non-credit sections are stripped, and the remainder is
 * sent to an AI provider in a single completion call whose JSON answer is
 * parsed and returned.
 *
 * The provider must expose `complete({ prompt, maxTokens, temperature })`
 * resolving to an object with a `text` field (the JSON answer) and an
 * optional `usage` field (`prompt_tokens`, `completion_tokens`,
 * `total_tokens`).
 */
class AICVProcessorX {
    /**
     * @param {object} aiProvider - Object exposing an async `complete()` method.
     * @param {boolean} [verbose=false] - When true, progress is logged to the console.
     */
    constructor(aiProvider, verbose = false) {
        this.resetTokenUsage();
        this.aiProvider = aiProvider;
        this.verbose = verbose;
    }

    /**
     * Runs the full pipeline for one PDF file.
     *
     * @param {string} pdfPath - Path of the PDF to process.
     * @returns {Promise<*>} The value parsed from the provider's JSON answer.
     * @throws {TypeError} If the pdf-parse module exposes no callable parser.
     * @throws {SyntaxError} If the provider's answer is not valid JSON.
     */
    async processCv(pdfPath) {
        console.log(`Processing CV: ${pdfPath}`);
        this.resetTokenUsage();

        // Step 1: extract text from the PDF locally. The read is asynchronous
        // so a large file does not block the event loop.
        const pdfBuffer = await fs.promises.readFile(pdfPath);
        const parsePdf = this.resolvePdfParser();
        const parsed = await parsePdf(pdfBuffer);
        let extractedText = parsed.text;
        if (this.verbose) {
            console.log(':white_check_mark: PDF text extracted (local)');
            console.log(`Length: ${extractedText.length} characters`);
        }

        // Step 2: pre-clean the text to drop irrelevant sections.
        extractedText = this.cleanExtractedText(extractedText);
        if (this.verbose) {
            console.log(':white_check_mark: Cleaned text ready for AI processing');
        }

        // Step 3: send the cleaned text to the AI with a structured prompt.
        const aiPrompt = this.buildAIPrompt(extractedText);
        const aiResponse = await this.aiProvider.complete({
            prompt: aiPrompt,
            maxTokens: 1500, // adjust as needed
            temperature: 0.1,
        });
        this.addTokenUsageFromResponse(aiResponse.usage);
        if (this.verbose) {
            console.log(':white_check_mark: AI completed pattern extraction');
        }
        return this.parseJsonResponse(aiResponse.text);
    }

    /**
     * Returns the callable PDF parser from the module-level `pdfParse` binding.
     *
     * That binding is produced by the compiler's namespace-import helper,
     * which wraps a non-ES module in an object and stores the module's own
     * export under `default`. Calling the wrapper directly therefore fails
     * when the module exports a bare function, so both shapes are accepted.
     *
     * @returns {Function} The parser function.
     * @throws {TypeError} If neither shape yields a function.
     */
    resolvePdfParser() {
        if (typeof pdfParse === 'function') {
            return pdfParse;
        }
        if (pdfParse && typeof pdfParse.default === 'function') {
            return pdfParse.default;
        }
        throw new TypeError('pdf-parse did not export a callable parser');
    }

    /**
     * Parses the provider's answer as JSON, tolerating a surrounding Markdown
     * code fence (```json ... ```), which valid JSON can never start with.
     *
     * @param {*} text - The provider's raw answer.
     * @returns {*} The parsed JSON value.
     * @throws {SyntaxError} If the payload is not valid JSON.
     */
    parseJsonResponse(text) {
        if (typeof text !== 'string') {
            // Non-string input: defer to JSON.parse's own coercion and errors.
            return JSON.parse(text);
        }
        const trimmed = text.trim();
        const fenced = /^```[a-z]*\s*([\s\S]*?)\s*```$/i.exec(trimmed);
        return JSON.parse(fenced ? fenced[1] : trimmed);
    }

    /**
     * Removes the PROFILE section (up to the first line that starts with a
     * four-digit year) and the trailing NOTES section.
     *
     * Only heading lines count: the keyword must start its line and be
     * followed by nothing, or by a colon and free text. Matching the bare
     * word anywhere would cut the resume at credits such as "Footnotes" or
     * "Liner Notes".
     *
     * @param {string} text - Raw text extracted from the PDF.
     * @returns {string} The cleaned, trimmed text.
     */
    cleanExtractedText(text) {
        const profileSection = /^[ \t]*PROFILE[ \t]*(?::.*)?$[\s\S]*?(?=\n\d{4})/im;
        const notesSection = /^[ \t]*NOTES[ \t]*(?::.*)?$[\s\S]*/im;
        return text
            .replace(profileSection, '')
            .replace(notesSection, '')
            .trim();
    }

    /**
     * Builds the extraction prompt: fixed instructions and target schema
     * followed by the resume text.
     *
     * @param {string} text - Cleaned resume text.
     * @returns {string} The complete prompt.
     */
    buildAIPrompt(text) {
        return `
You are an AI data extractor for an actor's resume system. I will provide you resume text. Extract credits and convert them into JSON matching this schema:
{
"resume": [
{
"category": "<Category>",
"category_id": "<UUIDv4>",
"credits": [
{ "id": "<UUIDv4>", "year": "YYYY", "title": "<Title>", "role": "<Role>", "director": "<Director>", "attached_media": [] }
]
}
],
"resume_show_years": true
}
:white_check_mark: Official categories:
["Commercial","Film","Television","Theatre","Print / Fashion","Training","Voice","Stunt","Corporate","MC/Presenting","Extras","Other"]
Categorization rules:
- Only use these categories.
- Map synonyms and related words logically to closest category.
- If unsure → assign to "Other".
- If director missing → "director": ""
- Each category + credit must have unique UUIDv4
- Ignore profile, skills, notes, memberships.
Resume text:
${text}
`;
    }

    /**
     * Adds one response's token counts to the running totals.
     *
     * @param {object} [usage] - Usage record with optional `prompt_tokens`,
     *   `completion_tokens` and `total_tokens`; ignored when falsy.
     */
    addTokenUsageFromResponse(usage) {
        if (!usage) {
            return;
        }
        const promptTokens = usage.prompt_tokens || 0;
        const completionTokens = usage.completion_tokens || 0;
        // If the provider omits the total, derive it from its parts so the
        // total and the cost estimate are not recorded as zero.
        const totalTokens = usage.total_tokens || promptTokens + completionTokens;
        this.tokenUsage.promptTokens += promptTokens;
        this.tokenUsage.completionTokens += completionTokens;
        this.tokenUsage.totalTokens += totalTokens;
        this.tokenUsage.estimatedCost += (totalTokens * 0.03) / 1000; // example pricing
    }

    /** Resets all token counters and the cost estimate to zero. */
    resetTokenUsage() {
        this.tokenUsage = {
            promptTokens: 0,
            completionTokens: 0,
            totalTokens: 0,
            estimatedCost: 0,
        };
    }
}
// Publish the processor class as this module's named CommonJS export.
exports.AICVProcessorX = AICVProcessorX;