// Package: @galihru/tvmai (version not specified)
// Training and Evaluation AI Model Recommendation Engine for WebNN
// File listing metadata: 64 lines (63 loc), 2.19 kB, JavaScript
// src/loaders/pdfLoader.ts
import JSZip from 'jszip';
import pdfParse from 'pdf-parse';
/** Captures the top-level folder of a zip entry path; that folder name is the class label. */
const CLASS_FOLDER_PATTERN = /^([^\/]+)\//;

/**
 * Summarises a zipped PDF dataset laid out as `<class>/<file>.pdf`.
 *
 * Every `.pdf` entry that sits under a folder is counted towards the class named
 * by its top-level folder; PDFs at the archive root are ignored. Only a sample of
 * the counted PDFs is actually parsed (every `sampleStride`-th one, starting with
 * the first), and the text-length and page averages are computed over that sample.
 *
 * @param zipFile Archive contents, passed straight to `JSZip#loadAsync`.
 * @param sampleStride Parse one out of every `sampleStride` counted PDFs.
 *   Defaults to 20; anything that is not a number >= 1 is treated as 1.
 * @returns An object with `type` (always `'pdf'`), `size` (number of counted PDFs),
 *   `classes` (number of distinct class folders), `classDistribution` (PDF count per
 *   class), `avgTextLength` and `avgPages` (means over the parsed sample, 0 when
 *   nothing was sampled) and `vocabSize` (10% of `avgTextLength`, rounded).
 * @throws Rejects when the archive cannot be loaded or any sampled PDF fails to parse.
 */
export const loadPDFDataset = async (zipFile, sampleStride = 20) => {
  const zip = new JSZip();
  await zip.loadAsync(zipFile);

  const classes = [];
  const pdfCounts = {};
  const pdfEntries = [];

  // Single pass: collect every .pdf entry that lives inside a class folder.
  zip.forEach((relativePath, file) => {
    if (file.dir || !relativePath.endsWith('.pdf')) {
      return;
    }
    const folderMatch = relativePath.match(CLASS_FOLDER_PATTERN);
    if (!folderMatch) {
      return;
    }
    const className = folderMatch[1];
    if (!classes.includes(className)) {
      classes.push(className);
      pdfCounts[className] = 0;
    }
    pdfCounts[className]++;
    pdfEntries.push(file);
  });

  // Guard the stride so zero, negative, fractional (< 1) or non-numeric values
  // cannot disable sampling or divide the index space incorrectly.
  const stride = Math.max(1, Math.floor(Number(sampleStride)) || 1);

  // Sample by position among the counted PDFs, not by raw zip entry index:
  // directories and unrelated files must not shift (or entirely skip) the sample.
  const sampledEntries = pdfEntries.filter((_, index) => index % stride === 0);

  const parsedSamples = await Promise.all(
    sampledEntries.map(async (file) => {
      const arrayBuf = await file.async('arraybuffer');
      const data = await pdfParse(Buffer.from(arrayBuf));
      return { textLength: data.text.length, pages: data.numpages };
    })
  );

  let totalTextLength = 0;
  let totalPages = 0;
  for (const sample of parsedSamples) {
    totalTextLength += sample.textLength;
    totalPages += sample.pages;
  }

  // An archive without class PDFs yields an empty sample; report 0 instead of NaN.
  const sampleCount = parsedSamples.length;
  const avgTextLength = sampleCount ? totalTextLength / sampleCount : 0;
  const avgPages = sampleCount ? totalPages / sampleCount : 0;

  return {
    type: 'pdf',
    size: pdfEntries.length,
    classes: classes.length,
    classDistribution: pdfCounts,
    avgTextLength,
    avgPages,
    vocabSize: Math.round(avgTextLength * 0.1),
  };
};