nano-ai-pdf
Version:
This package helps you summarize PDFs using Gemini Nano on the edge or in the browser, making summarization compliance-safe, faster, and free.
81 lines • 3.86 kB
JavaScript
// Helper that runs a generator function to completion and exposes the outcome as a promise.
// Each value the generator yields is waited on; its settled result is fed back into the
// generator, so `yield` behaves like an await point.
// An already-defined `this.__awaiter` is reused when present; otherwise this implementation is used.
//   thisArg     - `this` value the generator is invoked with
//   _arguments  - argument list forwarded to the generator (an empty list when falsy)
//   P           - promise constructor to use; falls back to the global Promise when falsy
//   generator   - the generator function to drive
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wrap a yielded value in a P promise unless it already is an instance of P.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with a fulfilled value; anything the generator throws rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Throw a rejection reason into the generator so its own try/catch can handle it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// When the generator is done, resolve with its return value; otherwise wait on the yielded value.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// Instantiate the generator with the supplied `this`/arguments and run it to its first yield.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
import pdfToText from "react-pdftotext";
import { PdfSummarizerError } from "./types";
/**
 * Extracts text content from a PDF file
 *
 * The raw text returned by `pdfToText` is normalized before it is returned:
 * a run of whitespace that contains a blank line (two newlines, optionally
 * separated by other whitespace) becomes a single paragraph break ("\n\n"),
 * every other run of whitespace becomes a single space, and leading/trailing
 * whitespace is removed.
 *
 * @param file - The PDF file to extract text from (its `name` and `size` are logged)
 * @param minTextLength - Minimum required length of the normalized text (default: 100)
 * @returns Promise resolving to the extracted, normalized text
 * @throws PdfSummarizerError with code "EXTRACTION_EMPTY" if no string content is returned,
 *   "INSUFFICIENT_TEXT" if the normalized text is shorter than `minTextLength`,
 *   or "EXTRACTION_FAILED" if any other error occurs during extraction
 */
export function extractTextFromPdf(file_1) {
    return __awaiter(this, arguments, void 0, function* (file, minTextLength = 100) {
        try {
            console.log(`📖 Starting PDF text extraction for: ${file.name}`);
            console.log(`📄 File size: ${(file.size / 1024 / 1024).toFixed(2)} MB`);
            // Extract text using react-pdftotext
            const rawText = yield pdfToText(file);
            if (!rawText || typeof rawText !== "string") {
                throw new PdfSummarizerError("PDF text extraction returned no content", "EXTRACTION_EMPTY");
            }
            // Clean and normalize the extracted text in a single pass.
            // Paragraph breaks have to be detected while the newlines are still
            // present: collapsing all whitespace to spaces first would leave
            // nothing for a later blank-line replacement to match.
            const cleanedText = rawText
                .replace(/\s+/g, (run) => (/\n\s*\n/.test(run) ? "\n\n" : " "))
                .trim();
            console.log(`✅ Text extraction completed: ${cleanedText.length} characters`);
            // Validate minimum text length
            if (cleanedText.length < minTextLength) {
                throw new PdfSummarizerError(`PDF contains insufficient text content (${cleanedText.length} chars, minimum ${minTextLength} required)`, "INSUFFICIENT_TEXT");
            }
            return cleanedText;
        }
        catch (error) {
            const errorMessage = error instanceof Error ? error.message : "Unknown error";
            console.error(`❌ PDF text extraction failed: ${errorMessage}`);
            // Errors raised above already carry a specific code; pass them through untouched
            if (error instanceof PdfSummarizerError) {
                throw error;
            }
            // Anything else (library failure, bad input object, ...) is wrapped in the domain error type
            throw new PdfSummarizerError(`Failed to extract text from PDF: ${errorMessage}`, "EXTRACTION_FAILED");
        }
    });
}
/**
 * Validates a PDF file before processing
 *
 * Checks run in this order and the first failure throws: presence of the
 * file, MIME type, maximum size, and finally emptiness.
 *
 * @param file - The file to validate; its `name`, `type` and `size` properties are read
 * @param maxFileSize - Maximum allowed file size in bytes (default: 10 MB)
 * @throws PdfSummarizerError with code "NO_FILE", "INVALID_FILE_TYPE",
 *   "FILE_TOO_LARGE" or "EMPTY_FILE" if validation fails
 */
export function validatePdfFile(file, maxFileSize = 10 * 1024 * 1024) {
    // Check if file exists. This guard must run before any property of
    // `file` is read, otherwise a missing file surfaces as a TypeError
    // instead of the documented NO_FILE error.
    if (!file) {
        throw new PdfSummarizerError("No file provided", "NO_FILE");
    }
    console.log(`🔍 Validating PDF file: ${file.name}`);
    // Check file type
    if (file.type !== "application/pdf") {
        throw new PdfSummarizerError(`Invalid file type: ${file.type}. Only PDF files are supported.`, "INVALID_FILE_TYPE");
    }
    // Check file size
    if (file.size > maxFileSize) {
        const actualMb = (file.size / 1024 / 1024).toFixed(2);
        const allowedMb = (maxFileSize / 1024 / 1024).toFixed(2);
        throw new PdfSummarizerError(`File size (${actualMb} MB) exceeds maximum allowed size (${allowedMb} MB)`, "FILE_TOO_LARGE");
    }
    // Check for empty file
    if (file.size === 0) {
        throw new PdfSummarizerError("PDF file is empty", "EMPTY_FILE");
    }
    console.log(`✅ PDF file validation passed`);
}
//# sourceMappingURL=pdf-extractor.js.map