@lobehub/chat
Version:
Lobe Chat - an open-source, high-performance chatbot framework that supports speech synthesis, multimodal, and extensible Function Call plugin system. Supports one-click free deployment of your private ChatGPT/LLM web application.
169 lines (144 loc) • 5.95 kB
text/typescript
import debug from 'debug';
import { readFile } from 'node:fs/promises';
import type { PDFDocumentProxy, PDFPageProxy } from 'pdfjs-dist';
import { getDocument, version } from 'pdfjs-dist/legacy/build/pdf.mjs';
// @ts-ignore
import * as _pdfjsWorker from 'pdfjs-dist/legacy/build/pdf.worker.mjs';
import type { TextContent } from 'pdfjs-dist/types/src/display/api';
import type { DocumentPage, FileLoaderInterface } from '../../types';
import { promptTemplate } from './prompt';
const log = debug('file-loaders:pdf');
/**
* Loads PDF files page by page using the official pdfjs-dist library.
*/
export class PdfLoader implements FileLoaderInterface {
private pdfInstance: PDFDocumentProxy | null = null;
private pdfjsWorker = _pdfjsWorker;
private async getPDFFile(filePath: string) {
// GlobalWorkerOptions.workerSrc should have been set at the module level.
// We are now relying on pdfjs-dist to use this path when it creates a worker.
log('Reading PDF file:', filePath);
const dataBuffer = await readFile(filePath);
log('PDF file read successfully, size:', dataBuffer.length, 'bytes');
const loadingTask = getDocument({
data: new Uint8Array(dataBuffer.buffer, dataBuffer.byteOffset, dataBuffer.length),
useSystemFonts: true,
});
log('PDF document loading task created');
const pdf = await loadingTask.promise;
log('PDF document loaded successfully, pages:', pdf.numPages);
return pdf;
}
async loadPages(filePath: string): Promise<DocumentPage[]> {
log('Starting to load PDF pages from:', filePath);
try {
const pdf: PDFDocumentProxy = await this.getPDFFile(filePath);
const pages: DocumentPage[] = [];
log(`Processing ${pdf.numPages} PDF pages`);
for (let i = 1; i <= pdf.numPages; i += 1) {
log(`Loading page ${i}/${pdf.numPages}`);
const page: PDFPageProxy = await pdf.getPage(i);
const content: TextContent = await page.getTextContent();
log(`Page ${i} text content retrieved, items:`, content.items.length);
// --- Revert to EXACT Simple Langchain PDFLoader Logic ---
let lastY;
const textItems = [];
for (const item of content.items) {
// Ensure 'str' exists and potentially filter empty strings if needed, though Langchain's snippet doesn't explicitly filter empties
if ('str' in item) {
if (lastY === item.transform[5] || !lastY) {
// Exact check from Langchain
textItems.push(item.str);
} else {
// Exact else from Langchain for Y change
textItems.push(`\n${item.str}`);
}
// Update lastY, Langchain's snippet doesn't use destructuring here
lastY = item.transform[5];
}
}
const pageText = textItems.join(''); // Join with empty separator
// --- End Revert to Simple Langchain Logic ---
// Clean the final text (keep null character removal)
const cleanedPageContent = pageText.replaceAll('\0', '');
// Calculate stats based on the final content
const pageLines = cleanedPageContent.split('\n');
const lineCount = pageLines.length;
const charCount = cleanedPageContent.length;
log(`Page ${i} processed, lines: ${lineCount}, chars: ${charCount}`);
pages.push({
charCount,
lineCount,
metadata: { pageNumber: i },
pageContent: cleanedPageContent,
});
// Clean up page resources
log(`Cleaning up page ${i} resources`);
page.cleanup();
}
// Clean up document resources
log('Cleaning up PDF document resources');
await pdf.destroy();
log(`PDF loading completed for ${filePath}, total pages:`, pages.length);
return pages;
} catch (e) {
const error = e as Error;
log('Error encountered while loading PDF file');
console.error(
`Error loading PDF file ${filePath} using pdfjs-dist: ${error.message}`,
error.stack,
);
const errorPage: DocumentPage = {
charCount: 0,
lineCount: 0,
metadata: {
error: `Failed to load or parse PDF file: ${error.message}`,
filePath: filePath,
},
pageContent: '',
};
log('Created error page for failed PDF loading');
return [errorPage];
}
}
/**
* Aggregates content from PDF pages.
* Uses double newline as a separator.
* @param pages Array of DocumentPage objects.
* @returns Aggregated content as a string.
*/
async aggregateContent(pages: DocumentPage[]): Promise<string> {
log('Aggregating content from', pages.length, 'PDF pages');
const validPages = pages.filter((page) => !page.metadata.error);
log(
`Found ${validPages.length} valid pages for aggregation (${pages.length - validPages.length} pages with errors filtered out)`,
);
const result = promptTemplate(validPages);
log('PDF content aggregated successfully, length:', result.length);
return result;
}
async attachDocumentMetadata(filePath: string): Promise<any> {
log('Attaching document metadata for PDF:', filePath);
const pdf: PDFDocumentProxy = await this.getPDFFile(filePath);
log('Getting PDF metadata');
const pdfMetadata =
(await pdf.getMetadata().catch((err) => {
log('Error retrieving PDF metadata');
console.error(`Error getting PDF metadata: ${err.message}`);
return null;
})) ?? null;
const pdfInfo = pdfMetadata?.info ?? {};
const metadata = pdfMetadata?.metadata ?? null;
log('PDF metadata retrieved:', {
hasInfo: !!Object.keys(pdfInfo).length,
hasMetadata: !!metadata,
});
return {
pdfInfo: pdfInfo,
// PDF info (Author, Title, etc.)
pdfMetadata: metadata,
// PDF metadata
pdfVersion: version,
};
}
}