@lobehub/chat
Version:
Lobe Chat - an open-source, high-performance chatbot framework that supports speech synthesis, multimodal input, and an extensible Function Call plugin system. Supports one-click free deployment of your private ChatGPT/LLM web application.
85 lines (73 loc) • 2.66 kB
text/typescript
import debug from 'debug';
import fs from 'node:fs/promises';
import mammoth from 'mammoth';
import type { DocumentPage, FileLoaderInterface } from '../../types';
// Debug logger for this loader, created under the 'file-loaders:docx' namespace.
const log = debug('file-loaders:docx');
/**
 * Loads Word documents (.docx) using the mammoth library.
 *
 * The document's raw text is extracted and returned as a single page
 * (`metadata.pageNumber` is always 1), together with its line and character counts.
 */
export class DocxLoader implements FileLoaderInterface {
  /**
   * Reads the DOCX file at `filePath` and extracts its raw text.
   *
   * This method does not reject on read/parse failures: any error raised while
   * loading is caught, reported via `console.error`, and returned as a single
   * empty page whose `metadata.error` carries the failure message.
   *
   * @param filePath Path of the .docx file to load.
   * @returns A one-element array holding either the extracted page or an error page.
   */
  async loadPages(filePath: string): Promise<DocumentPage[]> {
    log('Loading DOCX file:', filePath);

    try {
      // Read file as buffer
      const buffer = await fs.readFile(filePath);
      log('File buffer read, size:', buffer.length);

      // Extract text using mammoth
      const result = await mammoth.extractRawText({ buffer });
      const pageContent = result.value;
      log('Text extracted, length:', pageContent.length);

      // `split` always yields at least one element, so an empty document counts as 1 line.
      const lineCount = pageContent.split('\n').length;
      const charCount = pageContent.length;
      log('DOCX document processed, lines:', lineCount, 'chars:', charCount);

      // Create single page with extracted content
      const page: DocumentPage = {
        charCount,
        lineCount,
        metadata: {
          pageNumber: 1,
        },
        pageContent,
      };

      // Surface extraction warnings in the debug log; they do not affect the result.
      const warnings = result.messages.filter((msg) => msg.type === 'warning');
      if (warnings.length > 0) {
        log('Extraction warnings:', warnings.length);
        for (const warning of warnings) {
          log('Warning:', warning.message);
        }
      }

      log('DOCX loading completed');
      return [page];
    } catch (e) {
      // Anything can be thrown, so narrow instead of asserting `Error`; otherwise a
      // non-Error value would be reported as "undefined".
      const message = e instanceof Error ? e.message : String(e);

      log('Error encountered while loading DOCX file');
      console.error(`Error loading DOCX file ${filePath}: ${message}`);

      const errorPage: DocumentPage = {
        charCount: 0,
        lineCount: 0,
        metadata: {
          error: `Failed to load DOCX file: ${message}`,
        },
        pageContent: '',
      };

      log('Created error page for failed DOCX loading');
      return [errorPage];
    }
  }

  /**
   * Aggregates content from DOCX pages.
   * Joins each page's `pageContent` using a double newline as the separator.
   *
   * @param pages Array of DocumentPage objects.
   * @returns Aggregated content as a string (empty when `pages` is empty).
   */
  async aggregateContent(pages: DocumentPage[]): Promise<string> {
    log('Aggregating content from', pages.length, 'DOCX pages');
    const result = pages.map((page) => page.pageContent).join('\n\n');
    log('DOCX content aggregated successfully, length:', result.length);
    return result;
  }
}