file2md
Version:
A TypeScript library for converting various document types (PDF, DOCX, XLSX, PPTX, HWP, HWPX) into Markdown with image and layout preservation
207 lines • 8.31 kB
JavaScript
import { promises as fs } from 'node:fs';
import { Buffer } from 'node:buffer';
import fileType from 'file-type';
import { ImageExtractor } from './utils/image-extractor.js';
import { ChartExtractor } from './utils/chart-extractor.js';
import { parsePdf } from './parsers/pdf-parser.js';
import { parseDocx } from './parsers/docx-parser.js';
import { parseXlsx } from './parsers/xlsx-parser.js';
import { parsePptx } from './parsers/pptx-parser.js';
import { parseHwp } from './parsers/hwp-parser.js';
import { FileNotFoundError, UnsupportedFormatError, InvalidFileError, SUPPORTED_MIME_TYPES } from './types/index.js';
/**
 * Classify a buffer as HWP or HWPX by inspecting its leading magic bytes.
 *
 * The 8-byte CFB/OLE2 header is reported as `'hwp'` and the 4-byte ZIP
 * local-file header as `'hwpx'`. Only the container signature is examined;
 * nothing inside the container is inspected.
 *
 * @param {Buffer} buffer - Raw document bytes
 * @returns {'hwp' | 'hwpx' | 'unknown'} The detected format, or `'unknown'`
 *   when the buffer is shorter than 4 bytes or matches neither signature
 */
function detectHwpFormat(buffer) {
    // Ordered list of [format, magic bytes]; the first matching entry wins.
    const signatures = [
        ['hwp', [0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1]],
        ['hwpx', [0x50, 0x4B, 0x03, 0x04]],
    ];
    // Anything shorter than the shortest signature cannot match.
    if (buffer.length < 4) {
        return 'unknown';
    }
    for (const [format, magicBytes] of signatures) {
        const magic = Buffer.from(magicBytes);
        // Skip signatures longer than the data (a 4-7 byte buffer cannot be CFB).
        if (buffer.length >= magic.length && buffer.subarray(0, magic.length).equals(magic)) {
            return format;
        }
    }
    return 'unknown';
}
/**
 * Resolve the conversion input to a Buffer.
 *
 * A string is treated as a file path and read from disk; a Buffer is returned
 * unchanged; anything else is rejected.
 *
 * @param input - File path (string) or Buffer
 * @returns Promise resolving to the document bytes
 * @throws {FileNotFoundError} When reading the path fails with `ENOENT`
 * @throws {InvalidFileError} When the read fails for another reason, or the
 *   input is neither a string nor a Buffer
 */
async function loadDocumentBuffer(input) {
    if (Buffer.isBuffer(input)) {
        return input;
    }
    if (typeof input !== 'string') {
        throw new InvalidFileError('Input must be a file path (string) or Buffer');
    }
    try {
        return await fs.readFile(input);
    }
    catch (readError) {
        if (readError?.code === 'ENOENT') {
            throw new FileNotFoundError(input);
        }
        throw new InvalidFileError(`Failed to read file: ${input}`, readError);
    }
}
/**
 * Determine the document type of a buffer.
 *
 * Uses `fileType.fromBuffer` first. When that yields nothing, or only a
 * generic CFB/ZIP container type, the result of `detectHwpFormat` overrides
 * `ext` and `mime` if it recognises the signature.
 *
 * @param buffer - Document bytes
 * @returns Promise resolving to an object carrying at least `ext` and `mime`
 * @throws {UnsupportedFormatError} When neither detector recognises the data
 */
async function identifyDocumentType(buffer) {
    const sniffed = await fileType.fromBuffer(buffer);
    const needsSignatureCheck = !sniffed
        || sniffed.mime === 'application/x-cfb'
        || sniffed.mime === 'application/zip';
    if (!needsSignatureCheck) {
        return sniffed;
    }
    const hwpVariant = detectHwpFormat(buffer);
    if (hwpVariant !== 'unknown') {
        return {
            ...sniffed,
            ext: hwpVariant,
            mime: `application/x-${hwpVariant}`
        };
    }
    if (!sniffed) {
        throw new UnsupportedFormatError('unknown');
    }
    // Generic CFB/ZIP that the signature check did not claim: keep the original detection.
    return sniffed;
}
/**
 * Convert a document (PDF, DOCX, XLSX, PPTX, HWP, HWPX) to Markdown.
 *
 * The input is loaded, its type is detected, the matching parser is invoked,
 * and the parser output is returned together with summary metadata
 * (file type, MIME type, page/image/chart counts, elapsed milliseconds and
 * any parser-supplied metadata).
 *
 * @param input - File path (string) or Buffer containing the document data
 * @param options - Conversion options:
 *   `imageDir` (default `'images'`) is handed to the ImageExtractor;
 *   `preserveLayout`, `extractCharts` and `extractImages` (all default `true`)
 *   and `maxPages` are forwarded to the parsers that accept them
 *   (`maxPages` only to the PDF parser)
 * @returns Promise resolving to `{ markdown, images, charts, metadata }`
 *
 * @throws {FileNotFoundError} When the file path does not exist
 * @throws {UnsupportedFormatError} When the detected format is not supported
 * @throws {InvalidFileError} When the input cannot be read or is of the wrong
 *   kind; any other error raised during conversion is also wrapped in an
 *   InvalidFileError with the message prefixed by "Conversion failed: "
 *
 * @example
 * ```typescript
 * // Convert from file path
 * const fromPath = await convert('./document.pdf');
 * console.log(fromPath.markdown);
 *
 * // Convert from buffer with options
 * const buffer = await fs.readFile('./document.docx');
 * const fromBuffer = await convert(buffer, {
 *   imageDir: 'extracted-images',
 *   preserveLayout: true
 * });
 * ```
 */
export async function convert(input, options = {}) {
    const startedAt = Date.now();
    try {
        const buffer = await loadDocumentBuffer(input);
        const docType = await identifyDocumentType(buffer);
        // Reject anything outside the supported MIME list before building extractors.
        if (!Object.values(SUPPORTED_MIME_TYPES).includes(docType.mime)) {
            throw new UnsupportedFormatError(docType.mime);
        }
        const { imageDir = 'images', preserveLayout = true, extractCharts = true, extractImages = true, maxPages } = options;
        const imageExtractor = new ImageExtractor(imageDir);
        const chartExtractor = new ChartExtractor(imageExtractor);
        // Defaults for fields a given parser does not report.
        const outcome = {
            markdown: undefined,
            images: [],
            charts: [],
            pageCount: 1,
            additional: {}
        };
        const mime = docType.mime;
        if (mime === SUPPORTED_MIME_TYPES.PDF) {
            const parsed = await parsePdf(buffer, imageExtractor, { maxPages, preserveLayout });
            outcome.markdown = parsed.markdown;
            outcome.images = parsed.images || [];
            outcome.pageCount = parsed.pageCount || 1;
            outcome.additional = parsed.metadata || {};
        }
        else if (mime === SUPPORTED_MIME_TYPES.DOCX) {
            const parsed = await parseDocx(buffer, imageExtractor, chartExtractor, { preserveLayout, extractImages, extractCharts });
            outcome.markdown = parsed.markdown;
            outcome.images = parsed.images || [];
            outcome.charts = parsed.charts || [];
            outcome.additional = parsed.metadata || {};
        }
        else if (mime === SUPPORTED_MIME_TYPES.XLSX) {
            const parsed = await parseXlsx(buffer, imageExtractor, chartExtractor, { preserveLayout, extractCharts });
            outcome.markdown = parsed.markdown;
            outcome.charts = parsed.charts || [];
            // Sheets stand in for pages in the reported count.
            outcome.pageCount = parsed.sheetCount || 1;
            outcome.additional = parsed.metadata || {};
        }
        else if (mime === SUPPORTED_MIME_TYPES.PPTX) {
            const parsed = await parsePptx(buffer, imageExtractor, chartExtractor, {
                preserveLayout,
                extractImages,
                extractCharts
            });
            outcome.markdown = parsed.markdown;
            outcome.images = parsed.images || [];
            outcome.charts = parsed.charts || [];
            // Slides stand in for pages in the reported count.
            outcome.pageCount = parsed.slideCount || 1;
            outcome.additional = parsed.metadata || {};
        }
        else if (mime === SUPPORTED_MIME_TYPES.HWP || mime === SUPPORTED_MIME_TYPES.HWPX) {
            const parsed = await parseHwp(buffer, imageExtractor, chartExtractor, {
                preserveLayout,
                extractImages,
                extractCharts
            });
            outcome.markdown = parsed.markdown;
            outcome.images = parsed.images || [];
            outcome.charts = parsed.charts || [];
            outcome.pageCount = 1; // Single document
            outcome.additional = parsed.metadata || {};
        }
        else {
            // Not expected after the validation above; kept as a safety net.
            throw new UnsupportedFormatError(mime);
        }
        const finishedAt = Date.now();
        const { markdown, images, charts } = outcome;
        return {
            markdown,
            images,
            charts,
            metadata: {
                fileType: docType.ext.toUpperCase(),
                mimeType: docType.mime,
                pageCount: outcome.pageCount,
                imageCount: images.length,
                chartCount: charts.length,
                processingTime: finishedAt - startedAt,
                additional: outcome.additional
            }
        };
    }
    catch (failure) {
        // Errors of the library's own known kinds pass through untouched.
        const isKnownError = failure instanceof FileNotFoundError
            || failure instanceof UnsupportedFormatError
            || failure instanceof InvalidFileError;
        if (isKnownError) {
            throw failure;
        }
        // Everything else is wrapped so callers see a single error family.
        const reason = failure instanceof Error ? failure.message : 'Unknown conversion error';
        throw new InvalidFileError(`Conversion failed: ${reason}`, failure);
    }
}
// Export utility classes for advanced usage
export { ImageExtractor } from './utils/image-extractor.js';
export { ChartExtractor } from './utils/chart-extractor.js';
export { LayoutParser } from './utils/layout-parser.js';
//# sourceMappingURL=index.js.map