UNPKG

@hivellm/transmutation-lite

Version:

Simplified document converter for common formats (PDF, DOCX, XLSX, PPTX) to Markdown

1,205 lines (1,193 loc) 35.8 kB
#!/usr/bin/env node
import { Command } from 'commander';
import { mkdir, writeFile, readdir, stat, readFile } from 'fs/promises';
import { extname, basename, dirname, join, relative } from 'path';
import { createRequire } from 'module';
import mammoth from 'mammoth';
import * as XLSX from 'xlsx';
import TurndownService from 'turndown';
import { createHash } from 'crypto';

// src/converters/base.ts

/**
 * Base class for all format converters. Provides the helpers that build the
 * metadata object and the final conversion result.
 */
class BaseConverter {
  /**
   * Create base metadata object.
   *
   * @param {string} format - Document format identifier (e.g. "pdf").
   * @param {number} fileSize - Size of the input buffer in bytes.
   * @param {object} [extra] - Optional fields: pageCount, title, author,
   *   createdAt and a free-form `extra` object.
   * @returns {object} Metadata record; absent optional fields are `undefined`.
   */
  createMetadata(format, fileSize, extra) {
    return {
      format,
      fileSize,
      pageCount: extra?.pageCount,
      title: extra?.title,
      author: extra?.author,
      createdAt: extra?.createdAt,
      extra: extra?.extra,
    };
  }

  /**
   * Create conversion result with timing.
   *
   * @param {string} markdown - Converted Markdown text.
   * @param {object} metadata - Metadata produced by `createMetadata`.
   * @param {number} startTime - `Date.now()` value captured when conversion began.
   * @param {string[]} [warnings] - Non-fatal messages collected during conversion.
   * @returns {object} `{ markdown, metadata, conversionTimeMs, warnings }`.
   */
  createResult(markdown, metadata, startTime, warnings) {
    return {
      markdown,
      metadata,
      conversionTimeMs: Date.now() - startTime,
      warnings,
    };
  }
}

// src/types.ts

/** Identifiers of the document formats known to the converter. */
const DocumentFormat = Object.freeze({
  PDF: "pdf",
  DOCX: "docx",
  XLSX: "xlsx",
  PPTX: "pptx",
  TXT: "txt",
  HTML: "html",
  UNKNOWN: "unknown",
});

/**
 * Error raised for validation and conversion failures.
 * Carries the affected format (when known) and the underlying cause.
 */
class ConversionError extends Error {
  /**
   * @param {string} message - Human-readable description.
   * @param {string} [format] - Format being converted when the error occurred.
   * @param {Error} [cause] - Underlying error, if any.
   */
  constructor(message, format, cause) {
    super(message);
    this.format = format;
    this.cause = cause;
    this.name = "ConversionError";
  }
}

// src/converters/pdf.ts
const require2 = createRequire(import.meta.url);
const pdfParse = require2("pdf-parse-new");

/** Markdown horizontal rule used between pages, sheets and similar sections. */
const SECTION_SEPARATOR = "\n\n---\n\n";

/**
 * Parse a PDF date string ("D:YYYYMMDDHHmmSSOHH'mm'", every field after the
 * year optional) into a Date. Values without the "D:" prefix are handed to
 * the Date constructor.
 *
 * @param {unknown} value - Raw value from the PDF info dictionary.
 * @returns {Date|undefined} A valid Date, or `undefined` when the value cannot
 *   be parsed (instead of leaking an "Invalid Date" into the metadata).
 */
function parsePdfDate(value) {
  if (value instanceof Date) {
    return Number.isNaN(value.getTime()) ? undefined : value;
  }
  if (typeof value !== "string" || value.trim() === "") {
    return undefined;
  }
  const raw = value.trim();
  const match =
    /^D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?(?:([Zz+-])(?:(\d{2})'?(\d{2})?'?)?)?/.exec(
      raw
    );
  if (match) {
    const [
      ,
      year,
      month = "01",
      day = "01",
      hour = "00",
      minute = "00",
      second = "00",
      sign,
      tzHour = "00",
      tzMinute = "00",
    ] = match;
    let utcMs = Date.UTC(
      Number(year),
      Number(month) - 1,
      Number(day),
      Number(hour),
      Number(minute),
      Number(second)
    );
    // A "+HH'mm'" offset means local time is ahead of UTC, so subtract it.
    // Strings without an offset are interpreted as UTC.
    if (sign === "+" || sign === "-") {
      const offsetMs = (Number(tzHour) * 60 + Number(tzMinute)) * 60000;
      utcMs += sign === "+" ? -offsetMs : offsetMs;
    }
    const parsed = new Date(utcMs);
    if (!Number.isNaN(parsed.getTime())) {
      return parsed;
    }
  }
  const fallback = new Date(raw);
  return Number.isNaN(fallback.getTime()) ? undefined : fallback;
}

/** Converts PDF buffers to Markdown text using `pdf-parse-new`. */
class PdfConverter extends BaseConverter {
  getFormat() {
    return "pdf" /* PDF */;
  }

  /**
   * @param {string} extension - File extension without the leading dot.
   * @returns {boolean} True for "pdf" (case-insensitive).
   */
  canHandle(extension) {
    return extension.toLowerCase() === "pdf";
  }

  /**
   * Convert a PDF buffer to Markdown.
   *
   * @param {Buffer} buffer - PDF file contents.
   * @param {object} [options] - `maxPages` limits the number of pages kept;
   *   `preserveFormatting: false` skips whitespace cleanup.
   * @returns {Promise<object>} Conversion result.
   * @throws {ConversionError} When parsing fails.
   */
  async convert(buffer, options) {
    const startTime = Date.now();
    const warnings = [];
    try {
      const data = await pdfParse(buffer);
      let text = data.text;

      // Apply the page limit BEFORE cleanup: cleanupText() replaces every
      // form feed, so splitting afterwards would always see a single page.
      // NOTE(review): this relies on the parser separating pages with "\f".
      if (options?.maxPages && options.maxPages > 0) {
        const pages = text.split("\f");
        if (pages.length > options.maxPages) {
          warnings.push(
            `Only processed ${options.maxPages} of ${pages.length} pages`
          );
        }
        text = pages.slice(0, options.maxPages).join(SECTION_SEPARATOR);
      }

      if (options?.preserveFormatting !== false) {
        text = this.cleanupText(text);
      }

      const metadata = this.createMetadata("pdf" /* PDF */, buffer.length, {
        pageCount: data.numpages,
        title: data.info?.Title,
        author: data.info?.Author,
        createdAt: parsePdfDate(data.info?.CreationDate),
        extra: {
          producer: data.info?.Producer,
          creator: data.info?.Creator,
        },
      });
      return this.createResult(text, metadata, startTime, warnings);
    } catch (error) {
      throw new ConversionError(
        `Failed to convert PDF: ${error instanceof Error ? error.message : "Unknown error"}`,
        "pdf" /* PDF */,
        error instanceof Error ? error : void 0
      );
    }
  }

  /**
   * Collapse runs of spaces/tabs, limit blank lines to one, and turn form
   * feeds into Markdown horizontal rules.
   *
   * @param {string} text - Raw extracted text.
   * @returns {string} Cleaned, trimmed text.
   */
  cleanupText(text) {
    text = text.replace(/[ \t]+/g, " ");
    text = text.replace(/\n{3,}/g, "\n\n");
    text = text.replace(/\f/g, SECTION_SEPARATOR);
    return text.trim();
  }
}

/** Converts DOCX buffers to Markdown using `mammoth`. */
class DocxConverter extends BaseConverter {
  getFormat() {
    return "docx" /* DOCX */;
  }

  /**
   * @param {string} extension - File extension without the leading dot.
   * @returns {boolean} True for "docx" (case-insensitive).
   */
  canHandle(extension) {
    return extension.toLowerCase() === "docx";
  }

  /**
   * Convert a DOCX buffer to Markdown. Messages reported by mammoth are
   * surfaced as warnings in the form "type: message".
   *
   * @param {Buffer} buffer - DOCX file contents.
   * @param {object} [options] - `preserveFormatting: false` skips cleanup.
   * @returns {Promise<object>} Conversion result.
   * @throws {ConversionError} When mammoth fails.
   */
  async convert(buffer, options) {
    const startTime = Date.now();
    const warnings = [];
    try {
      const result = await mammoth.convertToMarkdown(
        { buffer },
        {
          styleMap: [
            "p[style-name='Heading 1'] => # :fresh",
            "p[style-name='Heading 2'] => ## :fresh",
            "p[style-name='Heading 3'] => ### :fresh",
            "p[style-name='Heading 4'] => #### :fresh",
            "p[style-name='Heading 5'] => ##### :fresh",
            "p[style-name='Heading 6'] => ###### :fresh",
          ],
        }
      );
      let markdown = result.value;
      if (result.messages.length > 0) {
        warnings.push(
          ...result.messages.map((msg) => `${msg.type}: ${msg.message}`)
        );
      }
      if (options?.preserveFormatting !== false) {
        markdown = this.cleanupMarkdown(markdown);
      }
      const metadata = this.createMetadata("docx" /* DOCX */, buffer.length, {
        extra: { warnings: result.messages },
      });
      return this.createResult(markdown, metadata, startTime, warnings);
    } catch (error) {
      throw new ConversionError(
        `Failed to convert DOCX: ${error instanceof Error ? error.message : "Unknown error"}`,
        "docx" /* DOCX */,
        error instanceof Error ? error : void 0
      );
    }
  }

  /**
   * Limit consecutive blank lines to one and trim the result.
   *
   * @param {string} markdown
   * @returns {string}
   */
  cleanupMarkdown(markdown) {
    return markdown.replace(/\n{3,}/g, "\n\n").trim();
  }
}

/** Converts XLSX/XLS workbooks to Markdown tables, one section per sheet. */
class XlsxConverter extends BaseConverter {
  getFormat() {
    return "xlsx" /* XLSX */;
  }

  /**
   * @param {string} extension - File extension without the leading dot.
   * @returns {boolean} True for "xlsx" or "xls" (case-insensitive).
   */
  canHandle(extension) {
    const ext = extension.toLowerCase();
    return ext === "xlsx" || ext === "xls";
  }

  /**
   * Convert a workbook buffer to Markdown.
   *
   * @param {Buffer} buffer - Workbook contents.
   * @param {object} [options] - `maxPages` limits the number of sheets.
   * @returns {Promise<object>} Conversion result; `pageCount` is the total
   *   number of sheets in the workbook.
   * @throws {ConversionError} When the workbook cannot be read.
   */
  async convert(buffer, options) {
    const startTime = Date.now();
    const warnings = [];
    try {
      const workbook = XLSX.read(buffer, { type: "buffer" });
      const totalSheets = workbook.SheetNames.length;
      const sheetsToProcess =
        options?.maxPages && options.maxPages > 0
          ? Math.min(options.maxPages, totalSheets)
          : totalSheets;
      if (sheetsToProcess < totalSheets) {
        warnings.push(
          `Only processed ${sheetsToProcess} of ${totalSheets} sheets`
        );
      }

      let markdown = "";
      for (let i = 0; i < sheetsToProcess; i++) {
        const sheetName = workbook.SheetNames[i];
        const sheet = workbook.Sheets[sheetName];
        if (i > 0) {
          markdown += SECTION_SEPARATOR;
        }
        // The heading must end with a blank line, otherwise the first table
        // row is rendered as part of the heading.
        markdown += `# ${sheetName}\n\n`;
        const csvData = XLSX.utils.sheet_to_csv(sheet);
        if (csvData.trim()) {
          markdown += this.csvToMarkdownTable(csvData);
        } else {
          markdown += "*Empty sheet*\n";
          warnings.push(`Sheet "${sheetName}" is empty`);
        }
      }

      const metadata = this.createMetadata("xlsx" /* XLSX */, buffer.length, {
        pageCount: totalSheets,
        extra: {
          sheetNames: workbook.SheetNames,
          processedSheets: sheetsToProcess,
        },
      });
      return this.createResult(markdown.trim(), metadata, startTime, warnings);
    } catch (error) {
      throw new ConversionError(
        `Failed to convert XLSX: ${error instanceof Error ? error.message : "Unknown error"}`,
        "xlsx" /* XLSX */,
        error instanceof Error ? error : void 0
      );
    }
  }

  /**
   * Render CSV text as a Markdown table. The first record becomes the header
   * row; shorter records are padded with empty cells.
   *
   * @param {string} csv - CSV text.
   * @returns {string} Markdown table, or "*No data*\n" when there are no rows.
   */
  csvToMarkdownTable(csv) {
    const records = this.splitCsvRecords(csv).filter((record) =>
      record.trim()
    );
    if (records.length === 0) {
      return "*No data*\n";
    }
    const rows = records.map((record) => this.parseCsvLine(record));
    // reduce() instead of Math.max(...spread): spreading a very large sheet
    // can exceed the engine's argument limit.
    const maxCols = rows.reduce((max, row) => Math.max(max, row.length), 0);
    const toRow = (cells) => {
      const padded = [...cells];
      while (padded.length < maxCols) {
        padded.push("");
      }
      return "| " + padded.map((cell) => this.escapeTableCell(cell)).join(" | ") + " |\n";
    };

    let markdown = toRow(rows[0]);
    markdown += "| " + Array.from({ length: maxCols }, () => "---").join(" | ") + " |\n";
    for (let i = 1; i < rows.length; i++) {
      markdown += toRow(rows[i]);
    }
    return markdown;
  }

  /**
   * Split CSV text into records on newlines that are outside quoted fields,
   * so a quoted multi-line cell stays within one record.
   *
   * @param {string} csv
   * @returns {string[]} Raw record strings (still quoted).
   */
  splitCsvRecords(csv) {
    const records = [];
    let start = 0;
    let inQuotes = false;
    for (let i = 0; i < csv.length; i++) {
      const char = csv[i];
      if (char === '"') {
        // An escaped quote ("") toggles twice and leaves the state unchanged.
        inQuotes = !inQuotes;
      } else if (char === "\n" && !inQuotes) {
        records.push(csv.slice(start, i));
        start = i + 1;
      }
    }
    records.push(csv.slice(start));
    return records;
  }

  /**
   * Make a cell value safe for a Markdown table: escape pipes and replace
   * embedded line breaks, both of which would otherwise break the row.
   *
   * @param {string} cell
   * @returns {string}
   */
  escapeTableCell(cell) {
    return cell.replace(/\|/g, "\\|").replace(/\r?\n/g, "<br>");
  }

  /**
   * Parse one CSV record into trimmed cell values, honouring double-quoted
   * fields and doubled quotes ("") as an escaped quote.
   *
   * @param {string} line - One CSV record.
   * @returns {string[]} Cell values.
   */
  parseCsvLine(line) {
    const cells = [];
    let cell = "";
    let inQuotes = false;
    for (let i = 0; i < line.length; i++) {
      const char = line[i];
      if (char === '"') {
        if (inQuotes && line[i + 1] === '"') {
          cell += '"';
          i++;
        } else {
          inQuotes = !inQuotes;
        }
      } else if (char === "," && !inQuotes) {
        cells.push(cell.trim());
        cell = "";
      } else {
        cell += char;
      }
    }
    cells.push(cell.trim());
    return cells;
  }
}

// src/converters/pptx.ts

/** Matches slide parts inside a PPTX archive and captures the slide number. */
const SLIDE_PATH_PATTERN = /^ppt\/slides\/slide(\d+)\.xml$/;

/**
 * Decode the predefined XML entities and numeric character references in a
 * single pass (so "&amp;lt;" becomes "&lt;", not "<").
 *
 * @param {string} text - Text taken from XML character data.
 * @returns {string} Decoded text; unrecognised references are left as-is.
 */
function decodeXmlEntities(text) {
  const named = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
  return text.replace(
    /&(#x[0-9a-fA-F]+|#\d+|amp|lt|gt|quot|apos);/g,
    (match, entity) => {
      if (entity[0] !== "#") {
        return named[entity];
      }
      const codePoint =
        entity[1] === "x"
          ? Number.parseInt(entity.slice(2), 16)
          : Number.parseInt(entity.slice(1), 10);
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
    }
  );
}

/** Extracts slide text from PPTX archives (basic text extraction only). */
class PptxConverter extends BaseConverter {
  getFormat() {
    return "pptx" /* PPTX */;
  }

  /**
   * @param {string} extension - File extension without the leading dot.
   * @returns {boolean} True for "pptx" or "ppt" (case-insensitive).
   */
  canHandle(extension) {
    const ext = extension.toLowerCase();
    return ext === "pptx" || ext === "ppt";
  }

  /**
   * Convert a presentation buffer to Markdown. Always adds a warning that
   * the extraction is simplified.
   *
   * @param {Buffer} buffer - Presentation contents.
   * @param {object} [_options] - Unused.
   * @returns {Promise<object>} Conversion result.
   * @throws {ConversionError} When the archive cannot be read.
   */
  async convert(buffer, _options) {
    const startTime = Date.now();
    const warnings = [
      "PPTX conversion is simplified and may not extract all content",
    ];
    try {
      const text = await this.extractTextFromPptx(buffer);
      const metadata = this.createMetadata("pptx" /* PPTX */, buffer.length, {
        extra: { note: "Basic text extraction only" },
      });
      return this.createResult(text, metadata, startTime, warnings);
    } catch (error) {
      throw new ConversionError(
        `Failed to convert PPTX: ${error instanceof Error ? error.message : "Unknown error"}`,
        "pptx" /* PPTX */,
        error instanceof Error ? error : void 0
      );
    }
  }

  /**
   * Unzip the presentation and collect the `<a:t>` text runs of each slide,
   * in slide-number order. Slides without text runs are omitted.
   *
   * @param {Buffer} buffer - Presentation contents.
   * @returns {Promise<string>} Markdown with one "## Slide N" section per slide.
   */
  async extractTextFromPptx(buffer) {
    const JSZip = (await import('jszip')).default;
    const zip = await JSZip.loadAsync(buffer);
    let markdown = "# Presentation\n\n";
    markdown +=
      "*Note: This is a basic text extraction. For full PPTX support, use a dedicated parser.*\n\n";

    const slides = Object.keys(zip.files)
      .map((name) => ({ name, match: SLIDE_PATH_PATTERN.exec(name) }))
      .filter(({ match }) => match !== null)
      .map(({ name, match }) => ({ name, number: Number(match[1]) }))
      // Numeric order: a plain string sort would put slide10 before slide2.
      .sort((a, b) => a.number - b.number);

    if (slides.length === 0) {
      return markdown + "*No slides found*\n";
    }

    for (const slide of slides) {
      const content = await zip.files[slide.name].async("text");
      const texts = [...content.matchAll(/<a:t(?:\s[^>]*)?>([^<]*)<\/a:t>/g)]
        .map((match) => decodeXmlEntities(match[1]));
      if (texts.length > 0) {
        markdown += `## Slide ${slide.number}\n\n`;
        markdown += texts.filter((text) => text.trim()).join("\n\n") + "\n\n";
      }
    }
    return markdown.trim();
  }
}

// src/converters/txt.ts

/** Passes plain text / Markdown files through with light cleanup. */
class TxtConverter extends BaseConverter {
  getFormat() {
    return "txt" /* TXT */;
  }

  /**
   * @param {string} extension - File extension without the leading dot.
   * @returns {boolean} True for "txt", "text", "md" or "markdown".
   */
  canHandle(extension) {
    const ext = extension.toLowerCase();
    return ext === "txt" || ext === "text" || ext === "md" || ext === "markdown";
  }

  /**
   * Decode the buffer as UTF-8 and optionally normalise whitespace.
   *
   * @param {Buffer} buffer - Text file contents.
   * @param {object} [options] - `preserveFormatting: false` skips cleanup.
   * @returns {Promise<object>} Conversion result (no warnings array).
   * @throws {ConversionError} When decoding fails.
   */
  async convert(buffer, options) {
    const startTime = Date.now();
    try {
      let text = buffer.toString("utf-8");
      if (options?.preserveFormatting !== false) {
        text = this.cleanupText(text);
      }
      const metadata = this.createMetadata("txt" /* TXT */, buffer.length, {
        extra: { encoding: "utf-8" },
      });
      return this.createResult(text, metadata, startTime);
    } catch (error) {
      throw new ConversionError(
        `Failed to convert TXT: ${error instanceof Error ? error.message : "Unknown error"}`,
        "txt" /* TXT */,
        error instanceof Error ? error : void 0
      );
    }
  }

  /**
   * Normalise CRLF to LF, limit blank lines to one, and trim.
   *
   * @param {string} text
   * @returns {string}
   */
  cleanupText(text) {
    text = text.replace(/\r\n/g, "\n");
    text = text.replace(/\n{3,}/g, "\n\n");
    return text.trim();
  }
}

/** Converts HTML documents to Markdown using Turndown. */
class HtmlConverter extends BaseConverter {
  turndown;

  constructor() {
    super();
    this.turndown = new TurndownService({
      headingStyle: "atx",
      codeBlockStyle: "fenced",
      bulletListMarker: "-",
    });
  }

  getFormat() {
    return "html" /* HTML */;
  }

  /**
   * @param {string} extension - File extension without the leading dot.
   * @returns {boolean} True for "html" or "htm" (case-insensitive).
   */
  canHandle(extension) {
    const ext = extension.toLowerCase();
    return ext === "html" || ext === "htm";
  }

  /**
   * Convert an HTML buffer (decoded as UTF-8) to Markdown. The document
   * title, when present, is stored in the metadata.
   *
   * @param {Buffer} buffer - HTML file contents.
   * @param {object} [options] - `preserveFormatting: false` skips cleanup.
   * @returns {Promise<object>} Conversion result (no warnings array).
   * @throws {ConversionError} When conversion fails.
   */
  async convert(buffer, options) {
    const startTime = Date.now();
    try {
      const html = buffer.toString("utf-8");
      const bodyContent = this.extractBodyContent(html);
      let markdown = this.turndown.turndown(bodyContent);
      if (options?.preserveFormatting !== false) {
        markdown = this.cleanupMarkdown(markdown);
      }
      const metadata = this.createMetadata("html" /* HTML */, buffer.length, {
        title: this.extractTitle(html),
        extra: { encoding: "utf-8" },
      });
      return this.createResult(markdown, metadata, startTime);
    } catch (error) {
      throw new ConversionError(
        `Failed to convert HTML: ${error instanceof Error ? error.message : "Unknown error"}`,
        "html" /* HTML */,
        error instanceof Error ? error : void 0
      );
    }
  }

  /**
   * Limit consecutive blank lines to one and trim the result.
   *
   * @param {string} markdown
   * @returns {string}
   */
  cleanupMarkdown(markdown) {
    return markdown.replace(/\n{3,}/g, "\n\n").trim();
  }

  /**
   * @param {string} html
   * @returns {string|undefined} Trimmed contents of the first `<title>` element.
   */
  extractTitle(html) {
    const titleMatch = html.match(/<title>([^<]*)<\/title>/i);
    return titleMatch ? titleMatch[1].trim() : void 0;
  }

  /**
   * Remove `<script>` and `<style>` elements and return the contents of
   * `<body>`; when there is no body element the stripped document is returned.
   *
   * @param {string} html
   * @returns {string}
   */
  extractBodyContent(html) {
    let content = html.replace(
      /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi,
      ""
    );
    content = content.replace(
      /<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi,
      ""
    );
    const bodyMatch = content.match(/<body[^>]*>([\s\S]*)<\/body>/i);
    return bodyMatch ? bodyMatch[1] : content;
  }
}

/**
 * Serialise conversion options into a stable string for cache keys.
 * Keys are sorted and `undefined` values dropped, so `{}` and
 * `{ maxPages: undefined }` produce the same (empty) key.
 *
 * @param {object} [options]
 * @returns {string} Empty string when there is nothing to serialise.
 */
function serializeCacheOptions(options) {
  if (options === null || typeof options !== "object") {
    return "";
  }
  const entries = Object.entries(options)
    .filter(([, value]) => value !== undefined)
    .sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0));
  return entries.length > 0 ? JSON.stringify(entries) : "";
}

/**
 * In-memory cache of conversion results keyed by content hash, format and
 * conversion options. Entries are kept in recency order (Map insertion
 * order) and the least recently used entry is evicted when full.
 */
class ConversionCache {
  cache;
  maxSize;
  maxAge;

  /**
   * Create a new conversion cache
   *
   * @param maxSize Maximum number of entries (default: 100)
   * @param maxAge Maximum age in milliseconds (default: 1 hour)
   */
  constructor(maxSize = 100, maxAge = 36e5) {
    this.cache = /* @__PURE__ */ new Map();
    this.maxSize = maxSize;
    this.maxAge = maxAge;
  }

  /**
   * Generate a cache key from buffer content.
   *
   * @param {Buffer} buffer - Document contents (hashed with SHA-256).
   * @param {string} format - Document format.
   * @param {object} [options] - Conversion options; results produced with
   *   different options must not share an entry.
   * @returns {string} "format:hash" or "format:hash:options".
   */
  generateKey(buffer, format, options) {
    const hash = createHash("sha256").update(buffer).digest("hex");
    const optionsKey = serializeCacheOptions(options);
    return optionsKey ? `${format}:${hash}:${optionsKey}` : `${format}:${hash}`;
  }

  /**
   * Get cached result. Counts a hit and marks the entry as most recently
   * used; expired entries are removed.
   *
   * @returns {object|null} Cached result, or null when absent or expired.
   */
  get(buffer, format, options) {
    const key = this.generateKey(buffer, format, options);
    const entry = this.cache.get(key);
    if (!entry) {
      return null;
    }
    if (Date.now() - entry.timestamp > this.maxAge) {
      this.cache.delete(key);
      return null;
    }
    entry.hits++;
    // Re-insert to move the entry to the most-recent end of the Map.
    this.cache.delete(key);
    this.cache.set(key, entry);
    return entry.result;
  }

  /**
   * Store result in cache, evicting the least recently used entry when the
   * cache is full and the key is new.
   */
  set(buffer, format, result, options) {
    const key = this.generateKey(buffer, format, options);
    if (this.cache.has(key)) {
      // Replacing an entry does not grow the cache; no eviction needed.
      this.cache.delete(key);
    } else if (this.cache.size >= this.maxSize) {
      const oldestKey = this.cache.keys().next().value;
      if (oldestKey !== undefined) {
        this.cache.delete(oldestKey);
      }
    }
    this.cache.set(key, { result, timestamp: Date.now(), hits: 0 });
  }

  /**
   * Check if result is cached. Does not count as a hit and does not change
   * recency; an expired entry is removed.
   *
   * @returns {boolean}
   */
  has(buffer, format, options) {
    const key = this.generateKey(buffer, format, options);
    const entry = this.cache.get(key);
    if (!entry) {
      return false;
    }
    if (Date.now() - entry.timestamp > this.maxAge) {
      this.cache.delete(key);
      return false;
    }
    return true;
  }

  /**
   * Clear all cache entries
   */
  clear() {
    this.cache.clear();
  }

  /**
   * Clear expired entries
   */
  clearExpired() {
    const now = Date.now();
    for (const [key, entry] of this.cache.entries()) {
      if (now - entry.timestamp > this.maxAge) {
        this.cache.delete(key);
      }
    }
  }

  /**
   * Get cache statistics.
   *
   * @returns {object} `{ size, maxSize, hitRate, totalHits }` where `hitRate`
   *   is the average number of hits per stored entry.
   */
  getStats() {
    let totalHits = 0;
    for (const entry of this.cache.values()) {
      totalHits += entry.hits;
    }
    const hitRate = this.cache.size > 0 ? totalHits / this.cache.size : 0;
    return {
      size: this.cache.size,
      maxSize: this.maxSize,
      hitRate,
      totalHits,
    };
  }

  /**
   * Get cache size in bytes (approximate): two bytes per Markdown character
   * plus the length of the JSON-serialised metadata.
   */
  getMemoryUsage() {
    let total = 0;
    for (const entry of this.cache.values()) {
      total += entry.result.markdown.length * 2;
      total += JSON.stringify(entry.result.metadata).length;
    }
    return total;
  }
}

// src/logger.ts

/** Numeric log levels; a message is emitted when logger level <= its level. */
const LogLevel = Object.freeze({ DEBUG: 0, INFO: 1, WARN: 2, ERROR: 3 });

/** Minimal console logger with level filtering, prefix and optional timestamps. */
class Logger {
  level;
  prefix;
  timestamps;

  /**
   * @param {object} [options] - `level` (default WARN), `prefix`
   *   (default "[Transmutation]") and `timestamps` (default false).
   */
  constructor(options = {}) {
    this.level = options.level ?? LogLevel.WARN;
    this.prefix = options.prefix ?? "[Transmutation]";
    this.timestamps = options.timestamps ?? false;
  }

  /**
   * Set logging level
   */
  setLevel(level) {
    this.level = level;
  }

  /**
   * Get current logging level
   */
  getLevel() {
    return this.level;
  }

  /**
   * Format log message with prefix and timestamp
   */
  format(level, message) {
    const parts = [];
    if (this.timestamps) {
      parts.push(new Date().toISOString());
    }
    parts.push(this.prefix, `[${level}]`, message);
    return parts.join(" ");
  }

  /**
   * Log debug message
   */
  debug(message, ...args) {
    if (this.level <= LogLevel.DEBUG) {
      console.debug(this.format("DEBUG", message), ...args);
    }
  }

  /**
   * Log info message
   */
  info(message, ...args) {
    if (this.level <= LogLevel.INFO) {
      console.info(this.format("INFO", message), ...args);
    }
  }

  /**
   * Log warning message
   */
  warn(message, ...args) {
    if (this.level <= LogLevel.WARN) {
      console.warn(this.format("WARN", message), ...args);
    }
  }

  /**
   * Log error message. The error object is only printed when one is given,
   * so message-only calls do not end with a stray "undefined".
   */
  error(message, error, ...args) {
    if (this.level <= LogLevel.ERROR) {
      const details = error === undefined ? args : [error, ...args];
      console.error(this.format("ERROR", message), ...details);
    }
  }

  /**
   * Create a child logger with a different prefix
   */
  child(prefix) {
    return new Logger({
      level: this.level,
      prefix: `${this.prefix}:${prefix}`,
      timestamps: this.timestamps,
    });
  }
}

/**
 * Parse a log level from an environment variable value.
 *
 * @param {string|undefined} raw - Raw environment value.
 * @param {number} fallback - Level used when the value is missing or not a
 *   number (a NaN level would silently disable every log call).
 * @returns {number}
 */
function parseLogLevel(raw, fallback) {
  if (!raw) {
    return fallback;
  }
  const parsed = Number.parseInt(raw, 10);
  return Number.isNaN(parsed) ? fallback : parsed;
}

const defaultLogger = new Logger({
  level: parseLogLevel(process.env.TRANSMUTATION_LOG_LEVEL, LogLevel.WARN),
  timestamps: process.env.TRANSMUTATION_LOG_TIMESTAMPS === "true",
});

// src/validation.ts

/**
 * Ensure the input is a non-empty Buffer of at most 500 MB.
 *
 * @param {unknown} buffer
 * @throws {ConversionError} When the input is not a Buffer, is empty, or is too large.
 */
function validateBuffer(buffer) {
  if (!Buffer.isBuffer(buffer)) {
    throw new ConversionError(
      "Invalid input: expected Buffer, got " + typeof buffer
    );
  }
  if (buffer.length === 0) {
    throw new ConversionError("Invalid input: buffer is empty");
  }
  const maxSize = 500 * 1024 * 1024;
  if (buffer.length > maxSize) {
    throw new ConversionError(
      `Invalid input: buffer size (${(buffer.length / 1024 / 1024).toFixed(2)}MB) exceeds maximum allowed size (${maxSize / 1024 / 1024}MB)`
    );
  }
}

/**
 * Ensure the format is a known, convertible `DocumentFormat` value.
 *
 * @param {string} format
 * @returns {string} The validated format.
 * @throws {ConversionError} For unrecognised values and for "unknown".
 */
function validateFormat(format) {
  const validFormats = Object.values(DocumentFormat);
  if (!validFormats.includes(format)) {
    throw new ConversionError(
      `Invalid format: "${format}". Supported formats: ${validFormats.filter((f) => f !== "unknown" /* UNKNOWN */).join(", ")}`
    );
  }
  if (format === "unknown" /* UNKNOWN */) {
    throw new ConversionError(
      "Cannot convert documents with unknown format. Please specify a valid format."
    );
  }
  return format;
}

/**
 * Reject empty paths, paths containing a ".." segment, and paths under
 * /etc, /sys or /proc.
 *
 * @param {unknown} filePath
 * @throws {ConversionError} When the path is not a string, is empty, or
 *   matches one of the rejected patterns.
 */
function validateFilePath(filePath) {
  if (typeof filePath !== "string") {
    throw new ConversionError(
      "Invalid file path: expected string, got " + typeof filePath
    );
  }
  if (filePath.trim().length === 0) {
    throw new ConversionError("Invalid file path: path is empty");
  }
  const dangerousPatterns = [
    // Parent directory traversal: ".." as a whole path segment only, so
    // ordinary file names such as "report..final.pdf" are accepted.
    /(^|[\\/])\.\.([\\/]|$)/,
    // System directories
    /^\/etc\//,
    /^\/sys\//,
    /^\/proc\//,
  ];
  for (const pattern of dangerousPatterns) {
    if (pattern.test(filePath)) {
      throw new ConversionError(
        `Invalid file path: path contains potentially dangerous pattern: ${filePath}`
      );
    }
  }
}

/**
 * Validate conversion options. A property that is present but `undefined`
 * is treated as not provided.
 *
 * @param {unknown} options
 * @throws {ConversionError} When `maxPages` is not an integer >= 1 or
 *   `preserveFormatting` is not a boolean.
 */
function validateOptions(options) {
  if (options === null || options === void 0) {
    return;
  }
  if (typeof options !== "object") {
    throw new ConversionError(
      "Invalid options: expected object, got " + typeof options
    );
  }
  if (options.maxPages !== void 0) {
    if (typeof options.maxPages !== "number") {
      throw new ConversionError(
        "Invalid option maxPages: expected number, got " + typeof options.maxPages
      );
    }
    if (options.maxPages < 1) {
      throw new ConversionError("Invalid option maxPages: must be at least 1");
    }
    if (!Number.isInteger(options.maxPages)) {
      throw new ConversionError("Invalid option maxPages: must be an integer");
    }
  }
  if (options.preserveFormatting !== void 0) {
    if (typeof options.preserveFormatting !== "boolean") {
      throw new ConversionError(
        "Invalid option preserveFormatting: expected boolean, got " + typeof options.preserveFormatting
      );
    }
  }
}

/**
 * Validate the cache-related fields of a converter configuration. A property
 * that is present but `undefined` is treated as not provided.
 *
 * @param {unknown} config
 * @throws {ConversionError} When `cacheSize` is outside 1..10000,
 *   `cacheMaxAge` is negative, or a field has the wrong type.
 */
function validateCacheConfig(config) {
  if (config === null || config === void 0) {
    return;
  }
  if (typeof config !== "object") {
    throw new ConversionError(
      "Invalid cache config: expected object, got " + typeof config
    );
  }
  if (config.cacheSize !== void 0) {
    if (typeof config.cacheSize !== "number") {
      throw new ConversionError(
        "Invalid cacheSize: expected number, got " + typeof config.cacheSize
      );
    }
    if (config.cacheSize < 1) {
      throw new ConversionError("Invalid cacheSize: must be at least 1");
    }
    if (config.cacheSize > 1e4) {
      throw new ConversionError("Invalid cacheSize: maximum is 10000");
    }
  }
  if (config.cacheMaxAge !== void 0) {
    if (typeof config.cacheMaxAge !== "number") {
      throw new ConversionError(
        "Invalid cacheMaxAge: expected number, got " + typeof config.cacheMaxAge
      );
    }
    if (config.cacheMaxAge < 0) {
      throw new ConversionError("Invalid cacheMaxAge: must be non-negative");
    }
  }
  if (config.enableCache !== void 0) {
    if (typeof config.enableCache !== "boolean") {
      throw new ConversionError(
        "Invalid enableCache: expected boolean, got " + typeof config.enableCache
      );
    }
  }
}

// src/metrics.ts

/** Build a zeroed metrics record. */
function createEmptyMetrics() {
  return {
    totalConversions: 0,
    successfulConversions: 0,
    failedConversions: 0,
    totalBytesProcessed: 0,
    totalTimeMs: 0,
    cacheHits: 0,
    cacheMisses: 0,
    errorsByType: /* @__PURE__ */ new Map(),
    conversionsByFormat: /* @__PURE__ */ new Map(),
  };
}

/** Accumulates counters about conversions performed by a `Converter`. */
class MetricsCollector {
  metrics;
  startTime;

  constructor() {
    this.metrics = createEmptyMetrics();
    this.startTime = Date.now();
  }

  /**
   * Record a successful conversion.
   *
   * @param {string} format - Converted format.
   * @param {number} bytes - Input size in bytes.
   * @param {number} timeMs - Elapsed time in milliseconds.
   * @param {boolean} fromCache - Whether the result came from the cache.
   */
  recordSuccess(format, bytes, timeMs, fromCache) {
    this.metrics.totalConversions++;
    this.metrics.successfulConversions++;
    this.metrics.totalBytesProcessed += bytes;
    this.metrics.totalTimeMs += timeMs;
    if (fromCache) {
      this.metrics.cacheHits++;
    } else {
      this.metrics.cacheMisses++;
    }
    const count = this.metrics.conversionsByFormat.get(format) || 0;
    this.metrics.conversionsByFormat.set(format, count + 1);
  }

  /**
   * Record a failed conversion.
   *
   * @param {string} format - Format whose conversion failed.
   * @param {string} errorType - Name used to group the failure.
   */
  recordFailure(format, errorType) {
    this.metrics.totalConversions++;
    this.metrics.failedConversions++;
    const errorCount = this.metrics.errorsByType.get(errorType) || 0;
    this.metrics.errorsByType.set(errorType, errorCount + 1);
    const formatCount = this.metrics.conversionsByFormat.get(format) || 0;
    this.metrics.conversionsByFormat.set(format, formatCount + 1);
  }

  /**
   * Get current metrics. The Maps are copied so callers cannot modify the
   * collector's internal state through the returned object.
   */
  getMetrics() {
    return {
      ...this.metrics,
      errorsByType: new Map(this.metrics.errorsByType),
      conversionsByFormat: new Map(this.metrics.conversionsByFormat),
    };
  }

  /**
   * Get metrics summary.
   *
   * @returns {object} `uptime` (ms), `successRate` and `cacheHitRate`
   *   (percent), `avgConversionTime` (ms) and `avgThroughput` (bytes/second).
   */
  getSummary() {
    const {
      totalConversions,
      successfulConversions,
      totalBytesProcessed,
      totalTimeMs,
      cacheHits,
      cacheMisses,
    } = this.metrics;
    const uptime = Date.now() - this.startTime;
    const successRate =
      totalConversions > 0 ? (successfulConversions / totalConversions) * 100 : 0;
    const totalCacheOps = cacheHits + cacheMisses;
    const cacheHitRate = totalCacheOps > 0 ? (cacheHits / totalCacheOps) * 100 : 0;
    const avgConversionTime =
      successfulConversions > 0 ? totalTimeMs / successfulConversions : 0;
    const avgThroughput =
      totalTimeMs > 0 ? (totalBytesProcessed / totalTimeMs) * 1e3 : 0;
    return {
      uptime,
      successRate,
      cacheHitRate,
      avgConversionTime,
      avgThroughput,
    };
  }

  /**
   * Reset all metrics
   */
  reset() {
    this.metrics = createEmptyMetrics();
    this.startTime = Date.now();
  }

  /**
   * Export metrics as JSON
   */
  toJSON() {
    const summary = this.getSummary();
    return {
      timestamp: new Date().toISOString(),
      uptime: summary.uptime,
      metrics: {
        ...this.metrics,
        errorsByType: Object.fromEntries(this.metrics.errorsByType),
        conversionsByFormat: Object.fromEntries(
          this.metrics.conversionsByFormat
        ),
      },
      summary: {
        successRate: summary.successRate.toFixed(2) + "%",
        cacheHitRate: summary.cacheHitRate.toFixed(2) + "%",
        avgConversionTime: summary.avgConversionTime.toFixed(2) + "ms",
        avgThroughput: (summary.avgThroughput / 1024).toFixed(2) + " KB/s",
      },
    };
  }
}

// src/index.ts

/**
 * Facade that picks a format converter by file extension and optionally
 * adds input validation, result caching and metrics collection.
 */
class Converter {
  converters;
  cache;
  logger;
  validateInput;
  metrics;

  /**
   * @param {object} [config] - Optional settings: `logger`, `validateInput`
   *   (default true), `collectMetrics`, `enableCache`, `cacheSize`
   *   (default 100) and `cacheMaxAge` in ms (default 1 hour).
   * @throws {ConversionError} When the cache settings are invalid.
   */
  constructor(config) {
    if (config) {
      validateCacheConfig(config);
    }
    this.converters = /* @__PURE__ */ new Map();
    this.logger = config?.logger || defaultLogger;
    this.validateInput = config?.validateInput ?? true;

    if (config?.collectMetrics) {
      this.metrics = new MetricsCollector();
      this.logger.info("Metrics collection enabled");
    }

    if (config?.enableCache) {
      // `??` rather than `||`: cacheMaxAge 0 passes validation and must not
      // be silently replaced by the one-hour default.
      const cacheSize = config.cacheSize ?? 100;
      const cacheMaxAge = config.cacheMaxAge ?? 36e5;
      this.cache = new ConversionCache(cacheSize, cacheMaxAge);
      this.logger.info(
        `Cache enabled: size=${cacheSize}, maxAge=${cacheMaxAge}ms`
      );
    }

    this.registerConverter(new PdfConverter());
    this.registerConverter(new DocxConverter());
    this.registerConverter(new XlsxConverter());
    this.registerConverter(new PptxConverter());
    this.registerConverter(new TxtConverter());
    this.registerConverter(new HtmlConverter());
  }

  /**
   * Register a converter under the format it reports; replaces any
   * converter already registered for that format.
   */
  registerConverter(converter) {
    this.converters.set(converter.getFormat(), converter);
  }

  /**
   * Detect file format from extension.
   *
   * @param {string} filePath
   * @returns {string} Format identifier, or "unknown" when no converter
   *   handles the extension.
   */
  detectFormat(filePath) {
    const ext = extname(filePath).toLowerCase().replace(".", "");
    for (const converter of this.converters.values()) {
      if (converter.canHandle(ext)) {
        return converter.getFormat();
      }
    }
    return "unknown" /* UNKNOWN */;
  }

  /**
   * Get converter for a specific format.
   *
   * @throws {ConversionError} When no converter is registered for the format.
   */
  getConverter(format) {
    const converter = this.converters.get(format);
    if (!converter) {
      throw new ConversionError(`No converter available for format: ${format}`);
    }
    return converter;
  }

  /**
   * Convert a file buffer to markdown.
   *
   * @param {Buffer} buffer - Document contents.
   * @param {string} format - Document format.
   * @param {object} [options] - Conversion options.
   * @returns {Promise<object>} Conversion result (possibly from the cache).
   * @throws {ConversionError} On invalid input or conversion failure.
   */
  async convertBuffer(buffer, format, options) {
    const startTime = Date.now();
    if (this.validateInput) {
      validateBuffer(buffer);
      validateFormat(format);
      validateOptions(options);
    }
    this.logger.debug(`Converting ${format} buffer (${buffer.length} bytes)`);

    if (this.cache) {
      // Options are part of the cache key: the same file converted with a
      // different maxPages/preserveFormatting yields a different result.
      const cached = this.cache.get(buffer, format, options);
      if (cached) {
        this.logger.debug(`Cache hit for ${format}`);
        if (this.metrics) {
          const elapsed = Date.now() - startTime;
          this.metrics.recordSuccess(format, buffer.length, elapsed, true);
        }
        return cached;
      }
      this.logger.debug(`Cache miss for ${format}`);
    }

    try {
      const converter = this.getConverter(format);
      const result = await converter.convert(buffer, options);
      const elapsed = Date.now() - startTime;
      this.logger.info(
        `Converted ${format} in ${elapsed}ms (${(buffer.length / 1024).toFixed(2)} KB)`
      );
      if (this.cache) {
        this.cache.set(buffer, format, result, options);
        this.logger.debug(`Cached ${format} result`);
      }
      if (this.metrics) {
        this.metrics.recordSuccess(format, buffer.length, elapsed, false);
      }
      return result;
    } catch (error) {
      const elapsed = Date.now() - startTime;
      this.logger.error(
        `Conversion failed for ${format} after ${elapsed}ms`,
        error
      );
      if (this.metrics) {
        // Guard against non-object throwables (e.g. `throw null`).
        const errorType = error?.constructor?.name ?? "UnknownError";
        this.metrics.recordFailure(format, errorType);
      }
      throw error;
    }
  }

  /**
   * Convert a file to markdown.
   *
   * @param {string} filePath - Path of the file to convert.
   * @param {object} [options] - Conversion options.
   * @returns {Promise<object>} Conversion result.
   * @throws {ConversionError} For rejected paths, unsupported formats,
   *   missing files and permission errors; other read errors are rethrown.
   */
  async convertFile(filePath, options) {
    if (this.validateInput) {
      validateFilePath(filePath);
    }
    this.logger.debug(`Converting file: ${filePath}`);
    const format = this.detectFormat(filePath);
    if (format === "unknown" /* UNKNOWN */) {
      const ext = extname(filePath);
      this.logger.error(`Unsupported file format: ${ext}`);
      throw new ConversionError(
        `Unsupported file format: ${ext}. Supported formats: ${this.getSupportedFormats().join(", ")}`
      );
    }

    let buffer;
    try {
      buffer = await readFile(filePath);
    } catch (error) {
      if (error.code === "ENOENT") {
        throw new ConversionError(`File not found: ${filePath}`);
      }
      if (error.code === "EACCES") {
        throw new ConversionError(`Permission denied: ${filePath}`);
      }
      throw error;
    }
    return this.convertBuffer(buffer, format, options);
  }

  /**
   * Get list of supported formats
   */
  getSupportedFormats() {
    return Array.from(this.converters.keys());
  }

  /**
   * Check if a file format is supported
   */
  isSupported(filePath) {
    return this.detectFormat(filePath) !== "unknown" /* UNKNOWN */;
  }

  /**
   * Clear the conversion cache
   */
  clearCache() {
    if (this.cache) {
      this.cache.clear();
    }
  }

  /**
   * Get cache statistics (if caching is enabled)
   */
  getCacheStats() {
    return this.cache?.getStats();
  }

  /**
   * Get cache memory usage in bytes (if caching is enabled)
   */
  getCacheMemoryUsage() {
    return this.cache?.getMemoryUsage() || 0;
  }

  /**
   * Get conversion metrics (if metrics collection is enabled)
   */
  getMetrics() {
    return this.metrics?.getMetrics();
  }

  /**
   * Get metrics summary (if metrics collection is enabled)
   */
  getMetricsSummary() {
    return this.metrics?.getSummary();
  }

  /**
   * Reset metrics (if metrics collection is enabled)
   */
  resetMetrics() {
    if (this.metrics) {
      this.metrics.reset();
    }
  }

  /**
   * Export metrics as JSON (if metrics collection is enabled)
   */
  exportMetrics() {
    return this.metrics?.toJSON();
  }
}

// src/cli.ts

/**
 * Commander option parser for integer arguments. Commander calls the parser
 * as `fn(value, previousValue)`, so passing the global `parseInt` directly
 * would use the previous/default value as the radix.
 *
 * @param {string} value - Raw command-line value.
 * @returns {number} Parsed base-10 integer (NaN when not numeric).
 */
function parseIntegerOption(value) {
  return Number.parseInt(value, 10);
}

/**
 * Build conversion options from parsed CLI options, leaving out `maxPages`
 * when the flag was not given.
 *
 * @param {object} cliOptions - Options object supplied by commander.
 * @returns {object} Options for `Converter.convertFile`.
 */
function buildConversionOptions(cliOptions) {
  const conversionOptions = {
    preserveFormatting: cliOptions.preserveFormatting,
  };
  if (cliOptions.maxPages !== undefined) {
    conversionOptions.maxPages = cliOptions.maxPages;
  }
  return conversionOptions;
}

const program = new Command();
program
  .name("transmutation-lite")
  .description(
    "Simplified document converter for PDF, DOCX, XLSX, PPTX to Markdown"
  )
  .version("0.1.0");

program
  .command("convert <file>")
  .description("Convert a single file to Markdown")
  .option("-o, --output <path>", "Output file path (default: <filename>.md)")
  .option(
    "-m, --max-pages <number>",
    "Maximum pages/sheets to process",
    parseIntegerOption
  )
  .option("--no-preserve-formatting", "Disable formatting preservation")
  .action(async (file, options) => {
    try {
      const converter = new Converter();
      if (!converter.isSupported(file)) {
        console.error(`\u274C Unsupported file format: ${extname(file)}`);
        process.exit(1);
      }
      console.log(`\u{1F4C4} Converting: ${basename(file)}`);
      const result = await converter.convertFile(
        file,
        buildConversionOptions(options)
      );
      const outputPath = options.output || file.replace(/\.[^.]+$/, ".md");
      await mkdir(dirname(outputPath), { recursive: true });
      await writeFile(outputPath, result.markdown, "utf-8");
      console.log(`\u2705 Converted successfully!`);
      console.log(` Format: ${result.metadata.format}`);
      console.log(` Pages: ${result.metadata.pageCount || "N/A"}`);
      console.log(` Time: ${result.conversionTimeMs}ms`);
      console.log(` Output: ${outputPath}`);
      if (result.warnings && result.warnings.length > 0) {
        console.log(`\u26A0\uFE0F Warnings:`);
        result.warnings.forEach((warning) => console.log(` - ${warning}`));
      }
    } catch (error) {
      console.error(
        `\u274C Conversion failed: ${error instanceof Error ? error.message : "Unknown error"}`
      );
      process.exit(1);
    }
  });

program
  .command("batch <directory>")
  .description("Convert all supported files in a directory")
  .option(
    "-o, --output <path>",
    "Output directory (default: <directory>/output)"
  )
  .option("-r, --recursive", "Process subdirectories recursively")
  .option(
    "-m, --max-pages <number>",
    "Maximum pages/sheets to process",
    parseIntegerOption
  )
  .option(
    "--parallel <number>",
    "Number of parallel conversions",
    parseIntegerOption,
    4
  )
  .option("--no-preserve-formatting", "Disable formatting preservation")
  .action(async (directory, options) => {
    try {
      const batchSize = options.parallel;
      // A NaN or non-positive batch size would make the loop below skip
      // every file while still reporting success.
      if (!Number.isInteger(batchSize) || batchSize < 1) {
        throw new Error("--parallel must be a positive integer");
      }

      const converter = new Converter();
      const files = await findFiles(directory, options.recursive);
      const supportedFiles = files.filter((file) =>
        converter.isSupported(file)
      );
      if (supportedFiles.length === 0) {
        console.log("\u274C No supported files found");
        process.exit(1);
      }
      console.log(`\u{1F4C1} Found ${supportedFiles.length} supported files`);

      const outputDir = options.output || join(directory, "output");
      await mkdir(outputDir, { recursive: true });
      const conversionOptions = buildConversionOptions(options);

      let completed = 0;
      let failed = 0;
      for (let i = 0; i < supportedFiles.length; i += batchSize) {
        const batch = supportedFiles.slice(i, i + batchSize);
        await Promise.all(
          batch.map(async (file) => {
            try {
              const result = await converter.convertFile(
                file,
                conversionOptions
              );
              // path.relative() instead of string replacement: replacing
              // the directory text corrupts names when the directory is "."
              // or otherwise appears inside the file name.
              const relativePath = relative(directory, file);
              const outputPath = join(
                outputDir,
                relativePath.replace(/\.[^.]+$/, ".md")
              );
              await mkdir(dirname(outputPath), { recursive: true });
              await writeFile(outputPath, result.markdown, "utf-8");
              completed++;
              console.log(
                `\u2705 [${completed}/${supportedFiles.length}] ${basename(file)} (${result.conversionTimeMs}ms)`
              );
            } catch (error) {
              failed++;
              console.error(
                `\u274C [${completed + failed}/${supportedFiles.length}] ${basename(file)}: ${error instanceof Error ? error.message : "Unknown error"}`
              );
            }
          })
        );
      }

      console.log(` \u{1F4CA} Summary:`);
      console.log(` Total: ${supportedFiles.length}`);
      console.log(` Success: ${completed}`);
      console.log(` Failed: ${failed}`);
      console.log(` Output: ${outputDir}`);
      if (failed > 0) {
        process.exit(1);
      }
    } catch (error) {
      console.error(
        `\u274C Batch conversion failed: ${error instanceof Error ? error.message : "Unknown error"}`
      );
      process.exit(1);
    }
  });

program
  .command("formats")
  .description("List supported file formats")
  .action(() => {
    const converter = new Converter();
    const formats = converter.getSupportedFormats();
    console.log("\u{1F4CB} Supported formats:");
    formats.forEach((format) => {
      console.log(` - ${format.toUpperCase()}`);
    });
  });

/**
 * List the regular files in a directory, optionally descending into
 * subdirectories.
 *
 * @param {string} dir - Directory to scan.
 * @param {boolean} [recursive=false] - Whether to include subdirectories.
 * @returns {Promise<string[]>} File paths, each prefixed with `dir`.
 */
async function findFiles(dir, recursive = false) {
  const files = [];
  const entries = await readdir(dir);
  for (const entry of entries) {
    const fullPath = join(dir, entry);
    const stats = await stat(fullPath);
    if (stats.isDirectory() && recursive) {
      const subFiles = await findFiles(fullPath, recursive);
      files.push(...subFiles);
    } else if (stats.isFile()) {
      files.push(fullPath);
    }
  }
  return files;
}

// parseAsync() because the command actions are async functions.
program.parseAsync().catch((error) => {
  console.error(
    `\u274C ${error instanceof Error ? error.message : "Unknown error"}`
  );
  process.exit(1);
});
//# sourceMappingURL=cli.js.map