// @hivellm/transmutation-lite
// Version: (not recorded)
// Simplified document converter for common formats (PDF, DOCX, XLSX, PPTX) to Markdown.
// 1,205 lines (1,193 loc) • 35.8 kB
// JavaScript
import { createHash } from 'crypto';
import { mkdir, writeFile, readdir, stat, readFile } from 'fs/promises';
import { createRequire } from 'module';
import { extname, basename, dirname, join, relative } from 'path';
import { Command } from 'commander';
import mammoth from 'mammoth';
import TurndownService from 'turndown';
import * as XLSX from 'xlsx';
// src/converters/base.ts
var BaseConverter = class {
/**
* Create base metadata object
*/
createMetadata(format, fileSize, extra) {
return {
format,
fileSize,
pageCount: extra?.pageCount,
title: extra?.title,
author: extra?.author,
createdAt: extra?.createdAt,
extra: extra?.extra
};
}
/**
* Create conversion result with timing
*/
createResult(markdown, metadata, startTime, warnings) {
return {
markdown,
metadata,
conversionTimeMs: Date.now() - startTime,
warnings
};
}
};
// src/types.ts
var DocumentFormat = /* @__PURE__ */ ((DocumentFormat2) => {
DocumentFormat2["PDF"] = "pdf";
DocumentFormat2["DOCX"] = "docx";
DocumentFormat2["XLSX"] = "xlsx";
DocumentFormat2["PPTX"] = "pptx";
DocumentFormat2["TXT"] = "txt";
DocumentFormat2["HTML"] = "html";
DocumentFormat2["UNKNOWN"] = "unknown";
return DocumentFormat2;
})(DocumentFormat || {});
var ConversionError = class extends Error {
constructor(message, format, cause) {
super(message);
this.format = format;
this.cause = cause;
this.name = "ConversionError";
}
};
// src/converters/pdf.ts
// Build a CommonJS-style `require` anchored at this ES module's URL and use it
// to load the "pdf-parse-new" package.
var require2 = createRequire(import.meta.url);
var pdfParse = require2("pdf-parse-new");
var PdfConverter = class extends BaseConverter {
  /** Format identifier handled by this converter. */
  getFormat() {
    return "pdf" /* PDF */;
  }
  /** True when `extension` (without the dot, any case) is "pdf". */
  canHandle(extension) {
    return extension.toLowerCase() === "pdf";
  }
  /**
   * Extract the text of a PDF and return it as Markdown.
   *
   * @param buffer Raw PDF bytes.
   * @param options Optional `maxPages` (page limit) and `preserveFormatting`
   *   (whitespace cleanup runs unless this is exactly `false`).
   * @returns Conversion result with text, metadata, timing and warnings.
   * @throws ConversionError wrapping any failure from the parser.
   */
  async convert(buffer, options) {
    const startTime = Date.now();
    const warnings = [];
    try {
      const data = await pdfParse(buffer);
      let text = data.text;
      const maxPages = options?.maxPages;
      // The page limit must be applied BEFORE cleanup: cleanupText() rewrites
      // every form feed, so splitting afterwards never finds a page boundary.
      // NOTE(review): assumes the parser separates pages with "\f" - confirm.
      if (maxPages && maxPages > 0) {
        const pages = text.split("\f");
        if (pages.length > maxPages) {
          warnings.push(
            `Only processed ${maxPages} of ${pages.length} pages`
          );
        }
        text = pages.slice(0, maxPages).join("\n\n---\n\n");
      }
      if (options?.preserveFormatting !== false) {
        text = this.cleanupText(text);
      }
      const metadata = this.createMetadata("pdf" /* PDF */, buffer.length, {
        pageCount: data.numpages,
        title: data.info?.Title,
        author: data.info?.Author,
        createdAt: this.parsePdfDate(data.info?.CreationDate),
        extra: {
          producer: data.info?.Producer,
          creator: data.info?.Creator
        }
      });
      return this.createResult(text, metadata, startTime, warnings);
    } catch (error) {
      throw new ConversionError(
        `Failed to convert PDF: ${error instanceof Error ? error.message : "Unknown error"}`,
        "pdf" /* PDF */,
        error instanceof Error ? error : void 0
      );
    }
  }
  /**
   * Turn a PDF date value into a Date.
   *
   * Handles the PDF date syntax `D:YYYYMMDDHHmmSSOHH'mm'` (every field after
   * the year is optional), which `new Date()` cannot parse. Values without a
   * timezone designator are interpreted as UTC. Anything else falls back to
   * the native parser.
   *
   * @returns A valid Date, or undefined when the value is missing or unparseable.
   */
  parsePdfDate(raw) {
    if (!raw) {
      return void 0;
    }
    if (raw instanceof Date) {
      return Number.isNaN(raw.getTime()) ? void 0 : raw;
    }
    const match = /^D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?\s*(?:([Zz])|([+-])(\d{2})'?(\d{2})?'?)?/.exec(
      String(raw).trim()
    );
    if (match) {
      const [
        ,
        year,
        month = "01",
        day = "01",
        hour = "00",
        minute = "00",
        second = "00",
        ,
        sign,
        offsetHours = "00",
        offsetMinutes = "00"
      ] = match;
      let timestamp = Date.UTC(+year, +month - 1, +day, +hour, +minute, +second);
      if (sign) {
        // Local time = UTC + offset, so subtract a "+" offset to get UTC.
        const offsetMs = (+offsetHours * 60 + +offsetMinutes) * 6e4;
        timestamp += sign === "+" ? -offsetMs : offsetMs;
      }
      return new Date(timestamp);
    }
    const parsed = new Date(raw);
    return Number.isNaN(parsed.getTime()) ? void 0 : parsed;
  }
  /**
   * Normalize whitespace: collapse runs of spaces/tabs, limit blank lines to
   * one, and replace form feeds with a horizontal-rule separator.
   */
  cleanupText(text) {
    text = text.replace(/[ \t]+/g, " ");
    text = text.replace(/\n{3,}/g, "\n\n");
    text = text.replace(/\f/g, "\n\n---\n\n");
    return text.trim();
  }
};
var DocxConverter = class extends BaseConverter {
  /** Format identifier handled by this converter. */
  getFormat() {
    return "docx" /* DOCX */;
  }
  /** True when `extension` (without the dot, any case) is "docx". */
  canHandle(extension) {
    return extension.toLowerCase() === "docx";
  }
  /**
   * Convert a DOCX buffer to Markdown through mammoth.
   *
   * @param buffer Raw DOCX bytes.
   * @param options Optional; blank-line cleanup runs unless
   *   `preserveFormatting` is exactly `false`.
   * @returns Conversion result; mammoth messages are surfaced as warnings.
   * @throws ConversionError wrapping any failure from mammoth.
   */
  async convert(buffer, options) {
    const startTime = Date.now();
    const warnings = [];
    try {
      // Map Word's "Heading 1".."Heading 6" paragraph styles to #..######.
      const styleMap = Array.from({ length: 6 }, (_, index) => {
        const depth = index + 1;
        return `p[style-name='Heading ${depth}'] => ${"#".repeat(depth)} :fresh`;
      });
      const result = await mammoth.convertToMarkdown({ buffer }, { styleMap });
      for (const msg of result.messages) {
        warnings.push(`${msg.type}: ${msg.message}`);
      }
      const shouldCleanup = options?.preserveFormatting !== false;
      const markdown = shouldCleanup ? this.cleanupMarkdown(result.value) : result.value;
      const metadata = this.createMetadata("docx" /* DOCX */, buffer.length, {
        extra: {
          warnings: result.messages
        }
      });
      return this.createResult(markdown, metadata, startTime, warnings);
    } catch (error) {
      const isError = error instanceof Error;
      throw new ConversionError(
        `Failed to convert DOCX: ${isError ? error.message : "Unknown error"}`,
        "docx" /* DOCX */,
        isError ? error : void 0
      );
    }
  }
  /** Limit consecutive blank lines to one and trim the ends. */
  cleanupMarkdown(markdown) {
    return markdown.replace(/\n{3,}/g, "\n\n").trim();
  }
};
var XlsxConverter = class extends BaseConverter {
  /** Format identifier handled by this converter. */
  getFormat() {
    return "xlsx" /* XLSX */;
  }
  /** True when `extension` (without the dot, any case) is "xlsx" or "xls". */
  canHandle(extension) {
    const ext = extension.toLowerCase();
    return ext === "xlsx" || ext === "xls";
  }
  /**
   * Convert a workbook to Markdown: one `#` heading plus one table per sheet,
   * sheets separated by a horizontal rule.
   *
   * @param buffer Raw workbook bytes.
   * @param options Optional `maxPages`, used here as a limit on sheets.
   * @returns Conversion result; `metadata.pageCount` is the total sheet count.
   * @throws ConversionError wrapping any failure while reading the workbook.
   */
  async convert(buffer, options) {
    const startTime = Date.now();
    const warnings = [];
    try {
      const workbook = XLSX.read(buffer, { type: "buffer" });
      const totalSheets = workbook.SheetNames.length;
      const sheetsToProcess = options?.maxPages && options.maxPages > 0 ? Math.min(options.maxPages, totalSheets) : totalSheets;
      if (sheetsToProcess < totalSheets) {
        warnings.push(
          `Only processed ${sheetsToProcess} of ${totalSheets} sheets`
        );
      }
      const sections = [];
      for (let i = 0; i < sheetsToProcess; i++) {
        const sheetName = workbook.SheetNames[i];
        const sheet = workbook.Sheets[sheetName];
        // Blank line after the heading so the table starts its own block.
        let section = `# ${sheetName}\n\n`;
        const csvData = XLSX.utils.sheet_to_csv(sheet);
        if (csvData.trim()) {
          section += this.csvToMarkdownTable(csvData);
        } else {
          section += "*Empty sheet*\n";
          warnings.push(`Sheet "${sheetName}" is empty`);
        }
        sections.push(section);
      }
      const markdown = sections.join("\n\n---\n\n");
      const metadata = this.createMetadata("xlsx" /* XLSX */, buffer.length, {
        pageCount: totalSheets,
        extra: {
          sheetNames: workbook.SheetNames,
          processedSheets: sheetsToProcess
        }
      });
      return this.createResult(markdown.trim(), metadata, startTime, warnings);
    } catch (error) {
      throw new ConversionError(
        `Failed to convert XLSX: ${error instanceof Error ? error.message : "Unknown error"}`,
        "xlsx" /* XLSX */,
        error instanceof Error ? error : void 0
      );
    }
  }
  /**
   * Render CSV text as a Markdown table. The first row becomes the header and
   * short rows are padded to the widest row.
   *
   * @param csv CSV text; quoted cells may contain commas, quotes and newlines.
   * @returns Markdown table, or "*No data*\n" when there are no rows.
   */
  csvToMarkdownTable(csv) {
    const rows = this.parseCsv(csv);
    if (rows.length === 0) {
      return "*No data*\n";
    }
    // reduce() instead of Math.max(...spread): spreading one argument per row
    // can exceed the engine's argument limit on very large sheets.
    const columnCount = rows.reduce((widest, row) => Math.max(widest, row.length), 0);
    const renderRow = (row) => {
      const cells = Array.from(
        { length: columnCount },
        (_, index) => this.escapeCell(row[index] ?? "")
      );
      return "| " + cells.join(" | ") + " |\n";
    };
    const [header, ...body] = rows;
    const divider = "| " + Array(columnCount).fill("---").join(" | ") + " |\n";
    return renderRow(header) + divider + body.map(renderRow).join("");
  }
  /**
   * Parse a whole CSV document into rows of trimmed cells.
   *
   * Quotes are tracked across line breaks, so a quoted cell containing a
   * newline stays one cell instead of splitting the row. Blank lines are
   * dropped.
   */
  parseCsv(csv) {
    const rows = [];
    let row = [];
    let cell = "";
    let inQuotes = false;
    const endCell = () => {
      row.push(cell.trim());
      cell = "";
    };
    const endRow = () => {
      endCell();
      // A row made of one empty cell is a blank line: skip it.
      if (row.length > 1 || row[0] !== "") {
        rows.push(row);
      }
      row = [];
    };
    for (let i = 0; i < csv.length; i++) {
      const char = csv[i];
      if (inQuotes) {
        if (char !== '"') {
          cell += char;
        } else if (csv[i + 1] === '"') {
          // Doubled quote inside a quoted cell is a literal quote.
          cell += '"';
          i++;
        } else {
          inQuotes = false;
        }
      } else if (char === '"') {
        inQuotes = true;
      } else if (char === ",") {
        endCell();
      } else if (char === "\n") {
        endRow();
      } else if (char === "\r" && csv[i + 1] === "\n") {
        // CRLF: ignore the CR, the following LF ends the row.
      } else {
        cell += char;
      }
    }
    endRow();
    return rows;
  }
  /**
   * Make a cell safe inside a Markdown table row: escape pipes (they would
   * start a new column) and replace line breaks with `<br>`.
   */
  escapeCell(cell) {
    return cell.replace(/\|/g, "\\|").replace(/\r?\n/g, "<br>");
  }
  /**
   * Split a single CSV line into trimmed cells, honouring double-quoted cells
   * and doubled quotes. Kept for callers that parse one line at a time;
   * multi-line input should go through parseCsv().
   */
  parseCsvLine(line) {
    const cells = [];
    let cell = "";
    let inQuotes = false;
    for (let i = 0; i < line.length; i++) {
      const char = line[i];
      if (char === '"') {
        if (inQuotes && line[i + 1] === '"') {
          cell += '"';
          i++;
        } else {
          inQuotes = !inQuotes;
        }
      } else if (char === "," && !inQuotes) {
        cells.push(cell.trim());
        cell = "";
      } else {
        cell += char;
      }
    }
    cells.push(cell.trim());
    return cells;
  }
};
// src/converters/pptx.ts
var PptxConverter = class extends BaseConverter {
  /** Format identifier handled by this converter. */
  getFormat() {
    return "pptx" /* PPTX */;
  }
  /** True when `extension` (without the dot, any case) is "pptx" or "ppt". */
  canHandle(extension) {
    const ext = extension.toLowerCase();
    return ext === "pptx" || ext === "ppt";
  }
  /**
   * Extract slide text from a presentation and return it as Markdown.
   *
   * @param buffer Raw presentation bytes (read as a ZIP archive).
   * @param _options Accepted for interface parity; not used.
   * @returns Conversion result; always carries a "simplified" warning.
   * @throws ConversionError wrapping any failure while reading the archive.
   */
  async convert(buffer, _options) {
    const startTime = Date.now();
    const warnings = [];
    warnings.push(
      "PPTX conversion is simplified and may not extract all content"
    );
    try {
      const text = await this.extractTextFromPptx(buffer);
      const metadata = this.createMetadata("pptx" /* PPTX */, buffer.length, {
        extra: {
          note: "Basic text extraction only"
        }
      });
      return this.createResult(text, metadata, startTime, warnings);
    } catch (error) {
      throw new ConversionError(
        `Failed to convert PPTX: ${error instanceof Error ? error.message : "Unknown error"}`,
        "pptx" /* PPTX */,
        error instanceof Error ? error : void 0
      );
    }
  }
  /**
   * Read every `ppt/slides/slideN.xml` entry and emit its `<a:t>` text runs
   * under a `## Slide N` heading, in ascending slide-number order.
   *
   * NOTE(review): ordering follows the number in the file name, which is
   * presumably the presentation order - confirm against presentation.xml.
   */
  async extractTextFromPptx(buffer) {
    const JSZip = (await import('jszip')).default;
    const zip = await JSZip.loadAsync(buffer);
    let markdown = "# Presentation\n\n";
    markdown += "*Note: This is a basic text extraction. For full PPTX support, use a dedicated parser.*\n\n";
    const slides = [];
    for (const name of Object.keys(zip.files)) {
      // Anchored so only real slide parts match, not look-alike paths.
      const match = /^ppt\/slides\/slide(\d+)\.xml$/.exec(name);
      if (match) {
        slides.push({ name, number: Number(match[1]) });
      }
    }
    if (slides.length === 0) {
      return markdown + "*No slides found*\n";
    }
    // Numeric sort: a plain string sort would put slide10 before slide2.
    slides.sort((a, b) => a.number - b.number);
    for (const slide of slides) {
      const content = await zip.files[slide.name].async("text");
      const runs = Array.from(
        content.matchAll(/<a:t(?:\s[^>]*)?>([^<]*)<\/a:t>/g),
        (match) => match[1]
      );
      if (runs.length === 0) {
        continue;
      }
      markdown += `## Slide ${slide.number}\n\n`;
      const texts = runs.map((run) => this.decodeXmlEntities(run)).filter((text) => text.trim());
      markdown += texts.join("\n\n") + "\n\n";
    }
    return markdown.trim();
  }
  /**
   * Decode the five predefined XML entities and numeric character references
   * in one pass, so text such as "&amp;lt;" is not decoded twice.
   */
  decodeXmlEntities(text) {
    const named = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
    return text.replace(
      /&(#x[0-9a-fA-F]+|#\d+|amp|lt|gt|quot|apos);/g,
      (whole, entity) => {
        if (entity[0] !== "#") {
          return named[entity];
        }
        const isHex = entity[1] === "x" || entity[1] === "X";
        const codePoint = isHex ? parseInt(entity.slice(2), 16) : parseInt(entity.slice(1), 10);
        // Leave out-of-range references untouched instead of throwing.
        return codePoint <= 1114111 ? String.fromCodePoint(codePoint) : whole;
      }
    );
  }
};
// src/converters/txt.ts
var TxtConverter = class extends BaseConverter {
  /** Format identifier handled by this converter. */
  getFormat() {
    return "txt" /* TXT */;
  }
  /** True for the plain-text and Markdown extensions (any case, no dot). */
  canHandle(extension) {
    return ["txt", "text", "md", "markdown"].includes(extension.toLowerCase());
  }
  /**
   * Decode a UTF-8 text buffer and return it as Markdown.
   *
   * @param buffer Raw file bytes, decoded as UTF-8.
   * @param options Optional; line-ending and blank-line cleanup runs unless
   *   `preserveFormatting` is exactly `false`.
   * @returns Conversion result without a warnings list.
   * @throws ConversionError wrapping any failure during decoding.
   */
  async convert(buffer, options) {
    const startTime = Date.now();
    try {
      const decoded = buffer.toString("utf-8");
      const shouldCleanup = options?.preserveFormatting !== false;
      const text = shouldCleanup ? this.cleanupText(decoded) : decoded;
      const metadata = this.createMetadata("txt" /* TXT */, buffer.length, {
        extra: {
          encoding: "utf-8"
        }
      });
      return this.createResult(text, metadata, startTime);
    } catch (error) {
      const isError = error instanceof Error;
      throw new ConversionError(
        `Failed to convert TXT: ${isError ? error.message : "Unknown error"}`,
        "txt" /* TXT */,
        isError ? error : void 0
      );
    }
  }
  /** Convert CRLF to LF, limit consecutive blank lines to one, trim the ends. */
  cleanupText(text) {
    return text.replace(/\r\n/g, "\n").replace(/\n{3,}/g, "\n\n").trim();
  }
};
var HtmlConverter = class extends BaseConverter {
  turndown;
  /** Set up the Turndown instance: ATX headings, fenced code, "-" bullets. */
  constructor() {
    super();
    const turndownOptions = {
      headingStyle: "atx",
      codeBlockStyle: "fenced",
      bulletListMarker: "-"
    };
    this.turndown = new TurndownService(turndownOptions);
  }
  /** Format identifier handled by this converter. */
  getFormat() {
    return "html" /* HTML */;
  }
  /** True when `extension` (without the dot, any case) is "html" or "htm". */
  canHandle(extension) {
    return ["html", "htm"].includes(extension.toLowerCase());
  }
  /**
   * Convert an HTML buffer to Markdown.
   *
   * @param buffer Raw HTML bytes, decoded as UTF-8.
   * @param options Optional; blank-line cleanup runs unless
   *   `preserveFormatting` is exactly `false`.
   * @returns Conversion result whose metadata title comes from `<title>`.
   * @throws ConversionError wrapping any failure during conversion.
   */
  async convert(buffer, options) {
    const startTime = Date.now();
    try {
      const html = buffer.toString("utf-8");
      const converted = this.turndown.turndown(this.extractBodyContent(html));
      const shouldCleanup = options?.preserveFormatting !== false;
      const markdown = shouldCleanup ? this.cleanupMarkdown(converted) : converted;
      const metadata = this.createMetadata("html" /* HTML */, buffer.length, {
        title: this.extractTitle(html),
        extra: {
          encoding: "utf-8"
        }
      });
      return this.createResult(markdown, metadata, startTime);
    } catch (error) {
      const isError = error instanceof Error;
      throw new ConversionError(
        `Failed to convert HTML: ${isError ? error.message : "Unknown error"}`,
        "html" /* HTML */,
        isError ? error : void 0
      );
    }
  }
  /** Limit consecutive blank lines to one and trim the ends. */
  cleanupMarkdown(markdown) {
    return markdown.replace(/\n{3,}/g, "\n\n").trim();
  }
  /** Trimmed text of the first attribute-less `<title>` element, or undefined. */
  extractTitle(html) {
    const found = /<title>([^<]*)<\/title>/i.exec(html);
    if (!found) {
      return void 0;
    }
    return found[1].trim();
  }
  /**
   * Remove `<script>` and `<style>` elements, then return the inside of
   * `<body>` when present, otherwise the whole stripped document.
   */
  extractBodyContent(html) {
    const withoutScripts = html.replace(
      /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi,
      ""
    );
    const stripped = withoutScripts.replace(
      /<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi,
      ""
    );
    const body = /<body[^>]*>([\s\S]*)<\/body>/i.exec(stripped);
    return body ? body[1] : stripped;
  }
};
var ConversionCache = class {
cache;
maxSize;
maxAge;
/**
* Create a new conversion cache
*
* @param maxSize Maximum number of entries (default: 100)
* @param maxAge Maximum age in milliseconds (default: 1 hour)
*/
constructor(maxSize = 100, maxAge = 36e5) {
this.cache = /* @__PURE__ */ new Map();
this.maxSize = maxSize;
this.maxAge = maxAge;
}
/**
* Generate a cache key from buffer content
*/
generateKey(buffer, format) {
const hash = createHash("sha256").update(buffer).digest("hex");
return `${format}:${hash}`;
}
/**
* Get cached result
*/
get(buffer, format) {
const key = this.generateKey(buffer, format);
const entry = this.cache.get(key);
if (!entry) {
return null;
}
const age = Date.now() - entry.timestamp;
if (age > this.maxAge) {
this.cache.delete(key);
return null;
}
entry.hits++;
this.cache.delete(key);
this.cache.set(key, entry);
return entry.result;
}
/**
* Store result in cache
*/
set(buffer, format, result) {
const key = this.generateKey(buffer, format);
if (this.cache.size >= this.maxSize) {
const firstKey = this.cache.keys().next().value;
if (firstKey) {
this.cache.delete(firstKey);
}
}
this.cache.set(key, {
result,
timestamp: Date.now(),
hits: 0
});
}
/**
* Check if result is cached
*/
has(buffer, format) {
return this.get(buffer, format) !== null;
}
/**
* Clear all cache entries
*/
clear() {
this.cache.clear();
}
/**
* Clear expired entries
*/
clearExpired() {
const now = Date.now();
for (const [key, entry] of this.cache.entries()) {
if (now - entry.timestamp > this.maxAge) {
this.cache.delete(key);
}
}
}
/**
* Get cache statistics
*/
getStats() {
let totalHits = 0;
for (const entry of this.cache.values()) {
totalHits += entry.hits;
}
const hitRate = this.cache.size > 0 ? totalHits / this.cache.size : 0;
return {
size: this.cache.size,
maxSize: this.maxSize,
hitRate,
totalHits
};
}
/**
* Get cache size in bytes (approximate)
*/
getMemoryUsage() {
let total = 0;
for (const entry of this.cache.values()) {
total += entry.result.markdown.length * 2;
total += JSON.stringify(entry.result.metadata).length;
}
return total;
}
};
// src/logger.ts
var Logger = class _Logger {
level;
prefix;
timestamps;
constructor(options = {}) {
this.level = options.level ?? 2 /* WARN */;
this.prefix = options.prefix ?? "[Transmutation]";
this.timestamps = options.timestamps ?? false;
}
/**
* Set logging level
*/
setLevel(level) {
this.level = level;
}
/**
* Get current logging level
*/
getLevel() {
return this.level;
}
/**
* Format log message with prefix and timestamp
*/
format(level, message) {
const parts = [];
if (this.timestamps) {
parts.push((/* @__PURE__ */ new Date()).toISOString());
}
parts.push(this.prefix, `[${level}]`, message);
return parts.join(" ");
}
/**
* Log debug message
*/
debug(message, ...args) {
if (this.level <= 0 /* DEBUG */) {
console.debug(this.format("DEBUG", message), ...args);
}
}
/**
* Log info message
*/
info(message, ...args) {
if (this.level <= 1 /* INFO */) {
console.info(this.format("INFO", message), ...args);
}
}
/**
* Log warning message
*/
warn(message, ...args) {
if (this.level <= 2 /* WARN */) {
console.warn(this.format("WARN", message), ...args);
}
}
/**
* Log error message
*/
error(message, error, ...args) {
if (this.level <= 3 /* ERROR */) {
console.error(this.format("ERROR", message), error, ...args);
}
}
/**
* Create a child logger with a different prefix
*/
child(prefix) {
return new _Logger({
level: this.level,
prefix: `${this.prefix}:${prefix}`,
timestamps: this.timestamps
});
}
};
// Shared logger configured from the environment:
//   TRANSMUTATION_LOG_LEVEL      numeric threshold (falls back to 2 = WARN)
//   TRANSMUTATION_LOG_TIMESTAMPS "true" enables ISO timestamps
var defaultLogger = new Logger({
  level: (() => {
    const raw = process.env.TRANSMUTATION_LOG_LEVEL;
    if (!raw) {
      return 2 /* WARN */;
    }
    const parsed = Number.parseInt(raw, 10);
    // A non-numeric value would yield NaN, and every `level <= n` check is
    // false for NaN, which would silently disable all logging, errors included.
    return Number.isNaN(parsed) ? 2 /* WARN */ : parsed;
  })(),
  timestamps: process.env.TRANSMUTATION_LOG_TIMESTAMPS === "true"
});
// src/validation.ts
function validateBuffer(buffer) {
if (!Buffer.isBuffer(buffer)) {
throw new ConversionError(
"Invalid input: expected Buffer, got " + typeof buffer
);
}
if (buffer.length === 0) {
throw new ConversionError("Invalid input: buffer is empty");
}
const maxSize = 500 * 1024 * 1024;
if (buffer.length > maxSize) {
throw new ConversionError(
`Invalid input: buffer size (${(buffer.length / 1024 / 1024).toFixed(2)}MB) exceeds maximum allowed size (${maxSize / 1024 / 1024}MB)`
);
}
}
function validateFormat(format) {
const validFormats = Object.values(DocumentFormat);
if (!validFormats.includes(format)) {
throw new ConversionError(
`Invalid format: "${format}". Supported formats: ${validFormats.filter((f) => f !== "unknown" /* UNKNOWN */).join(", ")}`
);
}
if (format === "unknown" /* UNKNOWN */) {
throw new ConversionError(
"Cannot convert documents with unknown format. Please specify a valid format."
);
}
return format;
}
function validateFilePath(filePath) {
if (typeof filePath !== "string") {
throw new ConversionError(
"Invalid file path: expected string, got " + typeof filePath
);
}
if (filePath.trim().length === 0) {
throw new ConversionError("Invalid file path: path is empty");
}
const dangerousPatterns = [
/\.\./,
// Parent directory traversal
/^\/etc\//,
// System directories
/^\/sys\//,
/^\/proc\//
];
for (const pattern of dangerousPatterns) {
if (pattern.test(filePath)) {
throw new ConversionError(
`Invalid file path: path contains potentially dangerous pattern: ${filePath}`
);
}
}
}
function validateOptions(options) {
if (options === null || options === void 0) {
return;
}
if (typeof options !== "object") {
throw new ConversionError(
"Invalid options: expected object, got " + typeof options
);
}
if ("maxPages" in options) {
if (typeof options.maxPages !== "number") {
throw new ConversionError(
"Invalid option maxPages: expected number, got " + typeof options.maxPages
);
}
if (options.maxPages < 1) {
throw new ConversionError("Invalid option maxPages: must be at least 1");
}
if (!Number.isInteger(options.maxPages)) {
throw new ConversionError("Invalid option maxPages: must be an integer");
}
}
if ("preserveFormatting" in options) {
if (typeof options.preserveFormatting !== "boolean") {
throw new ConversionError(
"Invalid option preserveFormatting: expected boolean, got " + typeof options.preserveFormatting
);
}
}
}
function validateCacheConfig(config) {
if (config === null || config === void 0) {
return;
}
if (typeof config !== "object") {
throw new ConversionError(
"Invalid cache config: expected object, got " + typeof config
);
}
if ("cacheSize" in config) {
if (typeof config.cacheSize !== "number") {
throw new ConversionError(
"Invalid cacheSize: expected number, got " + typeof config.cacheSize
);
}
if (config.cacheSize < 1) {
throw new ConversionError("Invalid cacheSize: must be at least 1");
}
if (config.cacheSize > 1e4) {
throw new ConversionError("Invalid cacheSize: maximum is 10000");
}
}
if ("cacheMaxAge" in config) {
if (typeof config.cacheMaxAge !== "number") {
throw new ConversionError(
"Invalid cacheMaxAge: expected number, got " + typeof config.cacheMaxAge
);
}
if (config.cacheMaxAge < 0) {
throw new ConversionError("Invalid cacheMaxAge: must be non-negative");
}
}
if ("enableCache" in config) {
if (typeof config.enableCache !== "boolean") {
throw new ConversionError(
"Invalid enableCache: expected boolean, got " + typeof config.enableCache
);
}
}
}
// src/metrics.ts
var MetricsCollector = class {
metrics;
startTime;
constructor() {
this.metrics = {
totalConversions: 0,
successfulConversions: 0,
failedConversions: 0,
totalBytesProcessed: 0,
totalTimeMs: 0,
cacheHits: 0,
cacheMisses: 0,
errorsByType: /* @__PURE__ */ new Map(),
conversionsByFormat: /* @__PURE__ */ new Map()
};
this.startTime = Date.now();
}
/**
* Record a successful conversion
*/
recordSuccess(format, bytes, timeMs, fromCache) {
this.metrics.totalConversions++;
this.metrics.successfulConversions++;
this.metrics.totalBytesProcessed += bytes;
this.metrics.totalTimeMs += timeMs;
if (fromCache) {
this.metrics.cacheHits++;
} else {
this.metrics.cacheMisses++;
}
const count = this.metrics.conversionsByFormat.get(format) || 0;
this.metrics.conversionsByFormat.set(format, count + 1);
}
/**
* Record a failed conversion
*/
recordFailure(format, errorType) {
this.metrics.totalConversions++;
this.metrics.failedConversions++;
const count = this.metrics.errorsByType.get(errorType) || 0;
this.metrics.errorsByType.set(errorType, count + 1);
const formatCount = this.metrics.conversionsByFormat.get(format) || 0;
this.metrics.conversionsByFormat.set(format, formatCount + 1);
}
/**
* Get current metrics
*/
getMetrics() {
return { ...this.metrics };
}
/**
* Get metrics summary
*/
getSummary() {
const uptime = Date.now() - this.startTime;
const successRate = this.metrics.totalConversions > 0 ? this.metrics.successfulConversions / this.metrics.totalConversions * 100 : 0;
const totalCacheOps = this.metrics.cacheHits + this.metrics.cacheMisses;
const cacheHitRate = totalCacheOps > 0 ? this.metrics.cacheHits / totalCacheOps * 100 : 0;
const avgConversionTime = this.metrics.successfulConversions > 0 ? this.metrics.totalTimeMs / this.metrics.successfulConversions : 0;
const avgThroughput = this.metrics.totalTimeMs > 0 ? this.metrics.totalBytesProcessed / this.metrics.totalTimeMs * 1e3 : 0;
return {
uptime,
successRate,
cacheHitRate,
avgConversionTime,
avgThroughput
};
}
/**
* Reset all metrics
*/
reset() {
this.metrics = {
totalConversions: 0,
successfulConversions: 0,
failedConversions: 0,
totalBytesProcessed: 0,
totalTimeMs: 0,
cacheHits: 0,
cacheMisses: 0,
errorsByType: /* @__PURE__ */ new Map(),
conversionsByFormat: /* @__PURE__ */ new Map()
};
this.startTime = Date.now();
}
/**
* Export metrics as JSON
*/
toJSON() {
const summary = this.getSummary();
return {
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
uptime: summary.uptime,
metrics: {
...this.metrics,
errorsByType: Object.fromEntries(this.metrics.errorsByType),
conversionsByFormat: Object.fromEntries(
this.metrics.conversionsByFormat
)
},
summary: {
successRate: summary.successRate.toFixed(2) + "%",
cacheHitRate: summary.cacheHitRate.toFixed(2) + "%",
avgConversionTime: summary.avgConversionTime.toFixed(2) + "ms",
avgThroughput: (summary.avgThroughput / 1024).toFixed(2) + " KB/s"
}
};
}
};
// src/index.ts
var Converter = class {
  converters;
  cache;
  logger;
  validateInput;
  metrics;
  /**
   * Create a converter with the built-in format handlers registered.
   *
   * @param config Optional settings read here: `logger`, `validateInput`
   *   (default true), `collectMetrics`, `enableCache`, `cacheSize`
   *   (default 100) and `cacheMaxAge` in ms (default 1 hour).
   * @throws ConversionError when the cache settings are invalid.
   */
  constructor(config) {
    this.converters = /* @__PURE__ */ new Map();
    this.logger = config?.logger || defaultLogger;
    this.validateInput = config?.validateInput ?? true;
    if (config?.collectMetrics) {
      this.metrics = new MetricsCollector();
      this.logger.info("Metrics collection enabled");
    }
    if (config) {
      validateCacheConfig(config);
    }
    if (config?.enableCache) {
      const cacheSize = config.cacheSize || 100;
      // Only fall back when no usable number was given: an explicit 0 is a
      // valid (non-negative) max age and must not be replaced by the default.
      const cacheMaxAge = typeof config.cacheMaxAge === "number" && !Number.isNaN(config.cacheMaxAge) ? config.cacheMaxAge : 36e5;
      this.cache = new ConversionCache(cacheSize, cacheMaxAge);
      this.logger.info(
        `Cache enabled: size=${cacheSize}, maxAge=${cacheMaxAge}ms`
      );
    }
    this.registerConverter(new PdfConverter());
    this.registerConverter(new DocxConverter());
    this.registerConverter(new XlsxConverter());
    this.registerConverter(new PptxConverter());
    this.registerConverter(new TxtConverter());
    this.registerConverter(new HtmlConverter());
  }
  /**
   * Register a converter under the format it reports; replaces any
   * converter already registered for that format.
   */
  registerConverter(converter) {
    this.converters.set(converter.getFormat(), converter);
  }
  /**
   * Detect file format from the path's extension.
   *
   * @returns The format of the first registered converter that accepts the
   *   extension, or "unknown".
   */
  detectFormat(filePath) {
    const ext = extname(filePath).toLowerCase().replace(".", "");
    for (const converter of this.converters.values()) {
      if (converter.canHandle(ext)) {
        return converter.getFormat();
      }
    }
    return "unknown" /* UNKNOWN */;
  }
  /**
   * Get converter for a specific format.
   *
   * @throws ConversionError when no converter is registered for it.
   */
  getConverter(format) {
    const converter = this.converters.get(format);
    if (!converter) {
      throw new ConversionError(`No converter available for format: ${format}`);
    }
    return converter;
  }
  /**
   * Build the cache namespace for a conversion: the format alone when no
   * options are set, otherwise the format plus the defined options in sorted
   * key order. Results produced with different options (for example
   * different `maxPages`) therefore never share a cache entry.
   */
  getCacheNamespace(format, options) {
    const entries = Object.entries(options ?? {}).filter(([, value]) => value !== void 0).sort(([a], [b]) => a < b ? -1 : a > b ? 1 : 0);
    return entries.length === 0 ? format : `${format}?${JSON.stringify(entries)}`;
  }
  /**
   * Convert a file buffer to markdown.
   *
   * @param buffer Raw document bytes.
   * @param format Format identifier of the document.
   * @param options Optional conversion options passed to the converter.
   * @returns The conversion result, served from the cache when available.
   * @throws ConversionError on invalid input or a failed conversion.
   */
  async convertBuffer(buffer, format, options) {
    const startTime = Date.now();
    if (this.validateInput) {
      validateBuffer(buffer);
      validateFormat(format);
      validateOptions(options);
    }
    this.logger.debug(`Converting ${format} buffer (${buffer.length} bytes)`);
    const cacheNamespace = this.cache ? this.getCacheNamespace(format, options) : format;
    if (this.cache) {
      const cached = this.cache.get(buffer, cacheNamespace);
      if (cached) {
        this.logger.debug(`Cache hit for ${format}`);
        if (this.metrics) {
          const elapsed = Date.now() - startTime;
          this.metrics.recordSuccess(format, buffer.length, elapsed, true);
        }
        return cached;
      }
      this.logger.debug(`Cache miss for ${format}`);
    }
    try {
      const converter = this.getConverter(format);
      const result = await converter.convert(buffer, options);
      const elapsed = Date.now() - startTime;
      this.logger.info(
        `Converted ${format} in ${elapsed}ms (${(buffer.length / 1024).toFixed(2)} KB)`
      );
      if (this.cache) {
        this.cache.set(buffer, cacheNamespace, result);
        this.logger.debug(`Cached ${format} result`);
      }
      if (this.metrics) {
        this.metrics.recordSuccess(format, buffer.length, elapsed, false);
      }
      return result;
    } catch (error) {
      const elapsed = Date.now() - startTime;
      this.logger.error(
        `Conversion failed for ${format} after ${elapsed}ms`,
        error
      );
      if (this.metrics) {
        // Anything can be thrown, including null or primitives.
        const errorType = error?.constructor?.name ?? "Unknown";
        this.metrics.recordFailure(format, errorType);
      }
      throw error;
    }
  }
  /**
   * Convert a file to markdown. The format is detected from the extension.
   *
   * @param filePath Path of the file to read.
   * @param options Optional conversion options.
   * @throws ConversionError for an invalid path, an unsupported extension,
   *   a missing file (ENOENT) or a permission problem (EACCES).
   */
  async convertFile(filePath, options) {
    if (this.validateInput) {
      validateFilePath(filePath);
    }
    this.logger.debug(`Converting file: ${filePath}`);
    const format = this.detectFormat(filePath);
    if (format === "unknown" /* UNKNOWN */) {
      const ext = extname(filePath);
      this.logger.error(`Unsupported file format: ${ext}`);
      throw new ConversionError(
        `Unsupported file format: ${ext}. Supported formats: ${this.getSupportedFormats().join(", ")}`
      );
    }
    try {
      const buffer = await readFile(filePath);
      return this.convertBuffer(buffer, format, options);
    } catch (error) {
      const code = error?.code;
      if (code === "ENOENT") {
        throw new ConversionError(`File not found: ${filePath}`);
      }
      if (code === "EACCES") {
        throw new ConversionError(`Permission denied: ${filePath}`);
      }
      throw error;
    }
  }
  /**
   * Get list of supported formats (the registered format identifiers).
   */
  getSupportedFormats() {
    return Array.from(this.converters.keys());
  }
  /**
   * Check if a file format is supported, judging by the path's extension.
   */
  isSupported(filePath) {
    return this.detectFormat(filePath) !== "unknown" /* UNKNOWN */;
  }
  /**
   * Clear the conversion cache (no-op when caching is disabled).
   */
  clearCache() {
    this.cache?.clear();
  }
  /**
   * Get cache statistics (undefined when caching is disabled).
   */
  getCacheStats() {
    return this.cache?.getStats();
  }
  /**
   * Get cache memory usage in bytes (0 when caching is disabled).
   */
  getCacheMemoryUsage() {
    return this.cache?.getMemoryUsage() || 0;
  }
  /**
   * Get conversion metrics (undefined when metrics collection is disabled).
   */
  getMetrics() {
    return this.metrics?.getMetrics();
  }
  /**
   * Get metrics summary (undefined when metrics collection is disabled).
   */
  getMetricsSummary() {
    return this.metrics?.getSummary();
  }
  /**
   * Reset metrics (no-op when metrics collection is disabled).
   */
  resetMetrics() {
    this.metrics?.reset();
  }
  /**
   * Export metrics as JSON (undefined when metrics collection is disabled).
   */
  exportMetrics() {
    return this.metrics?.toJSON();
  }
};
// src/cli.ts
// Root CLI command: name, help description and reported version.
var program = new Command();
program.name("transmutation-lite");
program.description(
  "Simplified document converter for PDF, DOCX, XLSX, PPTX to Markdown"
);
program.version("0.1.0");
// `convert <file>`: convert one document and write the Markdown next to it
// (or to --output). Exits with status 1 on an unsupported format or failure.
program
  .command("convert <file>")
  .description("Convert a single file to Markdown")
  .option("-o, --output <path>", "Output file path (default: <filename>.md)")
  .option(
    "-m, --max-pages <number>",
    "Maximum pages/sheets to process",
    // Commander calls the parser with (value, previousValue). Passing
    // parseInt directly would use previousValue as the radix, so wrap it.
    (value) => Number.parseInt(value, 10)
  )
  .option("--no-preserve-formatting", "Disable formatting preservation")
  .action(async (file, options) => {
    try {
      const converter = new Converter();
      if (!converter.isSupported(file)) {
        console.error(`\u274C Unsupported file format: ${extname(file)}`);
        process.exit(1);
      }
      console.log(`\u{1F4C4} Converting: ${basename(file)}`);
      const conversionOptions = {
        preserveFormatting: options.preserveFormatting
      };
      // Only forward maxPages when the flag was given: an explicit
      // `maxPages: undefined` is rejected by option validation.
      if (options.maxPages !== void 0) {
        conversionOptions.maxPages = options.maxPages;
      }
      const result = await converter.convertFile(file, conversionOptions);
      const outputPath = options.output || file.replace(/\.[^.]+$/, ".md");
      await mkdir(dirname(outputPath), { recursive: true });
      await writeFile(outputPath, result.markdown, "utf-8");
      console.log(`\u2705 Converted successfully!`);
      console.log(` Format: ${result.metadata.format}`);
      console.log(` Pages: ${result.metadata.pageCount || "N/A"}`);
      console.log(` Time: ${result.conversionTimeMs}ms`);
      console.log(` Output: ${outputPath}`);
      if (result.warnings && result.warnings.length > 0) {
        console.log(`\u26A0\uFE0F Warnings:`);
        result.warnings.forEach((warning) => console.log(` - ${warning}`));
      }
    } catch (error) {
      console.error(
        `\u274C Conversion failed: ${error instanceof Error ? error.message : "Unknown error"}`
      );
      process.exit(1);
    }
  });
// `batch <directory>`: convert every supported file in a directory, a few at
// a time, mirroring the directory layout under the output directory.
// Exits with status 1 when nothing is found or any file fails.
program
  .command("batch <directory>")
  .description("Convert all supported files in a directory")
  .option(
    "-o, --output <path>",
    "Output directory (default: <directory>/output)"
  )
  .option("-r, --recursive", "Process subdirectories recursively")
  .option(
    "-m, --max-pages <number>",
    "Maximum pages/sheets to process",
    // Commander calls the parser with (value, previousValue); wrap parseInt
    // so previousValue is never used as the radix.
    (value) => Number.parseInt(value, 10)
  )
  .option(
    "--parallel <number>",
    "Number of parallel conversions",
    // With parseInt passed directly, the default 4 became the radix, so
    // "--parallel 8" parsed to NaN and no file was ever processed.
    (value) => Number.parseInt(value, 10),
    4
  )
  .option("--no-preserve-formatting", "Disable formatting preservation")
  .action(async (directory, options) => {
    try {
      const batchSize = options.parallel;
      // A non-positive or NaN step would stall or skip the batching loop.
      if (!Number.isInteger(batchSize) || batchSize < 1) {
        console.error("\u274C Invalid --parallel value: must be a positive integer");
        process.exit(1);
      }
      const converter = new Converter();
      const files = await findFiles(directory, options.recursive);
      const supportedFiles = files.filter(
        (file) => converter.isSupported(file)
      );
      if (supportedFiles.length === 0) {
        console.log("\u274C No supported files found");
        process.exit(1);
      }
      console.log(`\u{1F4C1} Found ${supportedFiles.length} supported files`);
      const outputDir = options.output || join(directory, "output");
      await mkdir(outputDir, { recursive: true });
      const conversionOptions = {
        preserveFormatting: options.preserveFormatting
      };
      // Only forward maxPages when the flag was given: an explicit
      // `maxPages: undefined` is rejected by option validation.
      if (options.maxPages !== void 0) {
        conversionOptions.maxPages = options.maxPages;
      }
      let completed = 0;
      let failed = 0;
      for (let i = 0; i < supportedFiles.length; i += batchSize) {
        const batch = supportedFiles.slice(i, i + batchSize);
        await Promise.all(
          batch.map(async (file) => {
            try {
              const result = await converter.convertFile(
                file,
                conversionOptions
              );
              // path.relative copes with "." and "./dir" style arguments,
              // which plain string replacement mangled.
              const relativePath = relative(directory, file);
              const outputPath = join(
                outputDir,
                relativePath.replace(/\.[^.]+$/, ".md")
              );
              await mkdir(dirname(outputPath), { recursive: true });
              await writeFile(outputPath, result.markdown, "utf-8");
              completed++;
              console.log(
                `\u2705 [${completed}/${supportedFiles.length}] ${basename(file)} (${result.conversionTimeMs}ms)`
              );
            } catch (error) {
              failed++;
              console.error(
                `\u274C [${completed + failed}/${supportedFiles.length}] ${basename(file)}: ${error instanceof Error ? error.message : "Unknown error"}`
              );
            }
          })
        );
      }
      console.log(`
\u{1F4CA} Summary:`);
      console.log(` Total: ${supportedFiles.length}`);
      console.log(` Success: ${completed}`);
      console.log(` Failed: ${failed}`);
      console.log(` Output: ${outputDir}`);
      if (failed > 0) {
        process.exit(1);
      }
    } catch (error) {
      console.error(
        `\u274C Batch conversion failed: ${error instanceof Error ? error.message : "Unknown error"}`
      );
      process.exit(1);
    }
  });
// `formats`: print every format a default Converter has a handler for.
program
  .command("formats")
  .description("List supported file formats")
  .action(() => {
    const supported = new Converter().getSupportedFormats();
    console.log("\u{1F4CB} Supported formats:");
    for (const format of supported) {
      console.log(` - ${format.toUpperCase()}`);
    }
  });
async function findFiles(dir, recursive = false) {
const files = [];
const entries = await readdir(dir);
for (const entry of entries) {
const fullPath = join(dir, entry);
const stats = await stat(fullPath);
if (stats.isDirectory() && recursive) {
const subFiles = await findFiles(fullPath, recursive);
files.push(...subFiles);
} else if (stats.isFile()) {
files.push(fullPath);
}
}
return files;
}
// The command actions are async, so parse with parseAsync() to have their
// promises awaited; anything that still escapes is reported and exits with 1.
program.parseAsync().catch((error) => {
  console.error(
    `\u274C ${error instanceof Error ? error.message : "Unknown error"}`
  );
  process.exit(1);
});
//# sourceMappingURL=cli.js.map
//# sourceMappingURL=cli.js.map