// @knowcode/convert-to-markdown
// Version:
// Convert Excel, PDF, and Word documents to clean, AI-ready formats like Markdown and JSON
// 198 lines (172 loc) • 4.82 kB
// TypeScript
/**
* Convert to Markdown - TypeScript Definitions
*/
/**
 * Optional settings bag accepted as the second argument of every converter
 * declared in this file. Both members may be omitted.
 */
export interface ConversionOptions {
  /** Name of the original file; used for metadata. */
  filename?: string;

  /** Prefix used to filter sheets. Applies to Excel input only. */
  sheetPrefix?: string;
}
/**
 * Statistics entry for a single sheet; this is the element type of
 * `ExcelJsonStatistics.sheets`.
 */
export interface ExcelSheet {
  /** Sheet name. */
  name: string;
  /** Row count reported for the sheet. */
  rowCount: number;
  /** Column count reported for the sheet. */
  columnCount: number;
  /** Cell count reported for the sheet. */
  cellCount: number;
  /** Count of cells that are not empty. */
  nonEmptyCells: number;
  /**
   * Empty-cell rate. Typed as either a number or a string; the exact
   * format and units are not specified by this declaration.
   */
  emptyRate: number | string;
}
/**
 * Statistics object carried by `ExcelJsonResult.statistics`.
 */
export interface ExcelJsonStatistics {
  /** Name of the file the statistics refer to. */
  fileName: string;
  /**
   * File size in three units. `bytes` is numeric, while `KB` and `MB` are
   * strings (presumably pre-formatted values - confirm against the
   * implementation).
   */
  fileSize: { bytes: number; KB: string; MB: string };
  /** Per-sheet statistics. */
  sheets: Array<ExcelSheet>;
  /** Workbook-level sheet count. */
  totalSheets: number;
  /** Workbook-level cell count. */
  totalCells: number;
  /** Workbook-level count of non-empty cells. */
  totalNonEmptyCells: number;
  /**
   * Overall empty-cell rate. Typed as either a number or a string; the
   * exact format is not specified by this declaration.
   */
  overallEmptyRate: number | string;
  /** Estimated token count. */
  estimatedTokens: number;
}
/**
 * Value resolved by the Excel-to-JSON converters (`ConvertToMarkdown.excelToJson`
 * and `excel.toJson`).
 */
export interface ExcelJsonResult {
  /** The converted data, serialized as a JSON string. */
  content: string;

  /** Statistics describing the conversion. */
  statistics: ExcelJsonStatistics;
}
/**
 * Base statistics shape shared by the Markdown-producing converters and
 * returned by `utils.calculateStats`.
 */
export interface MarkdownStatistics {
  /** Size in bytes, as a number. */
  sizeInBytes: number;
  /** Size in KB, typed as a string (presumably pre-formatted - confirm). */
  sizeInKB: string;
  /** Size in MB, typed as a string (presumably pre-formatted - confirm). */
  sizeInMB: string;
  /** Line count. */
  numberOfLines: number;
  /** Paragraph count. */
  numberOfParagraphs: number;
  /** Heading count. */
  numberOfHeadings: number;
  /** Estimated token count. */
  estimatedTokens: number;
  /**
   * Open index signature: any additional string key is permitted and its
   * value is untyped (`any`), so extra statistics are not type-checked.
   */
  [extraKey: string]: any;
}
/**
 * Value resolved by the Excel-to-Markdown converters
 * (`ConvertToMarkdown.excelToMarkdown` and `excel.toMarkdown`).
 */
export interface ExcelMarkdownResult {
  /** The generated Markdown document, as a string. */
  document: string;

  /**
   * Statistics describing the conversion: the common `MarkdownStatistics`
   * members plus the Excel-specific ones below.
   */
  stats: MarkdownStatistics & {
    /** Sheet count. */
    numberOfSheets: number;
    /** Row and column counts, keyed by string (presumably the sheet name - confirm). */
    sheets: {
      [key: string]: { rowCount: number; columnCount: number };
    };
    /** Table count. */
    numberOfTables: number;
    /** Estimated tokens per sheet, as a number. */
    estimatedTokensPerSheet: number;
  };
}
/**
 * Value resolved by the PDF-to-Markdown converters
 * (`ConvertToMarkdown.pdfToMarkdown` and `pdf.toMarkdown`).
 */
export interface PdfMarkdownResult {
  /** The generated Markdown document, as a string. */
  document: string;

  /**
   * Statistics describing the conversion: the common `MarkdownStatistics`
   * members plus the PDF-specific ones below.
   */
  stats: MarkdownStatistics & {
    /** Page count. */
    numberOfPages: number;
    /** Table count. */
    numberOfTables: number;
    /** Estimated tokens per line; note this one is typed as a string. */
    estimatedTokensPerLine: string;
    /** Untyped PDF information, keyed by string; contents are not specified here. */
    pdfInfo: { [key: string]: any };
  };
}
/**
 * Statistics object carried by `WordHtmlResult.statistics`.
 */
export interface WordHtmlStatistics {
  /**
   * File size in three units. `bytes` is numeric, while `KB` and `MB` are
   * strings (presumably pre-formatted values - confirm against the
   * implementation).
   */
  fileSize: { bytes: number; KB: string; MB: string };
  /** Untyped list of messages; element shape is not specified by this declaration. */
  messages: Array<any>;
  /** Length of the HTML output, as a number. */
  htmlLength: number;
}
/**
 * Value resolved by the Word-to-HTML converters (`ConvertToMarkdown.wordToHtml`
 * and `word.toHtml`).
 */
export interface WordHtmlResult {
  /** The generated HTML, as a string. */
  html: string;

  /** Statistics describing the conversion. */
  statistics: WordHtmlStatistics;
}
/**
 * Value resolved by the Word-to-Markdown converters
 * (`ConvertToMarkdown.wordToMarkdown` and `word.toMarkdown`).
 *
 * Unlike the Excel and PDF Markdown results, the members here are named
 * `markdown` / `statistics` rather than `document` / `stats`.
 */
export interface WordMarkdownResult {
  /** The generated Markdown document, as a string. */
  markdown: string;

  /**
   * Statistics describing the conversion: the common `MarkdownStatistics`
   * members plus the Word-specific ones below.
   */
  statistics: MarkdownStatistics & {
    /** Heading count; repeats the member already declared on `MarkdownStatistics` with the same type. */
    numberOfHeadings: number;
    /** List count. */
    numberOfLists: number;
    /** Link count. */
    numberOfLinks: number;
    /** Table count. */
    numberOfTables: number;
    /** Untyped list of conversion messages; element shape is not specified here. */
    conversionMessages: Array<any>;
  };
}
/**
 * Main converter class.
 *
 * Exposes one static method per supported conversion. Every method takes the
 * same two arguments and returns a promise of the matching result type.
 *
 * NOTE(review): `input` may be a `Buffer` or a `string`; whether a string is
 * treated as a file path or as file contents is not visible from this
 * declaration - confirm against the implementation.
 */
export class ConvertToMarkdown {
  /**
   * Convert Excel file to JSON format.
   *
   * @param input - The Excel input, as a `Buffer` or a `string`.
   * @param options - Optional conversion settings.
   * @returns A promise of the JSON content string and its statistics.
   */
  static excelToJson(
    input: Buffer | string,
    options?: ConversionOptions
  ): Promise<ExcelJsonResult>;

  /**
   * Convert Excel file to Markdown format.
   *
   * @param input - The Excel input, as a `Buffer` or a `string`.
   * @param options - Optional conversion settings.
   * @returns A promise of the Markdown document and its statistics.
   */
  static excelToMarkdown(
    input: Buffer | string,
    options?: ConversionOptions
  ): Promise<ExcelMarkdownResult>;

  /**
   * Convert PDF file to Markdown format.
   *
   * @param input - The PDF input, as a `Buffer` or a `string`.
   * @param options - Optional conversion settings.
   * @returns A promise of the Markdown document and its statistics.
   */
  static pdfToMarkdown(
    input: Buffer | string,
    options?: ConversionOptions
  ): Promise<PdfMarkdownResult>;

  /**
   * Convert Word document to HTML format.
   *
   * @param input - The Word input, as a `Buffer` or a `string`.
   * @param options - Optional conversion settings.
   * @returns A promise of the HTML string and its statistics.
   */
  static wordToHtml(
    input: Buffer | string,
    options?: ConversionOptions
  ): Promise<WordHtmlResult>;

  /**
   * Convert Word document to Markdown format.
   *
   * @param input - The Word input, as a `Buffer` or a `string`.
   * @param options - Optional conversion settings.
   * @returns A promise of the Markdown string and its statistics.
   */
  static wordToMarkdown(
    input: Buffer | string,
    options?: ConversionOptions
  ): Promise<WordMarkdownResult>;
}
/**
 * Direct converter functions
 *
 * Excel converters. Both functions are typed to accept a `Buffer` only,
 * whereas the `ConvertToMarkdown` static methods also accept a `string`.
 */
export namespace excel {
  /**
   * Excel to JSON.
   *
   * @param buffer - The Excel input as a `Buffer`.
   * @param options - Optional conversion settings.
   * @returns A promise of the JSON content string and its statistics.
   */
  export function toJson(
    buffer: Buffer,
    options?: ConversionOptions
  ): Promise<ExcelJsonResult>;

  /**
   * Excel to Markdown.
   *
   * @param buffer - The Excel input as a `Buffer`.
   * @param options - Optional conversion settings.
   * @returns A promise of the Markdown document and its statistics.
   */
  export function toMarkdown(
    buffer: Buffer,
    options?: ConversionOptions
  ): Promise<ExcelMarkdownResult>;
}
/**
 * PDF converter. Typed to accept a `Buffer` only, whereas
 * `ConvertToMarkdown.pdfToMarkdown` also accepts a `string`.
 */
export namespace pdf {
  /**
   * PDF to Markdown.
   *
   * @param buffer - The PDF input as a `Buffer`.
   * @param options - Optional conversion settings.
   * @returns A promise of the Markdown document and its statistics.
   */
  export function toMarkdown(
    buffer: Buffer,
    options?: ConversionOptions
  ): Promise<PdfMarkdownResult>;
}
/**
 * Word converters. Both functions are typed to accept a `Buffer` only,
 * whereas the `ConvertToMarkdown` static methods also accept a `string`.
 */
export namespace word {
  /**
   * Word to HTML.
   *
   * @param buffer - The Word input as a `Buffer`.
   * @param options - Optional conversion settings.
   * @returns A promise of the HTML string and its statistics.
   */
  export function toHtml(
    buffer: Buffer,
    options?: ConversionOptions
  ): Promise<WordHtmlResult>;

  /**
   * Word to Markdown.
   *
   * @param buffer - The Word input as a `Buffer`.
   * @param options - Optional conversion settings.
   * @returns A promise of the Markdown string and its statistics.
   */
  export function toMarkdown(
    buffer: Buffer,
    options?: ConversionOptions
  ): Promise<WordMarkdownResult>;
}
/**
 * Utility functions
 *
 * Only the signatures are declared here; the notes below restate what the
 * types say and do not describe implementation details.
 */
export namespace utils {
  /** Takes a text string and returns a numeric token estimate. */
  export function estimateTokens(text: string): number;

  /** Takes a text string and returns a string cleaned for Markdown. */
  export function cleanTextForMarkdown(text: string): string;

  /**
   * Produces a Markdown table string from an untyped data array.
   *
   * @param data - Array of untyped entries.
   * @param headers - Optional list of header strings.
   */
  export function dataToMarkdownTable(
    data: any[],
    headers?: string[]
  ): string;

  /** Reports, as a boolean, whether the given (untyped) row is empty. */
  export function isRowEmpty(row: any): boolean;

  /** Takes an untyped row and returns an untyped cleaned value. */
  export function cleanRowData(row: any): any;

  /** Produces a metadata header string from a string-keyed metadata object. */
  export function createMetadataHeader(
    metadata: Record<string, any>
  ): string;

  /**
   * Produces a `MarkdownStatistics` object for the given content.
   *
   * @param content - The content string to compute statistics for.
   * @param additionalStats - Optional string-keyed extra statistics
   *   (presumably merged into the result - confirm).
   */
  export function calculateStats(
    content: string,
    additionalStats?: Record<string, any>
  ): MarkdownStatistics;
}
/** Default export: the `ConvertToMarkdown` class, which is also available as a named export. */
export default ConvertToMarkdown;