taskflow-ai

/**
 * Document processor - supports parsing documents in multiple formats.
 * Serves as a core component of the PRD parsing engine.
 *
 * NOTE(review): this file contains declarations only (no implementations),
 * so runtime behavior described below is taken from the original comments
 * and member names, not from visible code.
 */
import { Logger } from '../../infra/logger';
/**
 * Document type enumeration.
 * Each member is a string-valued identifier for a supported document format.
 */
export declare enum DocumentType {
    MARKDOWN = "markdown",
    TEXT = "text",
    JSON = "json",
    HTML = "html",
    WORD = "word",
    PDF = "pdf"
}
/**
 * Document structure interface.
 * Groups a document's title, its sections, and its metadata.
 */
export interface DocumentStructure {
    title: string;
    // Sections may nest further via DocumentSection.subsections.
    sections: DocumentSection[];
    metadata: DocumentMetadata;
}
/**
 * Document section interface.
 * A section is recursive: it carries its own list of subsections.
 */
export interface DocumentSection {
    id: string;
    title: string;
    content: string;
    // NOTE(review): presumably the nesting/heading depth - base value not visible here; confirm.
    level: number;
    subsections: DocumentSection[];
    type: SectionType;
    keywords: string[];
    // NOTE(review): scale/range of this score is not visible here; confirm against the implementation.
    importance: number;
}
/**
 * Section type enumeration.
 * Each member is a string-valued category that can be assigned to a section.
 */
export declare enum SectionType {
    OVERVIEW = "overview",
    REQUIREMENTS = "requirements",
    FEATURES = "features",
    TECHNICAL = "technical",
    TIMELINE = "timeline",
    RESOURCES = "resources",
    APPENDIX = "appendix",
    OTHER = "other"
}
/**
 * Document metadata interface.
 */
export interface DocumentMetadata {
    fileName: string;
    // NOTE(review): presumably bytes - unit not visible here; confirm.
    fileSize: number;
    createdAt: Date;
    modifiedAt: Date;
    documentType: DocumentType;
    // NOTE(review): format of the language value (e.g. code vs. name) is not visible here; confirm.
    language: string;
    wordCount: number;
    // NOTE(review): time unit is not visible here; confirm.
    estimatedReadTime: number;
}
/**
 * Document processing options.
 * Every flag is optional; the defaults applied when a flag is omitted are
 * not visible in this declaration.
 */
export interface ProcessingOptions {
    extractTables?: boolean;
    extractImages?: boolean;
    detectLanguage?: boolean;
    analyzeStructure?: boolean;
    extractKeywords?: boolean;
    calculateImportance?: boolean;
}
/**
 * Document processor class.
 *
 * The only public members are the constructor and `processDocument`;
 * everything else is declared `private`. Private members are declared here
 * by name only, without type information.
 */
export declare class DocumentProcessor {
    private logger;
    private markdownParser;
    constructor(logger: Logger);
    /**
     * Processes a document file.
     * @param filePath File path
     * @param options Processing options
     * @returns A promise that resolves to the document's structure
     */
    processDocument(filePath: string, options?: ProcessingOptions): Promise<DocumentStructure>;
    /**
     * Detects the document type.
     * @param filePath File path
     */
    private detectDocumentType;
    /**
     * Reads the file content.
     * @param filePath File path
     * @param documentType Document type
     */
    private readFileContent;
    /**
     * Parses the document structure.
     * @param content Document content
     * @param documentType Document type
     * @param options Processing options
     */
    private parseDocumentStructure;
    /**
     * Parses the structure of a Markdown document.
     * @param content Markdown content
     * @param options Processing options
     */
    private parseMarkdownStructure;
    /**
     * Parses the structure of a plain-text document.
     * @param content Text content
     * @param options Processing options
     */
    private parseTextStructure;
    /**
     * Parses the structure of a JSON document.
     * @param content JSON content
     * @param options Processing options
     */
    private parseJsonStructure;
    /**
     * Parses the structure of an HTML document.
     * @param content HTML content
     * @param options Processing options
     */
    private parseHtmlStructure;
    /**
     * Creates a section from an object.
     * @param obj Object
     * @param id Section ID
     * @param level Level
     */
    private createSectionFromObject;
    /**
     * Builds the section hierarchy.
     * @param sections Flat list of sections
     */
    private buildSectionHierarchy;
    /**
     * Classifies the section type.
     * @param title Section title
     */
    private classifySectionType;
    /**
     * Extracts keywords.
     * @param text Text content
     */
    private extractKeywords;
    /**
     * Calculates the importance of a section.
     * @param title Title
     * @param content Content
     */
    private calculateImportance;
    /**
     * Extracts the document title.
     * @param content Document content
     * @param documentType Document type
     */
    private extractTitle;
    /**
     * Detects the document language.
     * @param content Document content
     */
    private detectLanguage;
    /**
     * Counts the number of words.
     * @param content Document content
     */
    private countWords;
}