UNPKG

document-outline-extractor

Version:

Extract structured outlines from documents with optional AI enhancement

212 lines 8.39 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.OutlineExtractor = void 0; const openai_1 = require("./openai"); const utils_1 = require("./utils"); class OutlineExtractor { constructor(config) { this.config = { chunkSize: config?.chunkSize || 5000, qualityThreshold: config?.qualityThreshold || 0.8, defaultFormat: config?.defaultFormat || 'tree', caching: config?.caching !== false, ...config }; // Initialize OpenAI client if configuration provided if (this.config.openai) { this.openaiClient = new openai_1.OpenAIClient(this.config.openai); } this.cache = new Map(); } /** * Extract outline from document content */ async extract(content, options) { const cacheKey = this.getCacheKey(content, options); if (this.config.caching && this.cache.has(cacheKey)) { return this.cache.get(cacheKey); } const quality = this.evaluateQuality(content); let outline; if (quality.score >= this.config.qualityThreshold) { // Use existing outline from document outline = this.extractExistingOutline(content, options); } else { // Generate outline using AI or fallback outline = await this.generateOutline(content, options); } if (this.config.caching) { this.cache.set(cacheKey, outline); } return outline; } /** * Evaluate the quality of existing outline in document */ evaluateQuality(content) { const headings = (0, utils_1.extractHeadings)(content); const lines = content.split(/\r?\n/); const score = (0, utils_1.computeOutlineScore)(headings, lines.length); const levels = headings.map(h => h.level); const uniqueLevels = new Set(levels); return { score, richness: uniqueLevels.size / 6, balance: this.calculateBalance(headings), coherence: this.calculateCoherence(headings), coverage: headings.length / lines.length, headingCount: headings.length, depth: Math.max(...levels, 0) }; } /** * Split document into manageable chunks */ splitDocument(content, strategy) { const chunkStrategy = strategy || 'smart'; const maxSize = this.config.chunkSize; switch (chunkStrategy) { case 'heading': return (0, utils_1.splitByHeadings)(content, maxSize); case 'size': return (0, utils_1.splitBySize)(content, maxSize); case 'smart': default: return (0, utils_1.smartSplit)(content, maxSize); } } /** * Clear the internal cache */ clearCache() { this.cache.clear(); } /** * Update configuration */ updateConfig(config) { this.config = { ...this.config, ...config }; // Update OpenAI client if configuration changed if (config.openai) { this.openaiClient = new openai_1.OpenAIClient(config.openai); } } extractExistingOutline(content, options) { const headings = (0, utils_1.extractHeadings)(content); const filtered = this.filterHeadings(headings, options); const tree = (0, utils_1.buildOutlineTree)(filtered); return (0, utils_1.formatOutline)(tree, options?.format || this.config.defaultFormat); } async generateOutline(content, options) { if (!this.openaiClient) { // Fallback to regex extraction return this.extractExistingOutline(content, options); } const chunks = this.splitDocument(content, options?.chunkingStrategy); const partialOutlines = []; // Generate outline for each chunk for (const chunk of chunks) { const outline = await this.generateChunkOutline(chunk, options); partialOutlines.push(...outline); } // Merge partial outlines if (partialOutlines.length === 0) { return this.extractExistingOutline(content, options); } const mergedOutline = chunks.length === 1 ? partialOutlines : await this.mergeOutlines(partialOutlines); return (0, utils_1.formatOutline)(mergedOutline, options?.format || this.config.defaultFormat); } async generateChunkOutline(chunk, options) { if (!this.openaiClient) { const headings = (0, utils_1.extractHeadings)(chunk); const filtered = this.filterHeadings(headings, options); return (0, utils_1.buildOutlineTree)(filtered); } const prompt = this.buildPrompt(chunk, options); const jsonResponse = await this.openaiClient.generateOutline(prompt, chunk); if (jsonResponse) { const parsedNodes = (0, utils_1.parseJsonOutline)(jsonResponse); if (parsedNodes.length > 0) { return parsedNodes; } } // Fallback to regex extraction const headings = (0, utils_1.extractHeadings)(chunk); const filtered = this.filterHeadings(headings, options); return (0, utils_1.buildOutlineTree)(filtered); } async mergeOutlines(outlines) { if (!this.openaiClient) { return outlines; } const mergedContent = outlines.map(node => (0, utils_1.formatOutline)([node], 'markdown')).join('\n\n---\n\n'); const prompt = `Merge the following partial outlines into a single, coherent multi-level outline. Maintain hierarchy and remove duplicates. Output only the final outline.`; const jsonResponse = await this.openaiClient.generateOutline(prompt, mergedContent); if (jsonResponse) { const parsedNodes = (0, utils_1.parseJsonOutline)(jsonResponse); if (parsedNodes.length > 0) { return parsedNodes; } } // Fallback: return original outlines return outlines; } filterHeadings(headings, options) { let filtered = headings; if (options?.minHeadingLevel) { filtered = filtered.filter(h => h.level >= options.minHeadingLevel); } if (options?.maxHeadingLevel) { filtered = filtered.filter(h => h.level <= options.maxHeadingLevel); } if (options?.maxDepth) { const minLevel = Math.min(...filtered.map(h => h.level)); filtered = filtered.filter(h => h.level - minLevel < options.maxDepth); } return filtered; } buildPrompt(content, options) { const depth = options?.maxDepth || (content.length > 2000 ? 2 : 1); const format = options?.format || this.config.defaultFormat; return `You are a world-class technical writer. Extract a clean ${depth}-level outline from the given text. Output format: ${format} Requirements: - Focus on main topics and key points - Maintain logical hierarchy - Be concise but comprehensive - No explanations, only the outline`; } calculateBalance(headings) { if (headings.length === 0) return 0; const levels = headings.map(h => h.level); const counts = {}; for (const level of levels) { counts[level] = (counts[level] || 0) + 1; } const probs = Object.values(counts).map(c => c / levels.length); const entropy = -probs.reduce((sum, p) => sum + p * Math.log2(p), 0); const maxEntropy = Math.log2(Object.keys(counts).length || 1); return maxEntropy === 0 ? 0 : entropy / maxEntropy; } calculateCoherence(headings) { if (headings.length <= 1) return 1; const levels = headings.map(h => h.level); let jumps = 0; for (let i = 1; i < levels.length; i++) { if (levels[i] - levels[i - 1] > 1) { jumps++; } } return 1 - (jumps / levels.length); } getCacheKey(content, options) { const optStr = JSON.stringify(options || {}); return `${content.length}_${content.substring(0, 100)}_${optStr}`; } } exports.OutlineExtractor = OutlineExtractor; //# sourceMappingURL=extractor.js.map