UNPKG

document-outline-extractor

Version:

Extract structured outlines from documents with optional AI enhancement

246 lines 7.25 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.extractHeadings = extractHeadings;
exports.buildOutlineTree = buildOutlineTree;
exports.renderOutline = renderOutline;
exports.formatOutline = formatOutline;
exports.parseJsonOutline = parseJsonOutline;
exports.computeOutlineScore = computeOutlineScore;
exports.splitByHeadings = splitByHeadings;
exports.splitBySize = splitBySize;
exports.smartSplit = smartSplit;

/** ATX heading: 1-6 `#` characters, whitespace, then the title text. */
const HEADING_PATTERN = /^(#{1,6})\s+(.*)$/;

/**
 * Fenced-code delimiter: up to three spaces of indent, then a run of at least
 * three backticks or tildes. Group 1 is the run, group 2 is whatever follows.
 */
const FENCE_PATTERN = /^ {0,3}(`{3,}|~{3,})(.*)$/;

/**
 * Extract headings from markdown content.
 *
 * Lines inside fenced code blocks (``` or ~~~) are skipped, so `# comment`
 * lines in embedded shell/Python snippets are not reported as headings. An
 * unclosed fence runs to the end of the content, so no heading after it is
 * reported.
 *
 * @param {string} content - Markdown text.
 * @returns {Array<{level: number, title: string, lineNumber: number}>}
 *   Headings in document order; `lineNumber` is 1-based.
 */
function extractHeadings(content) {
    const headings = [];
    const lines = content.split(/\r?\n/);
    // Marker character and run length of the fence we are currently inside.
    let openFence = null;
    for (let i = 0; i < lines.length; i++) {
        const line = lines[i];
        const fence = FENCE_PATTERN.exec(line);
        if (openFence !== null) {
            // A fence closes only on the same marker, at least as long as the
            // opener, with nothing but whitespace after it.
            const closes = fence !== null
                && fence[1][0] === openFence.marker
                && fence[1].length >= openFence.length
                && fence[2].trim() === '';
            if (closes) {
                openFence = null;
            }
            continue;
        }
        if (fence !== null) {
            const marker = fence[1][0];
            // A backtick fence's info string may not itself contain backticks;
            // such a line is inline code, not a fence opener.
            if (marker === '~' || !fence[2].includes('`')) {
                openFence = { marker, length: fence[1].length };
            }
            continue;
        }
        const match = HEADING_PATTERN.exec(line);
        if (match) {
            headings.push({
                level: match[1].length,
                title: match[2].trim(),
                lineNumber: i + 1
            });
        }
    }
    return headings;
}

/**
 * Build a hierarchical outline tree from a flat, ordered heading list.
 *
 * @param {Array<{level: number, title: string, lineNumber: number}>} headings
 * @returns {Array<object>} Root nodes, each shaped
 *   `{ level, title, children, metadata: { lineNumber } }`.
 */
function buildOutlineTree(headings) {
    const root = [];
    // Chain of ancestors of the most recently added node, shallowest first.
    const stack = [];
    for (const heading of headings) {
        const node = {
            level: heading.level,
            title: heading.title,
            children: [],
            metadata: { lineNumber: heading.lineNumber }
        };
        // Drop every ancestor that is not strictly shallower than this heading;
        // whatever remains on top is its parent.
        while (stack.length > 0 && stack[stack.length - 1].level >= heading.level) {
            stack.pop();
        }
        if (stack.length === 0) {
            root.push(node);
        }
        else {
            stack[stack.length - 1].children.push(node);
        }
        stack.push(node);
    }
    return root;
}

/**
 * Render an outline tree as an indented bullet list.
 *
 * @param {Array<object>} nodes - Outline nodes (`title`, `children`).
 * @param {number} [depth=0] - Nesting depth; one space of indent per level.
 * @returns {string} One `- title` line per node, each newline-terminated.
 */
function renderOutline(nodes, depth = 0) {
    let result = '';
    for (const node of nodes) {
        result += `${' '.repeat(depth)}- ${node.title}\n`;
        if (node.children.length > 0) {
            result += renderOutline(node.children, depth + 1);
        }
    }
    return result;
}

/**
 * Format an outline in the requested format.
 *
 * @param {Array<object>} nodes - Outline nodes.
 * @param {string} format - `'tree'`, `'markdown'`, `'json'` or `'flat'`;
 *   any other value falls back to the tree rendering.
 * @returns {string} The rendered outline.
 */
function formatOutline(nodes, format) {
    switch (format) {
        case 'tree':
            return renderOutline(nodes);
        case 'markdown':
            return renderMarkdownOutline(nodes);
        case 'json':
            return JSON.stringify(nodes, null, 2);
        case 'flat':
            return renderFlatOutline(nodes);
        default:
            return renderOutline(nodes);
    }
}

/**
 * Parse a JSON outline response into an OutlineNode array.
 *
 * Accepts either a bare array of nodes or an object with an `outline` array.
 * Any other valid JSON value (including `null`) yields an empty array.
 * Invalid JSON, or nodes that cannot be normalized, are logged via
 * `console.error` and also yield an empty array.
 *
 * @param {string} jsonString - JSON text to parse.
 * @returns {Array<object>} Normalized outline nodes, or `[]`.
 */
function parseJsonOutline(jsonString) {
    try {
        const parsed = JSON.parse(jsonString);
        if (Array.isArray(parsed)) {
            return validateOutlineNodes(parsed);
        }
        // `typeof null === 'object'`, so null must be excluded explicitly
        // before reading `.outline`.
        if (parsed !== null && typeof parsed === 'object' && Array.isArray(parsed.outline)) {
            return validateOutlineNodes(parsed.outline);
        }
        // Unrecognized structure: there is no outline to return.
        return [];
    }
    catch (error) {
        console.error('Failed to parse JSON outline:', error);
        return [];
    }
}

/**
 * Validate and normalize OutlineNode structure.
 *
 * Missing or wrongly typed fields are defaulted: `level` to 1, `title` to
 * `'Untitled'`, `children` to `[]`. `metadata` is copied only when truthy.
 *
 * @param {Array<object>} nodes - Raw nodes.
 * @returns {Array<object>} Normalized nodes (recursively).
 */
function validateOutlineNodes(nodes) {
    return nodes.map(node => ({
        level: typeof node.level === 'number' ? node.level : 1,
        title: typeof node.title === 'string' ? node.title : 'Untitled',
        children: Array.isArray(node.children) ? validateOutlineNodes(node.children) : [],
        ...(node.metadata && { metadata: node.metadata })
    }));
}

/**
 * Render an outline as markdown headings.
 *
 * Heading depth follows the node's position in the tree (starting at
 * `baseLevel`), not the node's own `level` field.
 *
 * @param {Array<object>} nodes - Outline nodes.
 * @param {number} [baseLevel=1] - Number of `#` characters for these nodes.
 * @returns {string} Headings, each followed by a blank line.
 */
function renderMarkdownOutline(nodes, baseLevel = 1) {
    let result = '';
    for (const node of nodes) {
        result += `${'#'.repeat(baseLevel)} ${node.title}\n\n`;
        if (node.children.length > 0) {
            result += renderMarkdownOutline(node.children, baseLevel + 1);
        }
    }
    return result;
}

/**
 * Render an outline as a flat, hierarchically numbered list
 * (`1.`, `1.1.`, `1.2.`, `2.`, ...).
 *
 * @param {Array<object>} nodes - Outline nodes.
 * @param {string} [prefix=''] - Number of the parent node, if any.
 * @returns {string} One `number. title` line per node.
 */
function renderFlatOutline(nodes, prefix = '') {
    let result = '';
    for (let i = 0; i < nodes.length; i++) {
        const node = nodes[i];
        const number = prefix ? `${prefix}.${i + 1}` : `${i + 1}`;
        result += `${number}. ${node.title}\n`;
        if (node.children.length > 0) {
            result += renderFlatOutline(node.children, number);
        }
    }
    return result;
}

/**
 * Compute an outline quality score.
 *
 * The score is a weighted sum of four components:
 *   - richness  (0.35): distinct heading levels used, out of 6;
 *   - balance   (0.25): normalized entropy of the level distribution
 *                       (0 when only one level is used);
 *   - coherence (0.25): share of transitions that do not skip a level
 *                       on the way down (e.g. h1 -> h3 counts as a jump);
 *   - coverage  (0.15): logistic function of heading density, centred on
 *                       one heading per 20 lines.
 *
 * @param {Array<{level: number}>} headings - Headings in document order.
 * @param {number} totalLines - Line count of the document. A non-finite or
 *   non-positive value is treated as zero heading density instead of
 *   producing NaN.
 * @returns {number} The score; 0 when there are no headings.
 */
function computeOutlineScore(headings, totalLines) {
    if (headings.length === 0) {
        return 0;
    }
    const levels = headings.map(h => h.level);
    const uniqueLevels = new Set(levels);
    // Richness: variety of heading levels
    const richness = uniqueLevels.size / 6;
    // Balance: entropy of heading distribution
    const counts = {};
    for (const level of levels) {
        counts[level] = (counts[level] || 0) + 1;
    }
    const probs = Object.values(counts).map(c => c / levels.length);
    const entropy = -probs.reduce((sum, p) => sum + p * Math.log2(p), 0);
    const maxEntropy = Math.log2(uniqueLevels.size || 1);
    const balance = maxEntropy === 0 ? 0 : entropy / maxEntropy;
    // Coherence: lack of level jumps
    let jumps = 0;
    for (let i = 1; i < levels.length; i++) {
        if (levels[i] - levels[i - 1] > 1) {
            jumps++;
        }
    }
    const coherence = 1 - (jumps / levels.length);
    // Coverage: heading density. Guard the divisor so that a missing or
    // invalid line count cannot turn the whole score into NaN.
    const lineCount = Number(totalLines);
    const ratio = Number.isFinite(lineCount) && lineCount > 0
        ? headings.length / lineCount
        : 0;
    const coverage = 1 / (1 + Math.exp(-10 * (ratio - 0.05)));
    // Weighted score
    return 0.35 * richness + 0.25 * balance + 0.25 * coherence + 0.15 * coverage;
}

/**
 * Append the trimmed text to `chunks` unless it is empty after trimming.
 *
 * @param {string[]} chunks - Destination array (mutated).
 * @param {string} text - Candidate chunk.
 */
function pushChunk(chunks, text) {
    const trimmed = text.trim();
    if (trimmed) {
        chunks.push(trimmed);
    }
}

/**
 * Cut a line that is longer than `maxSize` into pieces of at most
 * `maxSize` UTF-16 code units. The last element is the remainder.
 *
 * @param {string} line - Line to cut.
 * @param {number} maxSize - Piece size limit; must be >= 1.
 * @returns {string[]} Pieces in order.
 */
function sliceOversizedLine(line, maxSize) {
    const limit = Math.floor(maxSize);
    const pieces = [];
    let rest = line;
    while (rest.length > limit) {
        let cut = limit;
        // Do not cut between the two halves of a surrogate pair.
        const lastUnit = rest.charCodeAt(cut - 1);
        if (cut > 1 && lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
            cut -= 1;
        }
        pieces.push(rest.slice(0, cut));
        rest = rest.slice(cut);
    }
    pieces.push(rest);
    return pieces;
}

/**
 * Split content by headings.
 *
 * Only level-1 headings (`# ` at the start of a line) act as split points.
 * Consecutive sections are packed greedily into one chunk while the combined
 * length stays within `maxSize`. A single section larger than `maxSize` is
 * returned as one oversized chunk (see `smartSplit`).
 *
 * @param {string} content - Markdown text.
 * @param {number} maxSize - Preferred maximum chunk length.
 * @returns {string[]} Trimmed, non-empty chunks.
 */
function splitByHeadings(content, maxSize) {
    const chunks = [];
    let current = '';
    for (const section of content.split(/^(?=#\s)/m)) {
        if (current.length > 0 && (current + section).length > maxSize) {
            pushChunk(chunks, current);
            current = section;
        }
        else {
            current += section;
        }
    }
    pushChunk(chunks, current);
    return chunks;
}

/**
 * Split content by size.
 *
 * Lines are packed greedily into chunks of at most `maxSize` characters.
 * A single line longer than `maxSize` is cut into `maxSize`-sized pieces so
 * that no returned chunk exceeds the limit (for `maxSize >= 1`).
 *
 * @param {string} content - Text to split.
 * @param {number} maxSize - Maximum chunk length.
 * @returns {string[]} Trimmed, non-empty chunks.
 */
function splitBySize(content, maxSize) {
    const chunks = [];
    let current = '';
    for (const line of content.split(/\r?\n/)) {
        let tail = line;
        if (maxSize >= 1 && line.length > maxSize) {
            // The line cannot fit in any chunk: flush what we have, emit the
            // full-size pieces, and keep packing from the remainder.
            pushChunk(chunks, current);
            current = '';
            const pieces = sliceOversizedLine(line, maxSize);
            tail = pieces.pop();
            for (const piece of pieces) {
                pushChunk(chunks, piece);
            }
        }
        const candidate = current + tail + '\n';
        if (candidate.length > maxSize && current.length > 0) {
            pushChunk(chunks, current);
            current = tail + '\n';
        }
        else {
            current = candidate;
        }
    }
    pushChunk(chunks, current);
    return chunks;
}

/**
 * Smart split: prefer heading boundaries, fall back to size.
 *
 * @param {string} content - Markdown text.
 * @param {number} maxSize - Maximum chunk length.
 * @returns {string[]} Chunks; heading-based where they fit, size-based
 *   otherwise.
 */
function smartSplit(content, maxSize) {
    const finalChunks = [];
    for (const chunk of splitByHeadings(content, maxSize)) {
        if (chunk.length > maxSize) {
            // Still too large after the heading split: break it up by size.
            finalChunks.push(...splitBySize(chunk, maxSize));
        }
        else {
            finalChunks.push(chunk);
        }
    }
    return finalChunks;
}
//# sourceMappingURL=utils.js.map