UNPKG

document-outline-extractor

Version:

Extract structured outlines from documents with optional AI enhancement

277 lines (227 loc) 6.81 kB
import type { OutlineNode, Heading, OutlineFormat } from './types';

/** Deepest heading level Markdown supports (`######`). */
const MAX_HEADING_LEVEL = 6;

/** ATX heading: 1-6 `#`, at least one whitespace character, then the title. */
const HEADING_PATTERN = /^(#{1,6})\s+(.*)$/;

/** Opening/closing line of a fenced code block (``` or ~~~, up to 3 leading spaces). */
const FENCE_PATTERN = /^ {0,3}(`{3,}|~{3,})(.*)$/;

/** Zero-width position at the start of any line that begins a heading. */
const SECTION_BOUNDARY = /^(?=#{1,6}\s)/m;

/**
 * Extract headings from markdown content.
 *
 * Lines inside fenced code blocks are ignored, so a shell comment such as
 * `# install deps` inside a code fence is not reported as a heading. An
 * unclosed fence runs to the end of the content.
 *
 * @param content - Markdown source text (LF or CRLF line endings).
 * @returns Headings in document order with 1-based line numbers.
 */
export function extractHeadings(content: string): Heading[] {
  const headings: Heading[] = [];
  // Marker (e.g. "```") of the fence currently open, or null outside a fence.
  let openFence: string | null = null;

  content.split(/\r?\n/).forEach((line, index) => {
    const fence = FENCE_PATTERN.exec(line);

    if (openFence === null) {
      if (fence) {
        openFence = fence[1];
        return;
      }
    } else {
      // A fence closes only on the same marker character, at least as long
      // as the opener, with nothing but whitespace after it.
      const closesFence =
        fence !== null &&
        fence[1][0] === openFence[0] &&
        fence[1].length >= openFence.length &&
        fence[2].trim() === '';
      if (closesFence) {
        openFence = null;
      }
      return;
    }

    const match = line.match(HEADING_PATTERN);
    if (match) {
      headings.push({
        level: match[1].length,
        title: match[2].trim(),
        lineNumber: index + 1
      });
    }
  });

  return headings;
}

/**
 * Build hierarchical outline tree from flat headings.
 *
 * Each heading becomes a child of the nearest preceding heading with a
 * strictly smaller level; headings with no such predecessor become roots.
 *
 * @param headings - Headings in document order.
 * @returns Root nodes of the outline tree.
 */
export function buildOutlineTree(headings: Heading[]): OutlineNode[] {
  const roots: OutlineNode[] = [];
  // Chain of currently open ancestors, shallowest first.
  const ancestors: OutlineNode[] = [];

  for (const heading of headings) {
    const node: OutlineNode = {
      level: heading.level,
      title: heading.title,
      children: [],
      metadata: { lineNumber: heading.lineNumber }
    };

    // Close every open node that is at the same level or deeper.
    while (
      ancestors.length > 0 &&
      ancestors[ancestors.length - 1].level >= heading.level
    ) {
      ancestors.pop();
    }

    if (ancestors.length === 0) {
      roots.push(node);
    } else {
      ancestors[ancestors.length - 1].children.push(node);
    }
    ancestors.push(node);
  }

  return roots;
}

/**
 * Render outline tree as indented text, one `- title` line per node.
 *
 * @param nodes - Nodes to render at this depth.
 * @param depth - Current nesting depth; controls the indentation width.
 * @returns The rendered lines, each terminated by a newline.
 */
export function renderOutline(nodes: OutlineNode[], depth = 0): string {
  // NOTE(review): the indent unit is a single space as seen in the reviewed
  // copy of this file; confirm it was not a wider unit collapsed in transit.
  const indent = ' '.repeat(depth);
  let result = '';

  for (const node of nodes) {
    result += `${indent}- ${node.title}\n`;
    if (node.children.length > 0) {
      result += renderOutline(node.children, depth + 1);
    }
  }

  return result;
}

/**
 * Format outline in specified format.
 *
 * @param nodes - Root nodes of the outline.
 * @param format - `'tree'`, `'markdown'`, `'json'` or `'flat'`; any other
 *   value falls back to the tree rendering.
 * @returns The outline serialized in the requested format.
 */
export function formatOutline(nodes: OutlineNode[], format: OutlineFormat): string {
  switch (format) {
    case 'tree':
      return renderOutline(nodes);
    case 'markdown':
      return renderMarkdownOutline(nodes);
    case 'json':
      return JSON.stringify(nodes, null, 2);
    case 'flat':
      return renderFlatOutline(nodes);
    default:
      return renderOutline(nodes);
  }
}

/**
 * Parse JSON outline response into OutlineNode array.
 *
 * Accepts either a bare array of nodes or an object with an `outline` array.
 * Never throws: malformed JSON is logged and yields `[]`, and valid JSON of
 * any other shape yields `[]` as well.
 *
 * @param jsonString - Raw JSON text.
 * @returns Normalized outline nodes.
 */
export function parseJsonOutline(jsonString: string): OutlineNode[] {
  let parsed: unknown;
  try {
    parsed = JSON.parse(jsonString);
  } catch (error) {
    console.error('Failed to parse JSON outline:', error);
    return [];
  }

  if (isRecord(parsed) && Array.isArray(parsed.outline)) {
    return validateOutlineNodes(parsed.outline);
  }
  if (Array.isArray(parsed)) {
    return validateOutlineNodes(parsed);
  }
  return [];
}

/** True for non-null, non-array objects. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/**
 * Validate and normalize OutlineNode structure.
 *
 * Entries that are not objects are skipped so that one bad node does not
 * invalidate its siblings. Missing or mistyped fields get defaults:
 * level 1, title `'Untitled'`, no children.
 */
function validateOutlineNodes(nodes: unknown[]): OutlineNode[] {
  const valid: OutlineNode[] = [];

  for (const candidate of nodes) {
    if (!isRecord(candidate)) {
      continue;
    }
    const { level, title, children, metadata } = candidate;

    valid.push({
      level: typeof level === 'number' && Number.isFinite(level) ? level : 1,
      title: typeof title === 'string' ? title : 'Untitled',
      children: Array.isArray(children) ? validateOutlineNodes(children) : [],
      ...(isRecord(metadata) ? { metadata: metadata as OutlineNode['metadata'] } : {})
    });
  }

  return valid;
}

/**
 * Render outline as markdown headings separated by blank lines.
 *
 * Heading depth follows tree nesting and is capped at six `#`, because a
 * longer run of `#` is not a Markdown heading.
 */
function renderMarkdownOutline(nodes: OutlineNode[], baseLevel = 1): string {
  const hashes = '#'.repeat(Math.min(baseLevel, MAX_HEADING_LEVEL));
  let result = '';

  for (const node of nodes) {
    result += `${hashes} ${node.title}\n\n`;
    if (node.children.length > 0) {
      result += renderMarkdownOutline(node.children, baseLevel + 1);
    }
  }

  return result;
}

/**
 * Render outline as flat list with dotted section numbers
 * (`1.`, `1.1.`, `1.2.`, `2.` ...), one node per line.
 */
function renderFlatOutline(nodes: OutlineNode[], prefix = ''): string {
  let result = '';

  nodes.forEach((node, index) => {
    const number = prefix ? `${prefix}.${index + 1}` : `${index + 1}`;
    result += `${number}. ${node.title}\n`;
    if (node.children.length > 0) {
      result += renderFlatOutline(node.children, number);
    }
  });

  return result;
}

/**
 * Compute outline quality score.
 *
 * Weighted sum of four components, each in [0, 1] for positive `totalLines`:
 * - richness (0.35): number of distinct heading levels divided by 6;
 * - balance (0.25): normalized entropy of the level distribution
 *   (0 when only one level is used);
 * - coherence (0.25): 1 minus the share of headings that go more than one
 *   level deeper than the heading before them;
 * - coverage (0.15): logistic curve of heading density, centered at 5%.
 *
 * @param headings - Headings found in the document.
 * @param totalLines - Number of lines in the document; expected to be positive.
 * @returns 0 when there are no headings, otherwise the weighted score.
 */
export function computeOutlineScore(headings: Heading[], totalLines: number): number {
  if (headings.length === 0) return 0;

  const levels = headings.map(h => h.level);
  const distinctLevels = new Set(levels).size;

  // Richness: variety of heading levels
  const richness = distinctLevels / MAX_HEADING_LEVEL;

  // Balance: entropy of heading distribution
  const counts: Record<number, number> = {};
  for (const level of levels) {
    counts[level] = (counts[level] ?? 0) + 1;
  }
  const probs = Object.values(counts).map(c => c / levels.length);
  const entropy = -probs.reduce((sum, p) => sum + p * Math.log2(p), 0);
  const maxEntropy = Math.log2(distinctLevels || 1);
  const balance = maxEntropy === 0 ? 0 : entropy / maxEntropy;

  // Coherence: lack of level jumps
  let jumps = 0;
  for (let i = 1; i < levels.length; i++) {
    if (levels[i] - levels[i - 1] > 1) {
      jumps++;
    }
  }
  const coherence = 1 - jumps / levels.length;

  // Coverage: heading density
  const ratio = headings.length / totalLines;
  const coverage = 1 / (1 + Math.exp(-10 * (ratio - 0.05)));

  // Weighted score
  return 0.35 * richness + 0.25 * balance + 0.25 * coherence + 0.15 * coverage;
}

/** Append the trimmed text to `chunks` unless it is empty after trimming. */
function pushChunk(chunks: string[], text: string): void {
  const trimmed = text.trim();
  if (trimmed) {
    chunks.push(trimmed);
  }
}

/**
 * Cut a single line into pieces no longer than `maxSize` UTF-16 code units.
 * Returns the line unchanged when it already fits or when `maxSize` is not a
 * usable limit (less than 1 or NaN). Surrogate pairs are kept together
 * whenever the limit leaves room for both halves.
 */
function sliceToFit(line: string, maxSize: number): string[] {
  const limit = Math.floor(maxSize);
  if (!(limit >= 1) || line.length <= limit) {
    return [line];
  }

  const pieces: string[] = [];
  let start = 0;
  while (start < line.length) {
    let end = Math.min(start + limit, line.length);
    if (end < line.length && end - start > 1) {
      const lastUnit = line.charCodeAt(end - 1);
      // Do not end a piece on a high surrogate; move it to the next piece.
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        end -= 1;
      }
    }
    pieces.push(line.slice(start, end));
    start = end;
  }
  return pieces;
}

/**
 * Split content by headings.
 *
 * Content is cut at the start of every heading line (levels 1-6) and the
 * resulting sections are packed greedily into chunks of at most `maxSize`
 * characters. A single section larger than `maxSize` is kept whole; use
 * {@link smartSplit} when a hard limit is required.
 *
 * @param content - Markdown source text.
 * @param maxSize - Target maximum chunk length in characters.
 * @returns Trimmed, non-empty chunks in document order.
 */
export function splitByHeadings(content: string, maxSize: number): string[] {
  const chunks: string[] = [];
  let current = '';

  for (const section of content.split(SECTION_BOUNDARY)) {
    if (current.length > 0 && current.length + section.length > maxSize) {
      pushChunk(chunks, current);
      current = section;
    } else {
      current += section;
    }
  }
  pushChunk(chunks, current);

  return chunks;
}

/**
 * Split content by size.
 *
 * Lines are packed greedily into chunks of at most `maxSize` characters.
 * A line that is longer than `maxSize` on its own is cut into pieces, so for
 * `maxSize >= 1` no returned chunk exceeds the limit.
 *
 * @param content - Text to split (LF or CRLF line endings).
 * @param maxSize - Maximum chunk length in characters.
 * @returns Trimmed, non-empty chunks in document order.
 */
export function splitBySize(content: string, maxSize: number): string[] {
  const chunks: string[] = [];
  let current = '';

  for (const line of content.split(/\r?\n/)) {
    for (const piece of sliceToFit(line, maxSize)) {
      const candidate = piece + '\n';
      if (current.length > 0 && current.length + candidate.length > maxSize) {
        pushChunk(chunks, current);
        current = candidate;
      } else {
        current += candidate;
      }
    }
  }
  pushChunk(chunks, current);

  return chunks;
}

/**
 * Smart split: prefer heading boundaries, fall back to size.
 *
 * @param content - Markdown source text.
 * @param maxSize - Maximum chunk length in characters.
 * @returns Chunks in document order; heading-aligned where the sections fit,
 *   size-split where a section is larger than `maxSize`.
 */
export function smartSplit(content: string, maxSize: number): string[] {
  const finalChunks: string[] = [];

  for (const chunk of splitByHeadings(content, maxSize)) {
    if (chunk.length > maxSize) {
      // Section is still too large: break it on line/size boundaries.
      finalChunks.push(...splitBySize(chunk, maxSize));
    } else {
      finalChunks.push(chunk);
    }
  }

  return finalChunks;
}