// document-outline-extractor
// Extract structured outlines from documents with optional AI enhancement.
import { OutlineNode, Heading, OutlineFormat } from './types';
/**
 * Extract ATX headings (`#` through `######`) from markdown content.
 *
 * Lines inside fenced code blocks (opened by three or more backticks or
 * tildes) are skipped, so `# ...` lines in code samples such as shell
 * comments are not reported as headings. An unclosed fence extends to the
 * end of the content.
 *
 * @param content - Markdown text; both LF and CRLF line endings are accepted.
 * @returns Headings in document order, each with its level (1-6), trimmed
 *   title and 1-based line number.
 */
export function extractHeadings(content: string): Heading[] {
  const headingPattern = /^(#{1,6})\s+(.*)$/;
  const fencePattern = /^ {0,3}(`{3,}|~{3,})(.*)$/;
  const headings: Heading[] = [];

  // State of the currently open fence; fenceLength === 0 means "not in a fence".
  let fenceChar = '';
  let fenceLength = 0;

  content.split(/\r?\n/).forEach((line, index) => {
    const fence = fencePattern.exec(line);

    if (fenceLength > 0) {
      // A fence closes only on the same marker character, at least as long
      // as the opener, with nothing but whitespace after it.
      if (
        fence &&
        fence[1].charAt(0) === fenceChar &&
        fence[1].length >= fenceLength &&
        fence[2].trim() === ''
      ) {
        fenceChar = '';
        fenceLength = 0;
      }
      return;
    }

    if (fence) {
      // A backtick run followed by more backticks on the same line is inline
      // code, not a fence opener.
      const isInlineCode = fence[1].charAt(0) === '`' && fence[2].includes('`');
      if (!isInlineCode) {
        fenceChar = fence[1].charAt(0);
        fenceLength = fence[1].length;
        return;
      }
    }

    const match = headingPattern.exec(line);
    if (match) {
      headings.push({
        level: match[1].length,
        title: match[2].trim(),
        lineNumber: index + 1
      });
    }
  });

  return headings;
}
/**
 * Turn a flat, document-ordered list of headings into a nested outline.
 *
 * Each heading becomes a child of the closest preceding heading whose level
 * is strictly smaller; headings with no such predecessor become top-level
 * nodes. The source line number is stored under `metadata.lineNumber`.
 *
 * @param headings - Headings in document order.
 * @returns The top-level outline nodes.
 */
export function buildOutlineTree(headings: Heading[]): OutlineNode[] {
  const topLevel: OutlineNode[] = [];
  // Chain of currently open ancestors, shallowest first.
  const ancestors: OutlineNode[] = [];

  for (const { level, title, lineNumber } of headings) {
    const entry: OutlineNode = {
      level,
      title,
      children: [],
      metadata: { lineNumber }
    };

    // Keep only the ancestors that are strictly shallower than this heading.
    let keep = ancestors.length;
    while (keep > 0 && ancestors[keep - 1].level >= level) {
      keep -= 1;
    }
    ancestors.length = keep;

    const siblings = keep === 0 ? topLevel : ancestors[keep - 1].children;
    siblings.push(entry);
    ancestors.push(entry);
  }

  return topLevel;
}
/**
 * Render an outline as an indented bullet list.
 *
 * Every node produces one line of the form `<indent>- <title>`, where the
 * indent is one space per nesting level, followed by the lines of its
 * children one level deeper.
 *
 * @param nodes - Nodes to render at the current nesting level.
 * @param depth - Nesting level of `nodes`; 0 for the top level.
 * @returns The rendered text, with a trailing newline after every entry.
 */
export function renderOutline(nodes: OutlineNode[], depth = 0): string {
  return nodes
    .map((entry) => {
      const ownLine = `${' '.repeat(depth)}- ${entry.title}\n`;
      return entry.children.length > 0
        ? ownLine + renderOutline(entry.children, depth + 1)
        : ownLine;
    })
    .join('');
}
/**
 * Serialize an outline in the requested format.
 *
 * - `'markdown'`: nested markdown headings
 * - `'json'`: the node array pretty-printed with a 2-space indent
 * - `'flat'`: hierarchically numbered list
 * - `'tree'` and any unrecognized value: indented bullet list
 *
 * @param nodes - Top-level outline nodes.
 * @param format - Output format selector.
 * @returns The formatted outline.
 */
export function formatOutline(nodes: OutlineNode[], format: OutlineFormat): string {
  if (format === 'markdown') {
    return renderMarkdownOutline(nodes);
  }
  if (format === 'json') {
    return JSON.stringify(nodes, null, 2);
  }
  if (format === 'flat') {
    return renderFlatOutline(nodes);
  }
  // 'tree' is also the fallback for anything unrecognized.
  return renderOutline(nodes);
}
/**
 * Parse a JSON outline response into an OutlineNode array.
 *
 * Two shapes are accepted: an object with an `outline` array property, or a
 * bare array of nodes. Any other valid JSON value (including `null`, numbers
 * and strings) yields an empty array without being reported as an error.
 *
 * @param jsonString - Raw JSON text.
 * @returns The normalized nodes, or `[]` when the text is not valid JSON,
 *   has an unsupported shape, or normalization throws. Failures are logged
 *   with `console.error`; this function does not throw.
 */
export function parseJsonOutline(jsonString: string): OutlineNode[] {
  try {
    const parsed: unknown = JSON.parse(jsonString);

    // Guard against `null` and primitives before touching properties, so a
    // valid-but-unsupported document is not misreported as a parse failure.
    if (parsed !== null && typeof parsed === 'object') {
      const { outline } = parsed as { outline?: unknown };
      if (Array.isArray(outline)) {
        return validateOutlineNodes(outline);
      }
    }

    if (Array.isArray(parsed)) {
      return validateOutlineNodes(parsed);
    }

    // Unsupported structure: nothing to extract.
    return [];
  } catch (error) {
    console.error('Failed to parse JSON outline:', error);
    return [];
  }
}
/**
 * Validate and normalize a raw array into OutlineNode objects.
 *
 * For every entry: a non-number `level` becomes 1, a non-string `title`
 * becomes 'Untitled', `children` is normalized recursively (or becomes `[]`
 * when it is not an array), and `metadata` is copied only when truthy.
 * `null` and `undefined` entries are normalized to a default node instead of
 * throwing, so one bad entry does not invalidate the whole outline.
 *
 * @param nodes - Untrusted node-like values, e.g. from `JSON.parse`.
 * @returns One normalized node per input entry, in the same order.
 */
function validateOutlineNodes(nodes: any[]): OutlineNode[] {
  return nodes.map(raw => {
    // Property access on null/undefined would throw; treat them as empty.
    const node = raw == null ? {} : raw;
    return {
      level: typeof node.level === 'number' ? node.level : 1,
      title: typeof node.title === 'string' ? node.title : 'Untitled',
      children: Array.isArray(node.children) ? validateOutlineNodes(node.children) : [],
      ...(node.metadata && { metadata: node.metadata })
    };
  });
}
/**
 * Render an outline as markdown headings.
 *
 * The number of `#` characters comes from the node's position in the tree
 * (`baseLevel` plus nesting depth), not from the node's own `level` field.
 * Each heading is followed by a blank line.
 *
 * @param nodes - Nodes to render at the current nesting level.
 * @param baseLevel - Heading level used for `nodes`; children use one more.
 * @returns The markdown text.
 */
function renderMarkdownOutline(nodes: OutlineNode[], baseLevel = 1): string {
  return nodes.reduce((markdown, entry) => {
    const headingLine = `${'#'.repeat(baseLevel)} ${entry.title}\n\n`;
    if (entry.children.length === 0) {
      return markdown + headingLine;
    }
    return markdown + headingLine + renderMarkdownOutline(entry.children, baseLevel + 1);
  }, '');
}
/**
 * Render an outline as a hierarchically numbered list.
 *
 * Entries are numbered from 1 within their parent and joined with dots, so
 * the output lines look like `1. Intro`, `1.1. Scope`, `1.2. Goals`, `2. ...`.
 *
 * @param nodes - Nodes to render at the current nesting level.
 * @param prefix - Number of the parent entry; empty for the top level.
 * @returns The rendered text, one line per node in depth-first order.
 */
function renderFlatOutline(nodes: OutlineNode[], prefix = ''): string {
  return nodes
    .map((entry, position) => {
      const ordinal = `${position + 1}`;
      const label = prefix ? `${prefix}.${ordinal}` : ordinal;
      const nested =
        entry.children.length > 0 ? renderFlatOutline(entry.children, label) : '';
      return `${label}. ${entry.title}\n${nested}`;
    })
    .join('');
}
/**
 * Compute a heuristic quality score for a document's heading structure.
 *
 * The score is a weighted sum of four components:
 * - richness (0.35): number of distinct heading levels divided by 6
 * - balance (0.25): entropy of the level distribution normalized by its
 *   maximum; 0 when only one level is used
 * - coherence (0.25): 1 minus the share of headings that go more than one
 *   level deeper than the heading before them
 * - coverage (0.15): logistic function of headings per line, centred on a
 *   ratio of 0.05
 *
 * @param headings - Headings in document order.
 * @param totalLines - Number of lines in the document.
 * @returns 0 when there are no headings, otherwise the weighted score.
 */
export function computeOutlineScore(headings: Heading[], totalLines: number): number {
  const headingCount = headings.length;
  if (headingCount === 0) {
    return 0;
  }

  const levels = headings.map(({ level }) => level);
  const distinctLevels = new Set(levels).size;

  // Richness: variety of heading levels
  const richness = distinctLevels / 6;

  // Balance: normalized entropy of the per-level histogram
  const histogram: Record<number, number> = {};
  levels.forEach((level) => {
    histogram[level] = (histogram[level] || 0) + 1;
  });
  let weightedLogSum = 0;
  for (const occurrences of Object.values(histogram)) {
    const share = occurrences / headingCount;
    weightedLogSum += share * Math.log2(share);
  }
  const entropy = -weightedLogSum;
  const maxEntropy = Math.log2(distinctLevels || 1);
  const balance = maxEntropy === 0 ? 0 : entropy / maxEntropy;

  // Coherence: penalize descents that skip a level
  const jumps = levels.filter(
    (level, index) => index > 0 && level - levels[index - 1] > 1
  ).length;
  const coherence = 1 - (jumps / headingCount);

  // Coverage: logistic curve over heading density
  const density = headingCount / totalLines;
  const coverage = 1 / (1 + Math.exp(-10 * (density - 0.05)));

  return 0.35 * richness + 0.25 * balance + 0.25 * coherence + 0.15 * coverage;
}
/**
 * Split content into chunks that break only at markdown heading lines.
 *
 * The content is cut immediately before every line that starts with one to
 * six `#` characters followed by whitespace. Consecutive sections are then
 * packed greedily into chunks of at most `maxSize` characters. A single
 * section longer than `maxSize` is kept whole, so a chunk can still exceed
 * the limit.
 *
 * @param content - Markdown text to split.
 * @param maxSize - Target maximum chunk length in characters.
 * @returns Trimmed chunks in document order.
 */
export function splitByHeadings(content: string, maxSize: number): string[] {
  // Zero-width split before headings of any level, so each section keeps its
  // own heading line.
  const sections = content.split(/^(?=#{1,6}\s)/m);
  const chunks: string[] = [];
  let current = '';

  for (const section of sections) {
    if ((current + section).length > maxSize && current.length > 0) {
      chunks.push(current.trim());
      current = section;
    } else {
      current += section;
    }
  }

  if (current.trim()) {
    chunks.push(current.trim());
  }
  return chunks;
}
/**
 * Split content into chunks on line boundaries, bounded by size.
 *
 * Lines are appended (each followed by `\n`) to the current chunk until
 * adding the next one would exceed `maxSize`; the chunk is then emitted
 * trimmed and a new one starts with that line. A line is never cut, so a
 * single line longer than `maxSize` produces an oversized chunk.
 *
 * @param content - Text to split; LF and CRLF line endings are accepted.
 * @param maxSize - Target maximum chunk length in characters.
 * @returns Trimmed chunks in document order.
 */
export function splitBySize(content: string, maxSize: number): string[] {
  const pieces: string[] = [];
  let buffer = '';

  content.split(/\r?\n/).forEach((row) => {
    const rowWithBreak = `${row}\n`;
    const wouldOverflow = buffer.length + rowWithBreak.length > maxSize;

    if (wouldOverflow && buffer.length > 0) {
      pieces.push(buffer.trim());
      buffer = rowWithBreak;
      return;
    }
    buffer += rowWithBreak;
  });

  const remainder = buffer.trim();
  if (remainder) {
    pieces.push(remainder);
  }
  return pieces;
}
/**
 * Split content at heading boundaries first, then by size where needed.
 *
 * The content is first chunked with `splitByHeadings`. Every resulting chunk
 * that is still longer than `maxSize` is replaced by the chunks that
 * `splitBySize` produces for it; the others are kept as they are.
 *
 * @param content - Text to split.
 * @param maxSize - Target maximum chunk length in characters.
 * @returns Chunks in document order.
 */
export function smartSplit(content: string, maxSize: number): string[] {
  return splitByHeadings(content, maxSize).reduce<string[]>((result, part) => {
    if (part.length <= maxSize) {
      result.push(part);
    } else {
      // Heading-based section is too big on its own: break it up by lines.
      result.push(...splitBySize(part, maxSize));
    }
    return result;
  }, []);
}