/*
 * Package: document-outline-extractor
 * Version: (not given in the original listing)
 * Description: Extract structured outlines from documents with optional AI enhancement.
 * Listing metadata: 212 lines • 8.39 kB • JavaScript
 */
// Opt the whole module into strict mode (a directive; comments may precede it).
"use strict";
// Mark the CommonJS `exports` object with the `__esModule` flag. NOTE(review): this
// looks like transpiler output (a sourceMappingURL comment closes the file), where
// the flag signals ES-module interop to importers — confirm against the build setup.
Object.defineProperty(exports, "__esModule", { value: true });
// Declare the export slot up front; the class is assigned to it at the end of the file.
exports.OutlineExtractor = void 0;
const crypto_1 = require("crypto");
const openai_1 = require("./openai");
const utils_1 = require("./utils");
/**
 * Shallow-merge `overrides` onto a copy of `base`, skipping keys whose value is
 * `undefined` so that "option not provided" never erases an existing value.
 * Every other value (including `0`, `false`, `''` and `null`) is copied as-is.
 *
 * @param {object} base - Values to start from; not mutated.
 * @param {object} [overrides] - Caller-supplied values; may be omitted.
 * @returns {object} A new object containing the merged values.
 */
function mergeDefinedOptions(base, overrides) {
    const merged = { ...base };
    for (const [key, value] of Object.entries(overrides ?? {})) {
        if (value !== undefined) {
            merged[key] = value;
        }
    }
    return merged;
}
/**
 * Produces an outline for a text document. When the document's own headings score
 * at or above `config.qualityThreshold` they are used directly; otherwise the outline
 * is generated through the configured OpenAI client, falling back to heading
 * extraction when no client is configured or the client yields nothing usable.
 * Results are memoized per (content, options) pair while `config.caching` is truthy.
 */
class OutlineExtractor {
    /**
     * @param {object} [config] - Optional settings. Recognized keys:
     *   `chunkSize` (default 5000), `qualityThreshold` (default 0.8),
     *   `defaultFormat` (default 'tree'), `caching` (default true) and `openai`
     *   (passed to `OpenAIClient` when truthy). Any other keys are kept on
     *   `this.config` unchanged. Keys set to `undefined` fall back to the default.
     */
    constructor(config) {
        this.config = mergeDefinedOptions({
            chunkSize: 5000,
            qualityThreshold: 0.8,
            defaultFormat: 'tree',
            caching: true,
        }, config);
        // Initialize OpenAI client if configuration provided
        if (this.config.openai) {
            this.openaiClient = new openai_1.OpenAIClient(this.config.openai);
        }
        this.cache = new Map();
    }
    /**
     * Extract an outline from document content.
     *
     * @param {string} content - Full document text.
     * @param {object} [options] - Per-call options (`format`, `chunkingStrategy`,
     *   `minHeadingLevel`, `maxHeadingLevel`, `maxDepth`).
     * @returns {Promise<*>} The outline as produced by `formatOutline`; a cached
     *   value is returned when one exists for the same content and options.
     */
    async extract(content, options) {
        const cacheKey = this.getCacheKey(content, options);
        if (this.config.caching && this.cache.has(cacheKey)) {
            return this.cache.get(cacheKey);
        }
        const quality = this.evaluateQuality(content);
        let outline;
        if (quality.score >= this.config.qualityThreshold) {
            // Use existing outline from document
            outline = this.extractExistingOutline(content, options);
        }
        else {
            // Generate outline using AI or fallback
            outline = await this.generateOutline(content, options);
        }
        if (this.config.caching) {
            this.cache.set(cacheKey, outline);
        }
        return outline;
    }
    /**
     * Evaluate the quality of the outline already present in the document.
     *
     * @param {string} content - Full document text.
     * @returns {{score: *, richness: number, balance: number, coherence: number,
     *   coverage: number, headingCount: number, depth: number}} `score` comes from
     *   `computeOutlineScore`; `richness` is distinct heading levels / 6;
     *   `coverage` is headings per line; `depth` is the largest heading level
     *   (0 when there are no headings).
     */
    evaluateQuality(content) {
        const headings = (0, utils_1.extractHeadings)(content);
        const lines = content.split(/\r?\n/);
        const score = (0, utils_1.computeOutlineScore)(headings, lines.length);
        const levels = headings.map(h => h.level);
        const uniqueLevels = new Set(levels);
        // Fold instead of Math.max(...levels): spreading one argument per heading
        // can exceed the engine's argument limit on very large documents.
        const depth = levels.reduce((deepest, level) => Math.max(deepest, level), 0);
        return {
            score,
            // NOTE(review): 6 presumably reflects heading levels 1-6 — confirm against extractHeadings.
            richness: uniqueLevels.size / 6,
            balance: this.calculateBalance(headings),
            coherence: this.calculateCoherence(headings),
            coverage: headings.length / lines.length,
            headingCount: headings.length,
            depth
        };
    }
    /**
     * Split a document into chunks bounded by `config.chunkSize`.
     *
     * @param {string} content - Full document text.
     * @param {string} [strategy] - 'heading', 'size' or 'smart'; anything else
     *   (including omission) uses the 'smart' splitter.
     * @returns {*} Whatever the selected splitter from `./utils` returns.
     */
    splitDocument(content, strategy) {
        const chunkStrategy = strategy || 'smart';
        const maxSize = this.config.chunkSize;
        switch (chunkStrategy) {
            case 'heading':
                return (0, utils_1.splitByHeadings)(content, maxSize);
            case 'size':
                return (0, utils_1.splitBySize)(content, maxSize);
            case 'smart':
            default:
                return (0, utils_1.smartSplit)(content, maxSize);
        }
    }
    /**
     * Clear the internal cache.
     */
    clearCache() {
        this.cache.clear();
    }
    /**
     * Update configuration.
     *
     * Keys set to `undefined` are ignored. A truthy `openai` value replaces the
     * client; a falsy one leaves any existing client in place. The cache is
     * cleared because cached outlines were computed under the previous settings
     * (threshold, default format, client) and may no longer be valid.
     *
     * @param {object} [config] - Partial configuration to merge in.
     */
    updateConfig(config) {
        this.config = mergeDefinedOptions(this.config, config);
        // Update OpenAI client if configuration changed
        if (config?.openai) {
            this.openaiClient = new openai_1.OpenAIClient(config.openai);
        }
        this.cache.clear();
    }
    /**
     * Build the outline from the headings found in the document itself.
     *
     * @param {string} content - Document (or chunk) text.
     * @param {object} [options] - Heading filters and `format`.
     * @returns {*} The formatted outline from `formatOutline`.
     */
    extractExistingOutline(content, options) {
        const headings = (0, utils_1.extractHeadings)(content);
        const filtered = this.filterHeadings(headings, options);
        const tree = (0, utils_1.buildOutlineTree)(filtered);
        return (0, utils_1.formatOutline)(tree, options?.format || this.config.defaultFormat);
    }
    /**
     * Generate an outline through the OpenAI client, chunk by chunk.
     * Without a client, or when no chunk yields any nodes, this falls back to
     * heading extraction over the whole document.
     *
     * @param {string} content - Full document text.
     * @param {object} [options] - `chunkingStrategy`, `format` and heading filters.
     * @returns {Promise<*>} The formatted outline from `formatOutline`.
     */
    async generateOutline(content, options) {
        if (!this.openaiClient) {
            // Fallback to regex extraction
            return this.extractExistingOutline(content, options);
        }
        const chunks = this.splitDocument(content, options?.chunkingStrategy);
        const partialOutlines = [];
        // Generate outline for each chunk, one request at a time and in document order
        for (const chunk of chunks) {
            const outline = await this.generateChunkOutline(chunk, options);
            partialOutlines.push(...outline);
        }
        if (partialOutlines.length === 0) {
            return this.extractExistingOutline(content, options);
        }
        // A single chunk needs no merge pass
        const mergedOutline = chunks.length === 1 ? partialOutlines : await this.mergeOutlines(partialOutlines);
        return (0, utils_1.formatOutline)(mergedOutline, options?.format || this.config.defaultFormat);
    }
    /**
     * Produce outline nodes for one chunk.
     *
     * @param {string} chunk - Chunk text.
     * @param {object} [options] - Prompt and heading-filter options.
     * @returns {Promise<Array>} Nodes parsed from the client's response, or the
     *   heading-derived tree when there is no client, no response, or the
     *   response parses to zero nodes.
     */
    async generateChunkOutline(chunk, options) {
        if (this.openaiClient) {
            const prompt = this.buildPrompt(chunk, options);
            const jsonResponse = await this.openaiClient.generateOutline(prompt, chunk);
            if (jsonResponse) {
                const parsedNodes = (0, utils_1.parseJsonOutline)(jsonResponse);
                if (parsedNodes.length > 0) {
                    return parsedNodes;
                }
            }
        }
        // Fallback to regex extraction
        const headings = (0, utils_1.extractHeadings)(chunk);
        const filtered = this.filterHeadings(headings, options);
        return (0, utils_1.buildOutlineTree)(filtered);
    }
    /**
     * Ask the client to merge per-chunk outline nodes into one outline.
     *
     * @param {Array} outlines - Outline nodes gathered from all chunks.
     * @returns {Promise<Array>} Parsed merged nodes, or `outlines` unchanged when
     *   there is no client, no response, or the response parses to zero nodes.
     */
    async mergeOutlines(outlines) {
        if (!this.openaiClient) {
            return outlines;
        }
        const mergedContent = outlines.map(node => (0, utils_1.formatOutline)([node], 'markdown')).join('\n\n---\n\n');
        const prompt = `Merge the following partial outlines into a single, coherent multi-level outline.
Maintain hierarchy and remove duplicates. Output only the final outline.`;
        const jsonResponse = await this.openaiClient.generateOutline(prompt, mergedContent);
        if (jsonResponse) {
            const parsedNodes = (0, utils_1.parseJsonOutline)(jsonResponse);
            if (parsedNodes.length > 0) {
                return parsedNodes;
            }
        }
        // Fallback: return original outlines
        return outlines;
    }
    /**
     * Restrict headings by level and relative depth.
     *
     * @param {Array<{level: number}>} headings - Headings to filter; not mutated.
     * @param {object} [options] - `minHeadingLevel` / `maxHeadingLevel` bound the
     *   absolute level (inclusive); `maxDepth` keeps headings fewer than that many
     *   levels below the shallowest remaining heading. Falsy bounds (including 0)
     *   are treated as not set.
     * @returns {Array<{level: number}>} The headings that pass every active filter.
     */
    filterHeadings(headings, options) {
        let filtered = headings;
        if (options?.minHeadingLevel) {
            filtered = filtered.filter(h => h.level >= options.minHeadingLevel);
        }
        if (options?.maxHeadingLevel) {
            filtered = filtered.filter(h => h.level <= options.maxHeadingLevel);
        }
        if (options?.maxDepth) {
            // Fold instead of Math.min(...levels) to stay clear of the argument limit;
            // an empty list yields Infinity, exactly as Math.min() with no arguments does.
            const minLevel = filtered.reduce((shallowest, h) => Math.min(shallowest, h.level), Infinity);
            filtered = filtered.filter(h => h.level - minLevel < options.maxDepth);
        }
        return filtered;
    }
    /**
     * Build the instruction prompt sent with a chunk.
     *
     * @param {string} content - Chunk text; only its length is used, to choose a
     *   default depth (2 levels above 2000 characters, otherwise 1).
     * @param {object} [options] - `maxDepth` and `format` overrides.
     * @returns {string} The prompt text.
     */
    buildPrompt(content, options) {
        const depth = options?.maxDepth || (content.length > 2000 ? 2 : 1);
        const format = options?.format || this.config.defaultFormat;
        return `You are a world-class technical writer. Extract a clean ${depth}-level outline from the given text.
Output format: ${format}
Requirements:
- Focus on main topics and key points
- Maintain logical hierarchy
- Be concise but comprehensive
- No explanations, only the outline`;
    }
    /**
     * Measure how evenly headings are spread across the levels in use, as the
     * Shannon entropy of the level distribution divided by its maximum.
     *
     * @param {Array<{level: number}>} headings
     * @returns {number} A value in [0, 1]; 0 when there are no headings or only
     *   one distinct level.
     */
    calculateBalance(headings) {
        if (headings.length === 0)
            return 0;
        const levels = headings.map(h => h.level);
        const counts = {};
        for (const level of levels) {
            counts[level] = (counts[level] || 0) + 1;
        }
        const probs = Object.values(counts).map(c => c / levels.length);
        const entropy = -probs.reduce((sum, p) => sum + p * Math.log2(p), 0);
        const maxEntropy = Math.log2(Object.keys(counts).length || 1);
        return maxEntropy === 0 ? 0 : entropy / maxEntropy;
    }
    /**
     * Measure how rarely the outline skips levels on the way down.
     *
     * @param {Array<{level: number}>} headings
     * @returns {number} 1 minus (number of consecutive pairs whose level increases
     *   by more than one, divided by the heading count); 1 for zero or one heading.
     */
    calculateCoherence(headings) {
        if (headings.length <= 1)
            return 1;
        const levels = headings.map(h => h.level);
        let jumps = 0;
        for (let i = 1; i < levels.length; i++) {
            if (levels[i] - levels[i - 1] > 1) {
                jumps++;
            }
        }
        return 1 - (jumps / levels.length);
    }
    /**
     * Build the cache key for a (content, options) pair.
     *
     * The key hashes the entire content: keying on length plus a short prefix
     * would let two different documents with the same length and opening text
     * share a cache entry.
     *
     * @param {string} content - Full document text.
     * @param {object} [options] - Per-call options; serialized with JSON.stringify.
     * @returns {string} `<length>_<sha256 hex digest>_<options JSON>`.
     */
    getCacheKey(content, options) {
        const optStr = JSON.stringify(options || {});
        const digest = (0, crypto_1.createHash)('sha256').update(content, 'utf8').digest('hex');
        return `${content.length}_${digest}_${optStr}`;
    }
}
// Publish the class as this module's named `OutlineExtractor` export.
exports.OutlineExtractor = OutlineExtractor;
//# sourceMappingURL=extractor.js.map