/*
 * html-content-processor
 * Version: (not specified)
 * A professional library for processing, cleaning, filtering, and converting HTML content to Markdown. Features advanced customization options, presets, plugin support, a fluent API, and TypeScript integration for reliable content extraction.
 * 215 lines (214 loc) • 10.2 kB
 * JavaScript
 */
;
// Drives a generator function as an asynchronous routine: each yielded value is
// awaited, its outcome is fed back into the generator, and the returned promise
// settles with the generator's return value (or rejects with whatever it throws).
// An already-installed `this.__awaiter` takes precedence over this definition.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Plain (non-P) values are wrapped so that every yielded result can be chained.
    const asPromise = (value) => {
        if (value instanceof P) {
            return value;
        }
        return new P((resolve) => { resolve(value); });
    };
    // P falls back to the native Promise when the caller passes nothing.
    return new (P || (P = Promise))((resolve, reject) => {
        // Either finish with the generator's return value or wait for the yielded one.
        const advance = (result) => {
            if (result.done) {
                resolve(result.value);
                return;
            }
            asPromise(result.value).then(onFulfilled, onRejected);
        };
        // Resume the generator with the awaited value.
        const onFulfilled = (value) => {
            try {
                advance(generator.next(value));
            }
            catch (e) {
                reject(e);
            }
        };
        // Re-raise the rejection inside the generator so its own try/catch can handle it.
        const onRejected = (value) => {
            try {
                advance(generator["throw"](value));
            }
            catch (e) {
                reject(e);
            }
        };
        // Instantiate the generator with the caller's `this`/arguments and take the first step.
        advance((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.DefaultMarkdownGenerator = void 0;
const html2text_1 = require("./html2text");
/**
 * Matches inline Markdown links and images: `[text](url)` and `[text](url "title")`,
 * each optionally prefixed with `!` (image form).
 *
 * Capture groups:
 *   1 - text between the square brackets (at least one character, no `]`)
 *   2 - URL, matched lazily up to the optional quoted title or the closing `)`
 *   3 - title without its surrounding double quotes (undefined when absent)
 *
 * The `g` flag makes `exec()` stateful through `lastIndex`; because this single
 * instance is shared module-wide, a scan that stops early leaves that state behind.
 */
const LINK_PATTERN = /!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)/g;
/**
 * Resolves `url` against `baseUrl` using a few cheap shortcuts.
 *
 * - URLs starting with `http://`, `https://`, `mailto:`, `//` or `data:` are returned untouched.
 * - Without a base URL nothing can be resolved, so `url` is returned untouched.
 * - A root-relative path (`/...`) is appended to the protocol and host of the base; if the
 *   base cannot be parsed, the path is concatenated onto it (avoiding a doubled slash).
 * - Anything else is resolved with the standard `URL` constructor; on failure `url` is returned.
 *
 * @param baseUrl The base URL (may be empty).
 * @param url The URL to join.
 * @returns The joined URL.
 */
function fastUrlJoin(baseUrl, url) {
    const passthroughPrefixes = ['http://', 'https://', 'mailto:', '//', 'data:'];
    const isSelfContained = passthroughPrefixes.some((prefix) => url.startsWith(prefix));
    if (isSelfContained || !baseUrl) {
        return url;
    }
    if (!url.startsWith('/')) {
        // Document-relative reference: let the URL parser do the resolution.
        try {
            return new URL(url, baseUrl).href;
        }
        catch (e) {
            return url;
        }
    }
    // Root-relative path: keep only the protocol and host of the base.
    let origin;
    try {
        const parsedBase = new URL(baseUrl);
        origin = `${parsedBase.protocol}//${parsedBase.host}`;
    }
    catch (e) {
        // The base is not a parseable URL; fall back to plain concatenation.
        const trimmedBase = baseUrl.endsWith('/') ? baseUrl.slice(0, -1) : baseUrl;
        return trimmedBase + url;
    }
    return `${origin}${url}`;
}
/**
 * Converts HTML to Markdown and derives two optional variants from it:
 * a citation form (inline links replaced by numbered ⟨n⟩ markers plus a
 * references section) and a "fit" form built from filter-selected HTML chunks.
 */
class DefaultMarkdownGenerator {
    /**
     * Creates a Markdown generator instance.
     * @param contentFilter Content filter instance, or null for none. When used it must
     *   provide `filterContent(html)` yielding an array of HTML chunks.
     * @param options Generation options. They are merged over
     *   `{ contentSource: 'cleaned_html' }` and later forwarded to the HTML converter.
     */
    constructor(contentFilter = null, options = {}) {
        this.contentFilter = contentFilter;
        this.options = Object.assign({ contentSource: 'cleaned_html' }, options);
    }
    /**
     * Converts inline links and images to reference style.
     *
     * Every distinct (resolved) URL gets one reference number, assigned in order of
     * first appearance. Links become `text⟨n⟩`, images become `![text⟨n⟩]`.
     *
     * @param markdown Markdown text.
     * @param baseUrl Base URL for resolving relative links; empty to leave URLs as written.
     * @returns A tuple `[convertedMarkdown, referencesSection]`; the references section
     *   is an empty string when the text contains no links.
     */
    convertLinksToRefs(markdown, baseUrl = '') {
        const linkMap = new Map(); // resolved URL -> { refNum, description }
        const urlCache = new Map(); // raw URL -> resolved URL
        const parts = [];
        let lastEnd = 0;
        let counter = 1;
        let match;
        // LINK_PATTERN is a shared /g regex, so exec() resumes from its lastIndex.
        // Always start from the beginning; leftover state from an interrupted scan
        // would otherwise make links at the start of the text be skipped silently.
        LINK_PATTERN.lastIndex = 0;
        while ((match = LINK_PATTERN.exec(markdown)) !== null) {
            parts.push(markdown.slice(lastEnd, match.index));
            const [fullMatch, text, url, title] = match;
            let resolvedUrl = url;
            const isAbsolute = url.startsWith('http://') || url.startsWith('https://') ||
                url.startsWith('mailto:') || url.startsWith('data:');
            if (baseUrl && !isAbsolute) {
                if (!urlCache.has(url)) {
                    urlCache.set(url, fastUrlJoin(baseUrl, url));
                }
                resolvedUrl = urlCache.get(url) || url;
            }
            let linkInfo = linkMap.get(resolvedUrl);
            if (!linkInfo) {
                // The first occurrence of a URL defines its number and description.
                const descriptions = [];
                if (title) {
                    descriptions.push(title);
                }
                // Add text as description only if it's different from title and not empty
                if (text && text.trim() && text !== title) {
                    descriptions.push(text.trim());
                }
                linkInfo = {
                    refNum: counter,
                    description: descriptions.length ? ': ' + descriptions.join(' - ') : ''
                };
                linkMap.set(resolvedUrl, linkInfo);
                counter++;
            }
            // Use ⟨n⟩ format for references to avoid conflict with square brackets in code
            const refText = text.trim() || resolvedUrl; // Use URL as text if text is blank
            if (fullMatch.startsWith('!')) { // Image
                parts.push(`![${refText}⟨${linkInfo.refNum}⟩]`);
            }
            else { // Regular link
                parts.push(`${refText}⟨${linkInfo.refNum}⟩`);
            }
            lastEnd = match.index + fullMatch.length;
        }
        parts.push(markdown.slice(lastEnd));
        const convertedText = parts.join('');
        if (linkMap.size === 0) {
            return [convertedText, '']; // No links, no references section
        }
        const references = ['\n\n## References\n\n'];
        // A Map iterates in insertion order, which is already ascending refNum order.
        for (const [url, linkInfo] of linkMap) {
            references.push(`⟨${linkInfo.refNum}⟩ ${url}${linkInfo.description}\n`);
        }
        return [convertedText, references.join('')];
    }
    /**
     * Generates Markdown from HTML.
     *
     * Failures are reported inside the result instead of rejecting: a failed
     * conversion, citation pass or filter pass puts an error message into the
     * affected field(s), and any other failure fills the result with one message.
     *
     * @param inputHtml Input HTML string (a falsy value is treated as empty).
     * @param baseUrl Base URL for resolving relative links.
     * @param html2textOptions Options for HTML to text conversion; they override both the
     *   built-in defaults and the instance options.
     * @param contentFilterOverride Optional override for the content filter.
     * @param citations Whether to generate citations for links.
     * @returns Promise of `{ rawMarkdown, markdownWithCitations, referencesMarkdown,
     *   fitMarkdown, fitHtml }`. `fitMarkdown` falls back to `rawMarkdown` when no
     *   filter ran or the filtered result is empty.
     */
    generateMarkdown(inputHtml, baseUrl = '', html2textOptions = {}, contentFilterOverride = null, citations = true) {
        return __awaiter(this, void 0, void 0, function* () {
            // Strips indentation in front of a code fence. Only spaces/tabs are matched:
            // `\s` would also swallow line breaks and thereby delete the blank lines that
            // precede a fence (including trailing blank lines inside a code block).
            const indentedFence = /^[ \t]+```/gm;
            try {
                const h = new html2text_1.CustomHtml2Text(baseUrl);
                const defaultOptions = {
                    bodyWidth: 0,
                    ignoreEmphasis: false,
                    ignoreLinks: false,
                    ignoreImages: false,
                    protectLinks: false,
                    singleLineBreak: true,
                    markCode: true,
                    escapeSnob: false,
                };
                // Precedence: built-in defaults < instance options < per-call options.
                const finalOptions = Object.assign(Object.assign(Object.assign({}, defaultOptions), this.options), html2textOptions);
                h.updateParams(finalOptions);
                const effectiveHtml = inputHtml || '';
                let rawMarkdown;
                try {
                    rawMarkdown = yield h.handle(effectiveHtml);
                }
                catch (e) {
                    rawMarkdown = `Error converting HTML to markdown: ${e instanceof Error ? e.message : String(e)}`;
                }
                // Basic cleanup for Markdown (extra spaces before code fences)
                rawMarkdown = rawMarkdown.replace(indentedFence, '```');
                let markdownWithCitations = rawMarkdown;
                let referencesMarkdown = '';
                if (citations) {
                    try {
                        [markdownWithCitations, referencesMarkdown] = this.convertLinksToRefs(rawMarkdown, baseUrl);
                    }
                    catch (e) {
                        // Preserve raw markdown if citation generation fails
                        referencesMarkdown = `\n\nError generating citations: ${e instanceof Error ? e.message : String(e)}`;
                    }
                }
                let fitMarkdown = '';
                let fitHtml = '';
                const filterToUse = contentFilterOverride || this.contentFilter;
                if (filterToUse) {
                    try {
                        const filteredChunks = yield filterToUse.filterContent(effectiveHtml);
                        // Chunks are joined as-is; the converter is expected to handle block elements.
                        fitHtml = filteredChunks.join('\n');
                        fitMarkdown = yield h.handle(fitHtml);
                        fitMarkdown = fitMarkdown.replace(indentedFence, '```'); // Same cleanup as for rawMarkdown
                    }
                    catch (e) {
                        fitMarkdown = `Error generating fit markdown: ${e instanceof Error ? e.message : String(e)}`;
                        fitHtml = `Error during HTML filtering for fit content: ${e instanceof Error ? e.message : String(e)}`;
                    }
                }
                return {
                    rawMarkdown: rawMarkdown || '',
                    markdownWithCitations: markdownWithCitations || '',
                    referencesMarkdown: referencesMarkdown || '',
                    fitMarkdown: fitMarkdown || rawMarkdown,
                    fitHtml: fitHtml || ''
                };
            }
            catch (e) {
                const errorMsg = `Error in markdown generation: ${e instanceof Error ? e.message : String(e)}`;
                return {
                    rawMarkdown: errorMsg,
                    markdownWithCitations: errorMsg,
                    referencesMarkdown: '',
                    fitMarkdown: errorMsg,
                    fitHtml: errorMsg
                };
            }
        });
    }
}
exports.DefaultMarkdownGenerator = DefaultMarkdownGenerator;