markdown-crawler
Version:
A powerful web crawler that extracts content from web pages and converts them to clean Markdown format, with support for code blocks and GitHub Flavored Markdown
197 lines (196 loc) • 7.67 kB
JavaScript
;
// Interop helper in the shape emitted by the TypeScript compiler: reuse an
// existing `this.__importDefault` when one is present, otherwise wrap plain
// (non-ES-module) exports so they can be read through a `.default` property.
// `var` is kept on purpose so the declaration may coexist with an identical
// helper in a concatenated bundle.
var __importDefault = (this && this.__importDefault) || function (mod) {
    const isEsModule = Boolean(mod && mod.__esModule);
    if (isEsModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.extract_from_url = extract_from_url;
exports.extract_from_html = extract_from_html;
const readability_1 = require("@mozilla/readability");
const jsdom_1 = require("jsdom"); // Added VirtualConsole import
const turndown_1 = __importDefault(require("turndown"));
const turndown_plugin_gfm_1 = require("turndown-plugin-gfm");
const clearup_js_1 = require("./clearup.js");
// Configure the shared TurndownService instance used for every conversion.
const turndownService = new turndown_1.default({
    headingStyle: 'atx',
    hr: '---',
    bulletListMarker: '-',
    codeBlockStyle: 'fenced',
    linkStyle: 'inlined', // emit links inline: [text](url)
    linkReferenceStyle: 'full', // reference style setting (links are inlined above)
    // Blank nodes: block elements become a paragraph break, inline ones vanish.
    blankReplacement(_content, node) {
        if (node.isBlock) {
            return '\n\n';
        }
        return '';
    },
    // Kept nodes: block content is surrounded by blank lines, inline content is passed through.
    keepReplacement(content, node) {
        if (!node.isBlock) {
            return content;
        }
        return `\n\n${content}\n\n`;
    },
    // Nodes with no matching rule: same treatment as kept nodes.
    defaultReplacement(content, node) {
        if (!node.isBlock) {
            return content;
        }
        return `\n\n${content}\n\n`;
    }
});
// Enable the GitHub Flavored Markdown extensions from turndown-plugin-gfm.
turndownService.use(turndown_plugin_gfm_1.gfm);
// Replace the escape hook with an identity function: no character is escaped,
// so text is emitted exactly as it appears in the source.
turndownService.escape = (text) => text;
// Custom rule for <a> elements: always emit an inline Markdown link.
//  - An anchor without an href contributes only its content.
//  - Backslash escapes are stripped from both the URL and the link text.
//  - The optional title is kept on one line and its double quotes are escaped,
//    because it is written between double quotes; an unescaped `"` would end
//    the title early and break the link.
turndownService.addRule('links', {
    filter: ['a'],
    replacement: function (content, node) {
        const href = node.getAttribute('href') || '';
        if (href === '') {
            return content;
        }
        // Drop the backslash of any `\x` escape sequence.
        const cleanHref = href.replace(/\\(.)/g, '$1');
        const cleanContent = content.replace(/\\(.)/g, '$1');
        const title = (node.title || '')
            .replace(/\s+/g, ' ')
            .trim()
            .replace(/"/g, '\\"');
        // Standard inline Markdown link, with the title only when present.
        return title
            ? `[${cleanContent}](${cleanHref} "${title}")`
            : `[${cleanContent}](${cleanHref})`;
    }
});
// Render <input type="checkbox"> elements as task-list markers.
turndownService.addRule('checkbox', {
    filter: (node) => {
        if (node.nodeName !== 'INPUT') {
            return false;
        }
        return node.getAttribute('type') === 'checkbox';
    },
    // `[x] ` for a checked box, `[ ] ` otherwise (trailing space included).
    replacement: (_content, node) => (node.checked ? '[x] ' : '[ ] ')
});
// Convert every <pre> element into a fenced code block.
//  - The info string comes from getExt(node) (empty when no language is found).
//  - The code is the concatenated text of the element's child nodes, minus one
//    trailing newline so no empty line appears before the closing fence.
//  - The fence is made longer than any backtick fence found inside the code,
//    otherwise a ``` line in the code would terminate the block early.
turndownService.addRule('fenceAllPreformattedText', {
    filter: ['pre'],
    replacement: function (content, node) {
        const ext = getExt(node);
        const code = [...node.childNodes]
            .map(c => c.textContent)
            .join('')
            .replace(/\n$/, '');
        // A fence may be indented by up to three spaces and still count as a fence.
        let fenceSize = 3;
        for (const [, run] of code.matchAll(/^ {0,3}(`{3,})/gm)) {
            if (run.length >= fenceSize) {
                fenceSize = run.length + 1;
            }
        }
        const fence = '`'.repeat(fenceSize);
        return `\n${fence}${ext}\n${code}\n${fence}\n\n`;
    }
});
// Wrap the content of <del> and <s> elements in single tildes.
turndownService.addRule('strikethrough', {
    filter: ['del', 's'],
    replacement: (content) => `~${content}~`
});
// Custom handling of <br>: emit a bare newline, with no trailing backslash.
turndownService.addRule('lineBreaks', {
    filter: 'br',
    replacement: () => '\n'
});
/**
 * Works out the language identifier for a <pre> element from a
 * `language-xxx` or `highlight-source-xxx` token in its markup.
 *
 * Lookup order:
 *   1. the opening tags inside the element's own outerHTML, in document order
 *      (the <pre> tag itself first, then nested tags such as <code>);
 *   2. the opening tag of the parent element.
 *
 * Only tags are inspected, never text, so code that merely mentions
 * "language-foo" does not influence the result.
 *
 * @param {{outerHTML: string, parentNode?: ({outerHTML?: string}|null)}} node
 *   The <pre> element (or any object exposing the same properties).
 * @returns {string} The identifier after the prefix (e.g. `js`, `c++`,
 *   `python3`), or '' when none is found.
 */
const getExt = (node) => {
    // The identifier is captured in full, so digits, `+`, `#` and inner
    // hyphens survive instead of being cut at the first non-letter.
    const langClass = /(?:highlight-source-|language-)([A-Za-z0-9][\w+#-]*)/;
    const findLang = (tags) => {
        for (const tag of tags) {
            const match = tag.match(langClass);
            if (match) {
                return match[1];
            }
        }
        return '';
    };
    // Opening tags only: `<` followed by a letter excludes closing tags and comments.
    const ownTags = (node.outerHTML || '').match(/<[A-Za-z][^>]*>/g) || [];
    const own = findLang(ownTags);
    if (own) {
        return own;
    }
    // Check the parent just in case. A detached node has no parent, and a
    // non-element parent has no outerHTML.
    const parentHtml = node.parentNode?.outerHTML;
    if (typeof parentHtml !== 'string') {
        return '';
    }
    return findLang([`${parentHtml.split('>').shift()}>`]);
};
/**
 * Extracts the readable article from a JSDOM instance and converts it to Markdown.
 *
 * Pipeline: normalize the links of the live document, run Readability on it,
 * make sure the article starts with an <h1> carrying the title, convert the
 * HTML with the shared Turndown service, then post-process the Markdown text.
 *
 * @param {object} dom - JSDOM instance; `dom.window.document` is read and mutated.
 * @returns {Array} `[title, markdown]`, where `title` is the whitespace-normalized
 *   article title ('' when absent) and `markdown` is the value returned by
 *   `cleanMarkdownWithRemark` for the converted text.
 * @throws {Error} "Failed to parse article" when Readability returns no article.
 */
function extract_from_dom(dom) {
    const document = dom.window.document;
    // Pre-process every link so it converts to a well-formed Markdown link.
    document.querySelectorAll('a').forEach(link => {
        const rawHref = link.getAttribute('href');
        const isBlank = (link.textContent || '').trim() === '';
        // Assigning textContent removes all child nodes, so a link that only
        // wraps an image must be left alone or the image would be lost.
        const wrapsImage = isBlank && link.querySelector('img') !== null;
        // A link with an href but no visible text shows its href instead.
        if (isBlank && rawHref && !wrapsImage) {
            link.textContent = rawHref;
        }
        // Strip backslash escapes from the href.
        if (rawHref) {
            link.setAttribute('href', rawHref.replace(/\\(.)/g, '$1'));
        }
        // Collapse newlines and runs of whitespace in the link text.
        if (link.textContent && !wrapsImage) {
            link.textContent = link.textContent.replace(/\s+/g, ' ').trim();
        }
    });
    const article = new readability_1.Readability(document, {
        keepClasses: true,
        debug: false,
        charThreshold: 100,
    }).parse();
    if (!article) {
        throw new Error("Failed to parse article");
    }
    const title = (article.title || '').replace(/\n/g, ' ').replace(/\s+/g, ' ').trim();
    // Remove HTML comments, including ones that span several lines.
    const html = (article.content || '').replace(/<!--[\s\S]*?-->/g, '');
    // Re-parse the article so the remaining fixes operate on nodes, not on raw markup.
    const tempDom = new jsdom_1.JSDOM(html);
    const tempDoc = tempDom.window.document;
    // Ensure the article carries the title as an <h1>.
    if (title.length > 0) {
        const firstH2 = tempDoc.querySelector('h2');
        // Compare decoded text, so entities such as &amp; do not hide a match.
        const h2Text = (firstH2?.textContent || '').replace(/\s+/g, ' ').trim();
        const h1 = tempDoc.createElement('h1');
        if (firstH2 && h2Text.includes(title)) {
            // The first <h2> already holds the title: promote it to <h1>,
            // keeping its attributes and children.
            for (const { name, value } of [...firstH2.attributes]) {
                h1.setAttribute(name, value);
            }
            while (firstH2.firstChild) {
                h1.appendChild(firstH2.firstChild);
            }
            firstH2.parentNode.replaceChild(h1, firstH2);
        }
        else {
            // Otherwise prepend the title. textContent keeps characters such
            // as `<` and `&` in the title from being parsed as markup.
            h1.textContent = title;
            tempDoc.body.insertBefore(h1, tempDoc.body.firstChild);
        }
    }
    // Strip backslash escapes from the hrefs of the extracted article as well.
    tempDoc.querySelectorAll('a').forEach(link => {
        const href = link.getAttribute('href');
        if (href) {
            link.setAttribute('href', href.replace(/\\(.)/g, '$1'));
        }
    });
    // Convert to Markdown.
    let res = turndownService.turndown(tempDoc.body.innerHTML);
    // NOTE(review): the backslash clean-ups below run over the whole text,
    // fenced code included — confirm that is intended for code samples.
    // Remove a trailing backslash (hard line break marker) at the end of a line.
    res = res.replace(/\\$/gm, '');
    // Paragraph formatting commonly seen in Japanese text.
    res = res.replace(/\\\s+/g, ' '); // backslash followed by whitespace -> single space
    res = res.replace(/\\\n/g, '\n'); // backslash followed by newline (already consumed by the line above)
    // Collapse three or more consecutive newlines into one blank line.
    res = res.replace(/\n{3,}/g, '\n\n');
    // Put a line break after a 【…】 heading that is followed by a backslash.
    res = res.replace(/【(.+?)】\\\s*/g, '【$1】\n');
    return [title, (0, clearup_js_1.cleanMarkdownWithRemark)(res)];
}
/**
 * Loads a page with `JSDOM.fromURL` and extracts its article as Markdown.
 *
 * @param {string} page - URL handed to `JSDOM.fromURL`.
 * @returns {Promise<Array>} Resolves to the `[title, markdown]` pair produced
 *   by `extract_from_dom`.
 */
async function extract_from_url(page) {
    const jsdomOptions = {
        runScripts: 'outside-only',
        // Fresh VirtualConsole with no listeners attached.
        virtualConsole: new jsdom_1.VirtualConsole(),
        pretendToBeVisual: false // no need to simulate a visual environment
    };
    const loadedDom = await jsdom_1.JSDOM.fromURL(page, jsdomOptions);
    return extract_from_dom(loadedDom);
}
/**
 * Parses an HTML string with JSDOM and extracts its article as Markdown.
 *
 * @param {string} html - Markup handed to the `JSDOM` constructor.
 * @returns {Array} The `[title, markdown]` pair produced by `extract_from_dom`.
 */
function extract_from_html(html) {
    const jsdomOptions = {
        runScripts: 'outside-only',
        // Fresh VirtualConsole with no listeners attached.
        virtualConsole: new jsdom_1.VirtualConsole(),
        pretendToBeVisual: false // no need to simulate a visual environment
    };
    const parsedDom = new jsdom_1.JSDOM(html, jsdomOptions);
    return extract_from_dom(parsedDom);
}