UNPKG

markdown-crawler

Version:

A powerful web crawler that extracts content from web pages and converts them to clean Markdown format, with support for code blocks and GitHub Flavored Markdown

36 lines (35 loc) 1.36 kB
import { unified } from 'unified'; import remarkParse from 'remark-parse'; import remarkStringify from 'remark-stringify'; import { visit, SKIP } from 'unist-util-visit'; function cleanMarkdownWithRemark(markdownContent) { const processor = unified() .use(remarkParse) // 解析 Markdown 為 AST .use(() => (tree) => { // 選擇性地移除圖像節點 visit(tree, 'image', (node, index, parent) => { // 檢查圖像 URL 是否為 base64 格式 if (node.url.startsWith('data:image')) { if (parent && index !== null) { // 創建一個替代的文字節點 const replacementNode = { type: 'text', value: `[${node.alt || ''} (base64 image removed)]` }; // 用替代節點替換 base64 圖像節點 parent.children.splice(index, 1, replacementNode); return [SKIP, index + 1]; } } // 對於外部連結圖像,保持不變 }); }) .use(remarkStringify, { bullet: '-', fences: true, listItemIndent: 'one', }); const result = processor.processSync(markdownContent); return String(result); } export { cleanMarkdownWithRemark };