UNPKG

markdown-crawler

Version:

A powerful web crawler that extracts content from web pages and converts it to clean Markdown, with support for code blocks and GitHub Flavored Markdown

github.com/gkctou/md-crawler

gkctou/md-crawler

36 lines (35 loc) • 1.36 kB

JavaScript

import { unified } from 'unified';
import remarkParse from 'remark-parse';
import remarkStringify from 'remark-stringify';
import { visit, SKIP } from 'unist-util-visit';

// URI schemes are case-insensitive, so `DATA:image/...` must match as well.
const INLINE_IMAGE_DATA_URI = /^data:image/i;

/**
 * unified plugin: replaces every `image` node whose URL is an inline
 * `data:image...` URI with a plain text placeholder. Images that point at
 * any other URL are left untouched.
 *
 * Despite the "base64" wording in the placeholder, the check only looks at
 * the `data:image` prefix, so any inline image data URI is replaced.
 *
 * @returns {(tree: object) => void} Transformer that mutates the tree in place.
 */
function replaceInlineDataImages() {
  return (tree) => {
    visit(tree, 'image', (node, index, parent) => {
      // Guard against a missing / non-string url instead of throwing.
      if (typeof node.url !== 'string' || !INLINE_IMAGE_DATA_URI.test(node.url)) {
        return undefined;
      }

      // A node can only be swapped out when it sits inside a parent at a
      // known position; the index may be null or undefined otherwise.
      if (!parent || typeof index !== 'number') {
        return undefined;
      }

      // NOTE(review): remark-stringify may escape the leading `[` of this
      // text value when serializing — confirm the rendered output is as intended.
      const placeholder = {
        type: 'text',
        value: `[${node.alt || ''} (base64 image removed)]`,
      };

      // Swap the image for the placeholder, then continue with the next sibling.
      parent.children.splice(index, 1, placeholder);
      return [SKIP, index + 1];
    });
  };
}

/**
 * Normalizes a Markdown document by round-tripping it through remark and
 * replacing inline data-URI images with a short text placeholder.
 *
 * Serialization uses `-` bullets, fenced code blocks, and single-space list
 * item indentation.
 *
 * @param {string} markdownContent - Markdown source to clean.
 * @returns {string} The re-serialized Markdown.
 */
function cleanMarkdownWithRemark(markdownContent) {
  const processor = unified()
    .use(remarkParse) // Markdown -> AST
    .use(replaceInlineDataImages)
    .use(remarkStringify, {
      bullet: '-',
      fences: true,
      listItemIndent: 'one',
    });

  const result = processor.processSync(markdownContent);
  return String(result);
}

export { cleanMarkdownWithRemark };