UNPKG

markdown-crawler

Version:

A powerful web crawler that extracts content from web pages and converts them to clean Markdown format, with support for code blocks and GitHub Flavored Markdown

github.com/gkctou/md-crawler

gkctou/md-crawler

41 lines (40 loc) • 1.74 kB

JavaScript

"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.cleanMarkdownWithRemark = cleanMarkdownWithRemark; const unified_1 = require("unified"); const remark_parse_1 = __importDefault(require("remark-parse")); const remark_stringify_1 = __importDefault(require("remark-stringify")); const unist_util_visit_1 = require("unist-util-visit"); function cleanMarkdownWithRemark(markdownContent) { const processor = (0, unified_1.unified)() .use(remark_parse_1.default) // 解析 Markdown 為 AST .use(() => (tree) => { // 選擇性地移除圖像節點 (0, unist_util_visit_1.visit)(tree, 'image', (node, index, parent) => { // 檢查圖像 URL 是否為 base64 格式 if (node.url.startsWith('data:image')) { if (parent && index !== null) { // 創建一個替代的文字節點 const replacementNode = { type: 'text', value: `[${node.alt || ''} (base64 image removed)]` }; // 用替代節點替換 base64 圖像節點 parent.children.splice(index, 1, replacementNode); return [unist_util_visit_1.SKIP, index + 1]; } } // 對於外部連結圖像，保持不變 }); }) .use(remark_stringify_1.default, { bullet: '-', fences: true, listItemIndent: 'one', }); const result = processor.processSync(markdownContent); return String(result); }