UNPKG

markdown-crawler

Version:

A powerful web crawler that extracts content from web pages and converts them to clean Markdown format, with support for code blocks and GitHub Flavored Markdown

41 lines (40 loc) 1.74 kB
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.cleanMarkdownWithRemark = cleanMarkdownWithRemark;
const unified_1 = require("unified");
const remark_parse_1 = __importDefault(require("remark-parse"));
const remark_stringify_1 = __importDefault(require("remark-stringify"));
const unist_util_visit_1 = require("unist-util-visit");

/**
 * Matches image URLs that embed their payload inline as a `data:image...` URI.
 * The match is case-insensitive because URI schemes and media types are
 * case-insensitive, so `DATA:IMAGE/PNG;base64,...` must be caught as well.
 */
const INLINE_IMAGE_URI_PATTERN = /^data:image/i;

/**
 * unified plugin: replaces every `image` node whose URL is an inline
 * `data:image` URI with a short text placeholder. Images that point at
 * any other URL are left untouched.
 *
 * @returns {(tree: object) => void} Transformer that mutates the tree in place.
 */
function replaceInlineDataImages() {
    return (tree) => {
        (0, unist_util_visit_1.visit)(tree, 'image', (node, index, parent) => {
            const isInlineImage = typeof node.url === 'string'
                && INLINE_IMAGE_URI_PATTERN.test(node.url);
            // Without a parent and a numeric position there is nothing to splice into.
            if (!isInlineImage || !parent || typeof index !== 'number') {
                return undefined;
            }
            // Keep the alt text so the reader still sees what the image was.
            // NOTE(review): the serializer may escape the leading `[` of this
            // text node in the final Markdown; confirm against real output.
            const placeholder = {
                type: 'text',
                value: `[${node.alt || ''} (base64 image removed)]`,
            };
            parent.children.splice(index, 1, placeholder);
            // The placeholder has no children to visit; resume with the next sibling.
            return [unist_util_visit_1.SKIP, index + 1];
        });
    };
}

/**
 * Parses Markdown, swaps inline `data:image` images for a text placeholder,
 * and serializes the result back to Markdown.
 *
 * Serialization is configured with `-` as the bullet marker, `fences: true`
 * and `listItemIndent: 'one'`.
 *
 * @param {string} markdownContent - Markdown source to clean.
 * @returns {string} The re-serialized Markdown.
 */
function cleanMarkdownWithRemark(markdownContent) {
    const processor = (0, unified_1.unified)()
        .use(remark_parse_1.default) // Markdown text -> AST
        .use(replaceInlineDataImages)
        .use(remark_stringify_1.default, {
            bullet: '-',
            fences: true,
            listItemIndent: 'one',
        });
    const result = processor.processSync(markdownContent);
    return String(result);
}