markdown-crawler
Version:
A powerful web crawler that extracts content from web pages and converts it to clean Markdown format, with support for code blocks and GitHub Flavored Markdown.
41 lines (40 loc) • 1.74 kB
JavaScript
;
// Interop helper: reuse an `__importDefault` already present on `this`,
// otherwise fall back to the local implementation below.
var __importDefault = (this && this.__importDefault) || function (mod) {
    // A module flagged with `__esModule` is handed back untouched.
    if (mod && mod.__esModule) {
        return mod;
    }
    // Anything else is wrapped so the value is reachable as `.default`.
    return { "default": mod };
};
// Define `__esModule: true` on the exports object; interop helpers such as
// `__importDefault` check this flag and return a flagged module unchanged.
Object.defineProperty(exports, "__esModule", { value: true });
// Publish the function under its own name. The function declaration further
// down is hoisted, so the binding is already defined at this point.
exports.cleanMarkdownWithRemark = cleanMarkdownWithRemark;
const unified_1 = require("unified");
const remark_parse_1 = __importDefault(require("remark-parse"));
const remark_stringify_1 = __importDefault(require("remark-stringify"));
const unist_util_visit_1 = require("unist-util-visit");
/**
 * Reports whether `url` is an inline `data:image…` URI.
 *
 * The comparison ignores case because URI schemes and media types are
 * case-insensitive, and it only lower-cases the short prefix so a multi-megabyte
 * payload is never copied. Non-string values (e.g. a node without a `url`)
 * are simply reported as "not a data URI" instead of throwing.
 *
 * @param {unknown} url - Candidate URL taken from an image node.
 * @returns {boolean} `true` when `url` is a string starting with `data:image`.
 */
function isInlineImageDataUrl(url) {
    const prefix = 'data:image';
    return typeof url === 'string'
        && url.slice(0, prefix.length).toLowerCase() === prefix;
}

/**
 * Normalizes Markdown by round-tripping it through remark and replacing every
 * image whose URL is an inline `data:image…` URI with a short text placeholder
 * of the form `[<alt> (base64 image removed)]`. Images that point at any other
 * URL are left untouched.
 *
 * @param {string} markdownContent - Markdown source; handed straight to the
 *   processor's `processSync`.
 * @returns {string} The re-serialized Markdown.
 */
function cleanMarkdownWithRemark(markdownContent) {
    const processor = (0, unified_1.unified)()
        .use(remark_parse_1.default) // Parse the Markdown source into a syntax tree.
        .use(() => (tree) => {
            // Selectively strip embedded images from the tree.
            (0, unist_util_visit_1.visit)(tree, 'image', (node, index, parent) => {
                // Without a parent and a numeric index there is nothing to splice
                // into; externally hosted images are intentionally kept as-is.
                if (!parent || typeof index !== 'number' || !isInlineImageDataUrl(node.url)) {
                    return undefined;
                }
                // Swap the image node for a plain-text placeholder that keeps the alt text.
                parent.children.splice(index, 1, {
                    type: 'text',
                    value: `[${node.alt || ''} (base64 image removed)]`,
                });
                // One node replaced one node, so traversal resumes at the next
                // sibling; the placeholder itself needs no visiting.
                return [unist_util_visit_1.SKIP, index + 1];
            });
        })
        .use(remark_stringify_1.default, {
            bullet: '-', // marker for unordered list items
            fences: true, // always emit fenced code blocks
            listItemIndent: 'one', // indent list item content by one space
        });
    const result = processor.processSync(markdownContent);
    return String(result);
}