UNPKG

markdown-crawler

Version:

A powerful web crawler that extracts content from web pages and converts them to clean Markdown format, with support for code blocks and GitHub Flavored Markdown

github.com/gkctou/md-crawler

gkctou/md-crawler

105 lines (104 loc) • 4.93 kB

JavaScript

#!/usr/bin/env node
"use strict";

// ---------------------------------------------------------------------------
// TypeScript-emitted CommonJS/ES-module interop helpers (generated code).
// ---------------------------------------------------------------------------

// Re-exports property `k` of module `m` on object `o` (optionally as `k2`),
// using a live getter when property descriptors are available.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function (o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function () { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function (o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));

// Sets `o.default` to `v`.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function (o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function (o, v) {
    o["default"] = v;
});

// Implements `import * as ns from "mod"`: returns `mod` unchanged when it is
// flagged `__esModule`, otherwise builds a namespace object from its own
// properties and sets `default` to the module itself.
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function (o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();

// Implements `import x from "mod"`: wraps non-ES modules as `{ default: mod }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};

Object.defineProperty(exports, "__esModule", { value: true });
const commander_1 = require("commander");
const crawler_js_1 = require("./crawler.js");
const yaml = __importStar(require("yaml")); // Changed import style
const fs_1 = __importDefault(require("fs"));
const path_1 = __importDefault(require("path"));

/**
 * Builds the glob pattern covering the "directory" of the start URL and
 * everything beneath it.
 *
 * Examples:
 *   https://example.com/docs/       -> https://example.com/docs/**\/*
 *   https://example.com/docs/intro  -> https://example.com/docs/**\/*
 *   https://example.com             -> https://example.com/**\/*
 * (the backslash above only escapes the comment terminator).
 *
 * The URL is handled as a plain string rather than through `new URL()` so the
 * caller's spelling (e.g. unencoded spaces) is preserved in the pattern.
 *
 * @param {string} url - Start URL exactly as given on the command line.
 * @returns {string} Glob pattern ending in `**` + `/*`.
 */
function buildCrawlScopeGlob(url) {
    // A "/" inside the query string or fragment is not a path separator.
    const queryOrHashIndex = url.search(/[?#]/);
    const base = queryOrHashIndex === -1 ? url : url.slice(0, queryOrHashIndex);

    const schemeIndex = base.indexOf('://');
    const authorityStart = schemeIndex === -1 ? 0 : schemeIndex + 3;
    const lastSlash = base.lastIndexOf('/');

    if (lastSlash < authorityStart) {
        // No path at all (e.g. "https://example.com"). Cutting at the last "/"
        // would land inside "https://" and yield "https://**/*", which matches
        // every host; scope to this origin instead.
        return `${base}/**/*`;
    }
    // Keep everything up to and including the last "/" of the path.
    return `${base.substring(0, lastSlash + 1)}**/*`;
}

const program = new commander_1.Command();

program
    .name('md-crawler')
    .description('Crawl web pages and convert to YAML format Markdown. Will recursively crawl all pages in subdirectories.')
    .argument('<url>', 'URL to crawl. \nFor URLs containing spaces, wrap them in double quotes: "http://example.com/my page"')
    .argument('<output>', 'Output YAML file name. Will be saved in the current working directory.')
    .action(async (url, output) => {
        try {
            console.log('Starting web crawl...');
            // NOTE(review): the glob list is presumably used by crawl() to decide
            // which discovered links are followed; confirm against crawler.js.
            const additionalGlobalUrls = [buildCrawlScopeGlob(url)];
            const results = await (0, crawler_js_1.crawl)(url, additionalGlobalUrls);

            console.log('Converting format...');
            // One YAML entry per crawled page, with the Markdown stored under `content`.
            const yamlData = results.map(({ title, url, markdown }) => ({
                url,
                title,
                content: markdown
            }));

            const yamlString = yaml.stringify(yamlData, {
                indent: 2,
                lineWidth: 0, // disable the line-width limit so long lines are not folded
                doubleQuotedAsJSON: false, // avoid excessive escaping
                doubleQuotedMinMultiLineLength: Infinity, // avoid double quotes for multi-line strings
                defaultStringType: 'BLOCK_LITERAL' // emit strings in block literal (|) style
            });

            // Check and ensure file path ends with .yaml
            const outputWithExt = output.endsWith('.yaml') ? output : `${output}.yaml`;
            const outputPath = path_1.default.resolve(process.cwd(), outputWithExt);
            fs_1.default.writeFileSync(outputPath, yamlString);
            console.log(`Success! Results saved to: ${outputPath}`);
        }
        catch (error) {
            console.error('Error occurred:', error);
            process.exit(1);
        }
    });

// The action handler is async, so use parseAsync() (commander's documented
// entry point for async handlers) and do not leave the promise floating.
program.parseAsync().catch((error) => {
    console.error('Error occurred:', error);
    process.exit(1);
});