UNPKG

markdown-crawler

Version:

A powerful web crawler that extracts content from web pages and converts them to clean Markdown format, with support for code blocks and GitHub Flavored Markdown.

105 lines (104 loc) 4.93 kB
#!/usr/bin/env node
"use strict";
// --- TypeScript CommonJS interop helpers (standard tsc emit; behavior unchanged) ---
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const commander_1 = require("commander");
const crawler_js_1 = require("./crawler.js");
const yaml = __importStar(require("yaml")); // Changed import style
const fs_1 = __importDefault(require("fs"));
const path_1 = __importDefault(require("path"));
// CLI entry point: crawl a URL tree, convert each page's Markdown to a YAML
// document, and write the result to a file in the current working directory.
const program = new commander_1.Command();
program
    .name('md-crawler')
    .description('Crawl web pages and convert to YAML format Markdown. Will recursively crawl all pages in subdirectories.')
    // FIX: the original source had a raw newline inside this single-quoted
    // literal (a syntax error); use an explicit \n escape instead.
    .argument('<url>', 'URL to crawl.\nFor URLs containing spaces, wrap them in double quotes: "http://example.com/my page"')
    .argument('<output>', 'Output YAML file name. Will be saved in the current working directory.')
    .action(async (url, output) => {
    try {
        console.log('Starting web crawl...');
        // Build a glob pattern that matches every page beneath the starting
        // URL so the crawler recurses into subdirectories.
        const additionalGlobalUrls = [
            url.endsWith('/')
                ? `${url}**/*`
                : `${url.substring(0, url.lastIndexOf('/'))}/**/*`,
        ];
        const results = await (0, crawler_js_1.crawl)(url, additionalGlobalUrls);
        console.log('Converting format...');
        // One YAML list entry per crawled page; the Markdown body is stored
        // verbatim under `content`.
        const yamlData = results.map(({ title, url, markdown }) => ({
            url,
            title,
            content: markdown,
        }));
        // Serialize with options chosen to keep the Markdown readable:
        const yamlString = yaml.stringify(yamlData, {
            indent: 2,
            lineWidth: 0, // disable line-width limit so long lines are not folded
            doubleQuotedAsJSON: false, // avoid over-escaping
            doubleQuotedMinMultiLineLength: Infinity, // never double-quote multi-line strings
            defaultStringType: 'BLOCK_LITERAL', // emit block literals (|)
        });
        // Normalize the extension: accept both .yaml and .yml as already-valid
        // (previously `x.yml` became `x.yml.yaml`); otherwise append .yaml.
        const outputWithExt = output.endsWith('.yaml') || output.endsWith('.yml')
            ? output
            : `${output}.yaml`;
        const outputPath = path_1.default.resolve(process.cwd(), outputWithExt);
        fs_1.default.writeFileSync(outputPath, yamlString);
        console.log(`Success! Results saved to: ${outputPath}`);
    }
    catch (error) {
        // Surface the failure and exit non-zero so shell pipelines can detect it.
        console.error('Error occurred:', error);
        process.exit(1);
    }
});
program.parse();