UNPKG

markdown-crawler

Version:

A powerful web crawler that extracts content from web pages and converts them to clean Markdown format, with support for code blocks and GitHub Flavored Markdown

67 lines (66 loc) 3.23 kB
#!/usr/bin/env node import { Command } from 'commander'; import { crawl } from './crawler.js'; import * as yaml from 'yaml'; // Changed import style import fs from 'fs'; import path from 'path'; const program = new Command(); program .name('md-crawler') .description('Crawl web pages and convert to YAML format Markdown. Will recursively crawl all pages in subdirectories.') .argument('<url>', 'URL to crawl. For URLs containing spaces, wrap them in double quotes: "http://example.com/my page"') .argument('<output>', 'Output YAML file name. Will be saved in the current working directory.') .action(async (url, output) => { try { console.log('Starting web crawl...'); const additionalGlobalUrls = [url.endsWith('/') ? `${url}**/*` : `${url.substring(0, url.lastIndexOf('/'))}/**/*`]; const results = await crawl(url, additionalGlobalUrls); console.log('Converting format...'); // 创建YAML文档,确保Markdown内容保持正确格式 const yamlData = results.map(({ title, url, markdown }) => { // 使用YAML的块字符串格式(|),并保持原始格式 return { url, title, content: markdown }; }); // 使用yaml.stringify并设置正确的选项 const yamlString = yaml.stringify(yamlData, { indent: 2, lineWidth: 0, // 禁用行宽限制,避免长行被折断 doubleQuotedAsJSON: false, // 避免过度转义 doubleQuotedMinMultiLineLength: Infinity, // 避免多行字符串使用双引号 defaultStringType: 'BLOCK_LITERAL' // 使用块字符串格式(|) }); // 后处理YAML字符串,移除内容中不必要的反斜杠 // const processedYamlString = yamlString // // 保留YAML结构,但移除内容中的反斜杠+换行符组合 // .replace(/(\s*content: \|[\r\n]+)([^]*?)(?=\n\s*-|\n\s*$)/g, (match, prefix, content) => { // // 只处理content部分,保留前缀 // const processedContent = content // // 移除反斜杠+换行符组合,但保留实际换行 // .replace(/\\(\r?\n\s*)/g, '$1') // // 修复URL中的反斜杠 // .replace(/\\\&/g, '&') // // 移除链接中的反斜杠 // .replace(/\\\[/g, '[') // .replace(/\\\]/g, ']') // .replace(/\\\(/g, '(') // .replace(/\\\)/g, ')') // // 保留Markdown代码块中的反斜杠 // .replace(/```([^`]*?)```/g, (codeMatch) => codeMatch.replace(/\\\\/g, '\\\\')); // return prefix + processedContent; // }); // Check and ensure file path ends with .yaml const outputWithExt = output.endsWith('.yaml') ? output : `${output}.yaml`; const outputPath = path.resolve(process.cwd(), outputWithExt); fs.writeFileSync(outputPath, yamlString); console.log(`Success! Results saved to: ${outputPath}`); } catch (error) { console.error('Error occurred:', error); process.exit(1); } }); program.parse();