markdown-crawler
Version:
A powerful web crawler that extracts content from web pages and converts it to clean Markdown, with support for code blocks and GitHub Flavored Markdown.
105 lines (104 loc) • 4.93 kB
JavaScript
;
/**
 * Module-interop helper (same shape as the helpers the TypeScript compiler emits).
 *
 * Exposes property `key` of `source` on `target` under the name `alias`
 * (`alias` falls back to `key` when it is undefined). A pre-existing
 * `this.__createBinding` is reused when available.
 */
var __createBinding = (this && this.__createBinding) || (function () {
    if (!Object.create) {
        // No Object.create available: fall back to a plain value copy.
        return function (target, source, key, alias) {
            var name = alias === undefined ? key : alias;
            target[name] = source[key];
        };
    }
    return function (target, source, key, alias) {
        var name = alias === undefined ? key : alias;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        // Decide whether the source descriptor can be reused verbatim or has
        // to be replaced by a getter that reads through to the source object.
        var needsGetter;
        if (!descriptor) {
            needsGetter = true;
        }
        else if ("get" in descriptor) {
            needsGetter = !source.__esModule;
        }
        else {
            needsGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsGetter) {
            descriptor = {
                enumerable: true,
                get: function () { return source[key]; }
            };
        }
        Object.defineProperty(target, name, descriptor);
    };
})();
/**
 * Module-interop helper: stores `value` as the `default` member of `target`.
 *
 * When `Object.create` exists the member is defined as an enumerable data
 * property via `Object.defineProperty`; otherwise it is assigned directly.
 * A pre-existing `this.__setModuleDefault` is reused when available.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        return function (target, value) {
            var descriptor = { enumerable: true, value: value };
            Object.defineProperty(target, "default", descriptor);
        };
    }
    return function (target, value) {
        target["default"] = value;
    };
})();
/**
 * Module-interop helper: produces a namespace-style object for `mod`.
 *
 * - A truthy `mod` whose `__esModule` flag is truthy is returned unchanged.
 * - Otherwise a fresh object is built: every own property name of `mod`
 *   except "default" is bound through `__createBinding`, and `mod` itself is
 *   attached through `__setModuleDefault`.
 *
 * A pre-existing `this.__importStar` is reused when available.
 */
var __importStar = (this && this.__importStar) || (function () {
    // Resolved on first use, then cached for later calls.
    var resolvedOwnKeys = null;
    var listOwnKeys = function (obj) {
        if (!resolvedOwnKeys) {
            resolvedOwnKeys = Object.getOwnPropertyNames || function (candidate) {
                var names = [];
                for (var name in candidate) {
                    if (Object.prototype.hasOwnProperty.call(candidate, name)) {
                        names[names.length] = name;
                    }
                }
                return names;
            };
        }
        return resolvedOwnKeys(obj);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            var keys = listOwnKeys(mod);
            for (var index = 0; index < keys.length; index++) {
                var key = keys[index];
                if (key === "default") {
                    continue;
                }
                __createBinding(namespace, mod, key);
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
/**
 * Module-interop helper: guarantees the result has a `default` member.
 *
 * A truthy `mod` whose `__esModule` flag is truthy is returned unchanged;
 * any other value is wrapped as `{ default: mod }`.
 * A pre-existing `this.__importDefault` is reused when available.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const commander_1 = require("commander");
const crawler_js_1 = require("./crawler.js");
const yaml = __importStar(require("yaml")); // Changed import style
const fs_1 = __importDefault(require("fs"));
const path_1 = __importDefault(require("path"));
/**
 * Builds the glob pattern handed to the crawler alongside the start URL.
 *
 * - A URL ending in "/" yields `<url>**\/*` (everything below that directory).
 * - Otherwise the last path segment is dropped and `/**\/*` is appended.
 * - If the only slashes are the ones in the "://" scheme separator (for
 *   example "https://example.com"), the URL is kept whole. Cutting at the
 *   last slash there would produce "https://**\/*", which matches every
 *   https URL instead of only the requested site.
 *
 * @param {string} startUrl - URL given on the command line.
 * @returns {string} Glob pattern covering the start URL's directory.
 */
function buildCrawlGlob(startUrl) {
    if (startUrl.endsWith('/')) {
        return `${startUrl}**/*`;
    }
    const schemeSeparator = startUrl.indexOf('://');
    const authorityStart = schemeSeparator === -1 ? 0 : schemeSeparator + 3;
    const lastSlash = startUrl.lastIndexOf('/');
    // Only strip the last segment when the slash belongs to the path,
    // not to the scheme separator.
    const base = lastSlash >= authorityStart ? startUrl.substring(0, lastSlash) : startUrl;
    return `${base}/**/*`;
}
/**
 * Returns `fileName` unchanged when it already has a YAML extension
 * (".yaml" or ".yml", case-insensitive); otherwise appends ".yaml".
 *
 * @param {string} fileName - Output file name given on the command line.
 * @returns {string} File name that ends with a YAML extension.
 */
function ensureYamlExtension(fileName) {
    const lowered = fileName.toLowerCase();
    const hasYamlExtension = lowered.endsWith('.yaml') || lowered.endsWith('.yml');
    return hasYamlExtension ? fileName : `${fileName}.yaml`;
}
// CLI definition: `md-crawler <url> <output>` crawls starting at <url> and
// writes the collected pages to a YAML file.
const program = new commander_1.Command();
program
    .name('md-crawler')
    .description('Crawl web pages and convert to YAML format Markdown. Will recursively crawl all pages in subdirectories.')
    .argument('<url>', 'URL to crawl. For URLs containing spaces, wrap them in double quotes: "http://example.com/my page"')
    .argument('<output>', 'Output YAML file name. Will be saved in the current working directory.')
    .action(async (url, output) => {
        try {
            console.log('Starting web crawl...');
            // NOTE(review): the glob list is passed as crawl()'s second argument;
            // presumably it restricts which discovered links are followed.
            // Confirm against crawler.js.
            const additionalGlobalUrls = [buildCrawlGlob(url)];
            const results = await (0, crawler_js_1.crawl)(url, additionalGlobalUrls);
            console.log('Converting format...');
            // One YAML entry per crawled page; key order (url, title, content)
            // determines the order of the keys in the written file.
            const yamlData = results.map(({ title, url: pageUrl, markdown }) => ({
                url: pageUrl,
                title,
                content: markdown
            }));
            const yamlString = yaml.stringify(yamlData, {
                indent: 2,
                lineWidth: 0, // disable the line-width limit so long lines are not folded
                doubleQuotedAsJSON: false, // avoid excessive escaping
                doubleQuotedMinMultiLineLength: Infinity, // keep multi-line strings out of double quotes
                defaultStringType: 'BLOCK_LITERAL' // emit strings as block literals (|)
            });
            // Relative names resolve against the current working directory.
            const outputPath = path_1.default.resolve(process.cwd(), ensureYamlExtension(output));
            fs_1.default.writeFileSync(outputPath, yamlString);
            console.log(`Success! Results saved to: ${outputPath}`);
        }
        catch (error) {
            console.error('Error occurred:', error);
            process.exit(1);
        }
    });
// The action handler is async, so use parseAsync() and handle its rejection
// explicitly instead of leaving the promise floating.
program.parseAsync().catch((error) => {
    console.error('Error occurred:', error);
    process.exit(1);
});