url-to-markdown-cli-tool
Version:
CLI tool for converting web pages to clean, LLM-friendly markdown. Fetches content from URLs and converts HTML to optimized markdown format perfect for LLM training, RAG systems, and AI applications.
209 lines (178 loc) • 8.22 kB
JavaScript
#!/usr/bin/env node
/**
* Main CLI interface for the URL to LLM-friendly Markdown converter.
*
* JavaScript/Node.js equivalent of the Python cli.py module.
*/
const { Command } = require('commander');
const fs = require('fs').promises;
const { getPageSource } = require('./lib/pageFetcher');
const { getProcessedMarkdown } = require('./lib/markdownProcessor');
/**
* Main execution function
* @param {Object} options - CLI options object
*/
async function run(options) {
try {
// Fetch page source
const pageSource = await getPageSource(options.url, {
wait: options.wait,
headless: !options.showBrowser,
showBrowser: options.showBrowser,
disableWebSecurity: options.disableWebSecurity,
viewportWidth: options.viewportWidth,
viewportHeight: options.viewportHeight
});
if (!pageSource) {
throw new Error('Failed to fetch the page. Ensure Chrome/Chromium is installed.');
}
// Build list of tags to remove
let tagsToRemove = options.removeTags || [];
// Add clean-content tags if option is enabled
if (options.cleanContent) {
const cleanContentTags = ['nav', 'footer', 'aside', 'script', 'style', 'header', 'noscript', 'canvas'];
tagsToRemove = [...tagsToRemove, ...cleanContentTags];
}
// Process HTML to markdown
const processed = await getProcessedMarkdown(pageSource, options.url, {
keepImages: options.images !== false,
keepWebpageLinks: options.links !== false,
removeGifImage: options.gifImages === false,
removeSvgImage: options.svgImages === false,
removeTags: tagsToRemove,
includeTags: options.includeTags
});
// Output result
if (options.output) {
await fs.writeFile(options.output, processed, 'utf-8');
console.log(`Output written to: ${options.output}`);
} else {
console.log(processed);
}
} catch (error) {
console.error(`Error: ${error.message}`);
process.exit(1);
}
}
/**
* Main function to set up CLI and parse arguments
*/
function main() {
const program = new Command();
program
.name('url-to-md')
.description('Fetch URL content and output LLM-friendly markdown')
.version('1.1.0')
.argument('<url>', 'URL to fetch')
.option('-o, --output <file>', 'Write output to file instead of stdout')
.option('--no-links', 'Remove webpage links from the output')
.option('--no-images', 'Remove images from the output')
.option('--no-gif-images', 'Remove GIF images from the output')
.option('--no-svg-images', 'Remove SVG images from the output')
.option('--clean-content', 'Remove common non-content tags (nav, footer, aside, script, style, header, noscript, canvas)')
.option('--include-tags <tags...>', 'Include only specific HTML tags and their content (e.g., --include-tags article main section)')
.option('--remove-tags <tags...>', 'Remove specific HTML tags from the output (e.g., --remove-tags div span button)')
.option('--wait <seconds>', 'Seconds to wait for the page to load', parseFloat, 1.5)
.option('--show-browser', 'Show the browser window (visible mode)', false)
.option('--mobile', 'Use mobile viewport (375x667 - iPhone)')
.option('--tablet', 'Use tablet viewport (768x1024 - iPad portrait)')
.option('--desktop', 'Use desktop viewport (1920x1080 - standard desktop)')
.option('--viewport-width <width>', 'Set viewport width in pixels (320-1920)', (value) => parseInt(value, 10), 375)
.option('--viewport-height <height>', 'Set viewport height in pixels (568-1080)', (value) => parseInt(value, 10), 667)
.option('--disable-web-security', 'Disable web security (CORS) - use with caution for difficult sites', false)
.addHelpText('after', `
Examples:
Basic usage:
$ url-to-md https://example.com
Extract only article content:
$ url-to-md https://news-site.com --include-tags article
Extract main content areas:
$ url-to-md https://blog.com --include-tags main section article
Combine include-tags with remove-tags (remove ads within articles):
$ url-to-md https://news.com --include-tags article --remove-tags aside nav
Extract content and save to file:
$ url-to-md https://example.com --include-tags main -o content.md
Clean extraction with mobile viewport:
$ url-to-md https://site.com --include-tags article --clean-content --mobile
Tag Filtering:
--include-tags: Specifies which HTML tags to include in the conversion.
Only content within these tags will be processed.
--remove-tags: Removes specific tags from the output. When used with
--include-tags, removes tags within the included content.
Priority: If a tag appears in both --include-tags and --remove-tags,
--include-tags takes precedence and the tag is included.
Common Use Cases:
News articles: --include-tags article
Blog posts: --include-tags main article
Documentation: --include-tags main section
Product pages: --include-tags main --remove-tags aside nav
Clean extraction: --include-tags article --clean-content`)
.action(async (url, options) => {
// Validate URL
try {
new URL(url);
} catch (error) {
console.error(`Error: Invalid URL provided: ${url}`);
process.exit(1);
}
// Validate wait time
if (options.wait < 0) {
console.error('Error: Wait time must be non-negative');
process.exit(1);
}
// Validate include-tags option
if (options.includeTags !== undefined && (!Array.isArray(options.includeTags) || options.includeTags.length === 0)) {
console.error('Error: --include-tags requires at least one tag name');
process.exit(1);
}
// Handle viewport presets (presets override individual width/height options)
if (options.mobile) {
options.viewportWidth = 375;
options.viewportHeight = 667;
} else if (options.tablet) {
options.viewportWidth = 768;
options.viewportHeight = 1024;
} else if (options.desktop) {
options.viewportWidth = 1920;
options.viewportHeight = 1080;
}
// Validate viewport dimensions
if (options.viewportWidth < 320 || options.viewportWidth > 1920) {
console.error('Error: Viewport width must be between 320 and 1920 pixels');
process.exit(1);
}
if (options.viewportHeight < 568 || options.viewportHeight > 1080) {
console.error('Error: Viewport height must be between 568 and 1080 pixels');
process.exit(1);
}
// Check for conflicting viewport options
const presetCount = [options.mobile, options.tablet, options.desktop].filter(Boolean).length;
if (presetCount > 1) {
console.error('Error: Only one viewport preset (--mobile, --tablet, or --desktop) can be used at a time');
process.exit(1);
}
// Add URL to options object
options.url = url;
await run(options);
});
// Parse command line arguments
program.showHelpAfterError();
program.parse();
}
// Handle uncaught exceptions
process.on('uncaughtException', (error) => {
console.error(`Fatal error: ${error.message}`);
process.exit(1);
});
process.on('unhandledRejection', (reason, promise) => {
console.error('Unhandled promise rejection:', reason);
process.exit(1);
});
// Run the CLI if this file is executed directly
if (require.main === module) {
main();
}
module.exports = {
run,
main
};