markdown-crawler

Version: 1.0.17

A powerful web crawler that extracts content from web pages and converts them to clean Markdown format, with support for code blocks and GitHub Flavored Markdown

github.com/gkctou/md-crawler

gkctou/md-crawler

80 lines (79 loc) • 1.95 kB

JSON

View Raw

{
  "name": "markdown-crawler",
  "version": "1.0.17",
  "description": "A powerful web crawler that extracts content from web pages and converts them to clean Markdown format, with support for code blocks and GitHub Flavored Markdown",
  "main": "dist/index.js",
  "bin": {
    "markdown-crawler": "dist/cli.js"
  },
  "scripts": {
    "build": "tsc && chmod +x ./dist/cli.js",
    "dev": "tsc --watch",
    "clean": "rm -rf dist",
    "prebuild": "npm run clean",
    "test": "node dist/cli.js -- https://sample.site/docs/ sample.docs.yaml",
    "lint": "tsc --noEmit",
    "prepub": "npm version patch",
    "pub": "npm publish --access public"
  },
  "files": [
    "dist",
    "README.md",
    "README-*.md",
    "LICENSE"
  ],
  "keywords": [
    "web-crawler",
    "markdown",
    "html-to-markdown",
    "readability",
    "content-extraction",
    "playwright",
    "cli-tool",
    "web-scraping",
    "gfm",
    "turndown",
    "ai",
    "llm",
    "ai-content",
    "ai-processing",
    "machine-learning",
    "nlp",
    "text-extraction",
    "semantic-content",
    "structured-data",
    "clean-text"
  ],
  "author": "gkctou@gmail.com",
  "license": "MIT",
  "repository": {
    "type": "git",
    "url": "git+https://github.com/gkctou/md-crawler.git"
  },
  "bugs": {
    "url": "https://github.com/gkctou/md-crawler/issues"
  },
  "homepage": "https://github.com/gkctou/md-crawler#readme",
  "engines": {
    "node": ">=20.0.0"
  },
  "devDependencies": {
    "@types/minimatch": "^5.1.2",
    "@types/node": "^22.13.14",
    "typescript": "^5.8.2"
  },
  "dependencies": {
    "@mozilla/readability": "^0.6.0",
    "cheerio": "^1.0.0",
    "commander": "^13.1.0",
    "jsdom": "^26.0.0",
    "minimatch": "^10.0.1",
    "remark-parse": "^11.0.0",
    "remark-stringify": "^11.0.0",
    "turndown": "^7.2.0",
    "turndown-plugin-gfm": "^1.0.2",
    "unified": "^11.0.5",
    "unist-util-visit": "^5.0.0",
    "yaml": "^2.7.0"
  }
}