bquery
Version:
bquery is a useful Node module for fetching web pages; it uses CSS selectors to extract and structure the HTML page content.
78 lines (77 loc) • 4.65 kB
JSON
{
"name": "bquery",
"version": "0.4.0",
"description": "bquery is a useful Node module for fetching web pages; it uses CSS selectors to extract and structure the HTML page content.",
"main": "./lib/bquery",
"bin": {
"noodle": "./bin/bquery-server"
},
"dependencies": {
"connect": "~2.3.5",
"connect-ratelimit": "0.0.5",
"JSONSelect": "0.4.0",
"feedparser": "0.10.7",
"moment": "1.7.2",
"cheerio": "0.14.0",
"request": "2.11.4",
"q": "0.8.9",
"xml2json": "0.9.0",
"underscore": "1.4.2",
"mocha": "1.7.4",
"chai": "1.4.2",
"colors": "0.6.0-1",
"req-fast": "*",
"urijs": "*",
"html-to-text": "1.3.0",
"underscore": "*"
},
"devDependencies": {},
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1",
"start": "bin/bquery-server"
},
"keywords": [
"scraper",
"proxy",
"cross-domain",
"cross domain",
"selectors",
"JSONSelect",
"json",
"html",
"rate limit"
],
"author": {
"name": "BeyondLink"
},
"license": "BSD",
"directories": {
"doc": "docs",
"test": "tests"
},
"_id": "bquery@0.3.1",
"_shasum": "60498d96a152264b874e69a6219e30e435501169",
"_from": "bquery@0.3.0",
"_npmVersion": "1.4.21",
"_npmUser": {
"name": "rickjose",
"email": "rickshawjose@gmail.com"
},
"maintainers": [
{
"name": "rickjose",
"email": "rickshawjose@gmail.com"
}
],
"dist": {
"shasum": "07419e360a8d2efcb524b5038591bd1c2a33aae5",
"tarball": "https://registry.npmjs.org/bquery/-/bquery-0.2.1.tgz"
},
"_npmOperationalInternal": {
"host": "packages-12-west.internal.npmjs.com",
"tmp": "tmp/bquery-0.2.1.tgz_1457508484833_0.6001792382448912"
},
"_resolved": "https://registry.npmjs.org/bquery/-/bquery-0.2.1.tgz",
"readme": "# bquery\n\nA quick, simple and elegant way to fetch a web document and structure it.\n\n## Installation\n\nLatest release:\n\n $ npm install bquery\n\n\n```js\nvar bquery = require(\"bquery\");\nbquery.query({\n \"url\": \"https://github.com/\",\n \"selector\": \"ul.header-nav.left>li\",\n \"extract\": {\n \"title\":{},\n \"url\": {\n \"selector\": \"a\",\n \"extract\": \"href\"\n }\n }\n}).then(function(docs){\n console.log(docs);\n //=> {\"results\":[{\"result\":[{\"title\":\"Explore\",\"url\":\"https://github.com/explore\"},{\"title\":\"Features\",\"url\":\"https://github.com/features\"},{\"title\":\"Enterprise\",\"url\":\"https://enterprise.github.com/\"},{\"title\":\"Blog\",\"url\":\"https://github.com/blog\"}]}]}\n})\n```\n\n\n## Options\nbquery can automatically detect the web document's charset, but in special circumstances you can also set the document's charset explicitly.\n\n```js\nvar bquery = require(\"bquery\");\nbquery.query({\n \"url\": \"https://github.com/\",\n \"selector\": \"ul.header-nav.left>li\",\n \"charset\": \"utf-8\",\n \"extract\": {\n \"title\":{},\n \"url\": {\n \"selector\": \"a\",\n \"extract\": \"href\"\n }\n }\n}).then(function(docs){\n console.log(docs);\n})\n```\n\n\nYou can also set the timeout period for the request.\n```js\nbquery.query({\n \"url\": \"https://github.com/\",\n \"selector\": \"ul.header-nav.left>li>a\",\n \"timeout\": 3000\n});\n```\n\n\nSometimes you need to modify the page content, like CSS, JavaScript or other content, before you fetch the document content.\nYou can use the \"preSelect\" option.\n```js\nbquery.query({\n \"url\": \"https://github.com/\",\n \"selector\": \"ul.header-nav.left>li\",\n \"preSelect\": function($){ //=> $ is a cheerio object, so you can perform any operation that cheerio supports\n $(\"ul.header-nav.left>li\").each(function(i, elem){\n if($(\"a\", elem).text() == \"Explore\"){\n $(elem).remove()\n }\n });\n },\n \"extract\": {\n \"title\":{},\n \"url\": {\n \"selector\": \"a\",\n \"extract\": \"href\"\n }\n }\n}).then(function(docs){\n console.log(docs); \n //=>[\n //=> { title: 'Features', url: 'https://github.com/features' },\n //=> { title: 'Enterprise', url: 'https://enterprise.github.com/' },\n //=> { title: 'Blog', url: 'https://github.com/blog' } \n //=>]\n})\n```\n\n\nYou can also use a callback to modify the selected attribute.\n```js\n{\n \"url\": \"https://github.com/\",\n \"selector\": \"ul.header-nav.left>li\",\n \"extract\": {\n \"title\":{\n \"extract\": \"text\",\n \"callback\": function(txt){\n return \"foo_\" + txt;\n }\n },\n \"url\": {\n \"selector\": \"a\",\n \"extract\": \"href\"\n }\n }\n}\n```\n\n\n",
"readmeFilename": "README.md"
}