UNPKG

bquery

Version:

bquery is a useful Node module that fetches web pages and uses CSS selectors to extract and structure their HTML content.

78 lines (77 loc) 4.65 kB
{ "name": "bquery", "version": "0.4.0", "description": "bquery is a useful Node module that fetches web pages and uses CSS selectors to extract and structure their HTML content.", "main": "./lib/bquery", "bin": { "noodle": "./bin/bquery-server" }, "dependencies": { "connect": "~2.3.5", "connect-ratelimit": "0.0.5", "JSONSelect": "0.4.0", "feedparser": "0.10.7", "moment": "1.7.2", "cheerio": "0.14.0", "request": "2.11.4", "q": "0.8.9", "xml2json": "0.9.0", "underscore": "1.4.2", "mocha": "1.7.4", "chai": "1.4.2", "colors": "0.6.0-1", "req-fast": "*", "urijs": "*", "html-to-text": "1.3.0", "underscore": "*" }, "devDependencies": {}, "scripts": { "test": "echo \"Error: no test specified\" && exit 1", "start": "bin/bquery-server" }, "keywords": [ "scraper", "proxy", "cross-domain", "cross domain", "selectors", "JSONSelect", "json", "html", "rate limit" ], "author": { "name": "BeyondLink" }, "license": "BSD", "directories": { "doc": "docs", "test": "tests" }, "_id": "bquery@0.3.1", "_shasum": "60498d96a152264b874e69a6219e30e435501169", "_from": "bquery@0.3.0", "_npmVersion": "1.4.21", "_npmUser": { "name": "rickjose", "email": "rickshawjose@gmail.com" }, "maintainers": [ { "name": "rickjose", "email": "rickshawjose@gmail.com" } ], "dist": { "shasum": "07419e360a8d2efcb524b5038591bd1c2a33aae5", "tarball": "https://registry.npmjs.org/bquery/-/bquery-0.2.1.tgz" }, "_npmOperationalInternal": { "host": "packages-12-west.internal.npmjs.com", "tmp": "tmp/bquery-0.2.1.tgz_1457508484833_0.6001792382448912" }, "_resolved": "https://registry.npmjs.org/bquery/-/bquery-0.2.1.tgz", "readme": "# bquery\n\nA quick, simple, and elegant way to fetch a web document and structure it.\n\n## Installation\n\nLatest release:\n\n $ npm install bquery\n\n\n```js\nvar bquery = require(\"bquery\");\nbquery.query({\n \"url\": \"https://github.com/\",\n \"selector\": \"ul.header-nav.left>li\",\n \"extract\": {\n \"title\":{},\n \"url\": {\n \"selector\": \"a\",\n \"extract\": \"href\"\n }\n 
}\n}).then(function(docs){\n console.log(docs);\n //=> {\"results\":[{\"result\":[{\"title\":\"Explore\",\"url\":\"https://github.com/explore\"},{\"title\":\"Features\",\"url\":\"https://github.com/features\"},{\"title\":\"Enterprise\",\"url\":\"https://enterprise.github.com/\"},{\"title\":\"Blog\",\"url\":\"https://github.com/blog\"}]}]}\n})\n```\n\n\n## Options\nbquery can automatically detect the web document's charset, but in special circumstances you can also set the document's charset explicitly.\n\n```js\nvar bquery = require(\"bquery\");\nbquery.query({\n \"url\": \"https://github.com/\",\n \"selector\": \"ul.header-nav.left>li\",\n \"charset\": \"utf-8\",\n \"extract\": {\n \"title\":{},\n \"url\": {\n \"selector\": \"a\",\n \"extract\": \"href\"\n }\n }\n}).then(function(docs){\n console.log(docs);\n})\n```\n\n\nYou can also set the timeout period for the request.\n```js\nbquery.query({\n \"url\": \"https://github.com/\",\n \"selector\": \"ul.header-nav.left>li>a\",\n \"timeout\": 3000\n});\n```\n\n\nSometimes you need to modify the page content, such as CSS, JavaScript, or other content, before you fetch the document content. 
You can use the \"preSelect\" option.\n```js\nbquery.query({\n \"url\": \"https://github.com/\",\n \"selector\": \"ul.header-nav.left>li\",\n \"preSelect\": function($){ //=> $ is a cheerio object; you can perform any cheerio-based operation on it\n $(\"ul.header-nav.left>li\").each(function(i, elem){\n if($(\"a\", elem).text() == \"Explore\"){\n $(elem).remove()\n }\n });\n },\n \"extract\": {\n \"title\":{},\n \"url\": {\n \"selector\": \"a\",\n \"extract\": \"href\"\n }\n }\n}).then(function(docs){\n console.log(docs); \n //=>[\n //=> { title: 'Features', url: 'https://github.com/features' },\n //=> { title: 'Enterprise', url: 'https://enterprise.github.com/' },\n //=> { title: 'Blog', url: 'https://github.com/blog' } \n //=>]\n})\n```\n\n\nYou can also use a callback to modify the selected attribute.\n```js\n{\n \"url\": \"https://github.com/\",\n \"selector\": \"ul.header-nav.left>li\",\n \"extract\": {\n \"title\":{\n \"extract\": \"text\",\n \"callback\": function(txt){\n return \"foo_\" + txt;\n }\n },\n \"url\": {\n \"selector\": \"a\",\n \"extract\": \"href\"\n }\n }\n}\n```\n\n\n", "readmeFilename": "README.md" }