sitemapper
Version:
Parser for XML Sitemaps to be used with Robots.txt and web crawlers
2 lines • 4.97 kB
JavaScript
import { XMLParser } from "fast-xml-parser";
import got from "got";
import zlib from "zlib";
import pLimit from "p-limit";

/**
 * Advances a generator by one step and wires the outcome into a promise.
 *
 * Calls `generator[method](argument)`. If that call throws, `reject` is invoked
 * with the thrown value. If the generator is done, `resolve` receives the final
 * value; otherwise the yielded value is awaited and the result is fed back in
 * through `onNext` (fulfilled) or `onThrow` (rejected).
 *
 * @param {Generator} generator - The running generator object.
 * @param {Function} resolve - Settles the outer promise with the return value.
 * @param {Function} reject - Rejects the outer promise.
 * @param {Function} onNext - Resumes the generator with a fulfilled value.
 * @param {Function} onThrow - Resumes the generator by throwing into it.
 * @param {"next"|"throw"} method - Generator method to invoke for this step.
 * @param {*} argument - Value passed to the generator method.
 * @returns {void}
 */
function asyncGeneratorStep(generator, resolve, reject, onNext, onThrow, method, argument) {
  let step;
  let value;
  try {
    step = generator[method](argument);
    value = step.value;
  } catch (error) {
    reject(error);
    return;
  }
  if (step.done) {
    resolve(value);
  } else {
    Promise.resolve(value).then(onNext, onThrow);
  }
}

/**
 * Wraps a generator function so that calling it returns a promise, treating
 * each `yield` as an await point.
 *
 * @param {GeneratorFunction} generatorFunction - The generator function to drive.
 * @returns {Function} A function returning a promise for the generator's
 *   return value; `this` and the arguments are forwarded to the generator.
 */
function _asyncToGenerator(generatorFunction) {
  return function () {
    const context = this;
    const args = arguments;
    return new Promise(function (resolve, reject) {
      // Created inside the executor so a synchronous throw rejects the promise.
      const generator = generatorFunction.apply(context, args);
      function onNext(value) {
        asyncGeneratorStep(generator, resolve, reject, onNext, onThrow, "next", value);
      }
      function onThrow(error) {
        asyncGeneratorStep(generator, resolve, reject, onNext, onThrow, "throw", error);
      }
      onNext(undefined);
    });
  };
}

/**
 * Fetches XML sitemaps (following sitemap index files) and collects the URLs
 * they list.
 */
export default class Sitemapper {
  /**
   * @param {Object} [options] - Configuration; every key is optional.
   * @param {string} [options.url] - Default sitemap URL.
   * @param {number} [options.timeout=15000] - Milliseconds before a request is cancelled.
   * @param {number} [options.lastmod=0] - Minimum `lastmod` (epoch ms); 0 disables the filter.
   * @param {Object} [options.requestHeaders] - Headers sent with each request.
   * @param {boolean} [options.debug] - Enables console diagnostics.
   * @param {number} [options.concurrency=10] - Maximum concurrent child sitemap crawls.
   * @param {number} [options.retries=0] - Retry attempts per sitemap.
   * @param {boolean} [options.rejectUnauthorized=true] - Only an explicit `false` disables it.
   * @param {Object|false} [options.fields=false] - Map of field name to flag; when set,
   *   sites are returned as objects holding the flagged fields instead of bare `loc` values.
   * @param {Object} [options.proxyAgent={}] - Passed to the HTTP client as `agent`.
   * @param {Array} [options.exclusions=[]] - Patterns (objects with `.test`) for URLs to skip.
   */
  constructor(options) {
    const settings = options || { requestHeaders: {} };
    this.url = settings.url;
    this.timeout = settings.timeout || 15000;
    this.timeoutTable = {};
    this.lastmod = settings.lastmod || 0;
    this.requestHeaders = settings.requestHeaders;
    this.debug = settings.debug;
    this.concurrency = settings.concurrency || 10;
    this.retries = settings.retries || 0;
    this.rejectUnauthorized = settings.rejectUnauthorized !== false;
    this.fields = settings.fields || false;
    this.proxyAgent = settings.proxyAgent || {};
    this.exclusions = settings.exclusions || [];
  }

  /**
   * Crawls a sitemap and resolves with everything found.
   *
   * @param {string} [url=this.url] - Sitemap URL to crawl.
   * @returns {Promise<{url: string, sites: Array, errors: Array}>}
   */
  fetch(url = this.url) {
    const self = this;
    return _asyncToGenerator(function* () {
      let results = { url: "", sites: [], errors: [] };

      if (self.debug && self.lastmod) {
        console.debug(`Using minimum lastmod value of ${self.lastmod}`);
      }

      try {
        // Keep the empty default if crawl() (e.g. an override) yields nothing,
        // so the property reads below cannot throw.
        results = (yield self.crawl(url)) || results;
      } catch (error) {
        if (self.debug) {
          console.error(error);
        }
      }

      return {
        url,
        sites: results.sites || [],
        errors: results.errors || [],
      };
    })();
  }

  // NOTE(review): every static accessor below reads or writes the same-named
  // property on the class itself, i.e. it re-enters itself and recurses without
  // bound when used. Instances are unaffected because the constructor assigns
  // own properties. Left unchanged — the intended behaviour needs confirming.
  static get timeout() {
    return this.timeout;
  }

  static set timeout(duration) {
    this.timeout = duration;
  }

  static get lastmod() {
    return this.lastmod;
  }

  static set lastmod(timestamp) {
    this.lastmod = timestamp;
  }

  static set url(url) {
    this.url = url;
  }

  static get url() {
    return this.url;
  }

  static set debug(option) {
    this.debug = option;
  }

  static get debug() {
    return this.debug;
  }

  /**
   * Downloads one sitemap and parses it as XML.
   *
   * Never rejects for request/parse failures: those are reported through the
   * `error` string of the resolved object, with the underlying response or
   * error object in `data`.
   *
   * @param {string} [url=this.url] - Sitemap URL to download.
   * @returns {Promise<{error: (string|null), data: *}>}
   */
  parse(url = this.url) {
    const self = this;
    return _asyncToGenerator(function* () {
      const requestOptions = {
        method: "GET",
        decompress: true,
        responseType: "buffer",
        headers: self.requestHeaders,
        https: { rejectUnauthorized: self.rejectUnauthorized },
        agent: self.proxyAgent,
      };

      try {
        const requester = got.get(url, requestOptions);
        self.initializeTimeout(url, requester);
        const response = yield requester;

        if (!response || response.statusCode !== 200) {
          const statusCode = response ? response.statusCode : 0;
          const statusMessage = response ? response.statusMessage : "No response";
          return {
            error: `HTTP Error: ${statusCode} ${statusMessage}`,
            data: response,
          };
        }

        let body = response.body;
        // 0x1f 0x8b is the gzip magic number: the payload itself is gzipped
        // (e.g. a .xml.gz file), independent of transport compression.
        if (response.body.length > 2 && response.body[0] === 0x1f && response.body[1] === 0x8b) {
          body = zlib.gunzipSync(response.body);
        }

        const parser = new XMLParser({
          // Force arrays so single-entry sitemaps have the same shape as multi-entry ones.
          isArray: (tagName) => ["sitemap", "url"].includes(tagName),
          removeNSPrefix: true,
        });
        const data = parser.parse(body.toString());

        return { error: null, data };
      } catch (error) {
        if (error.name === "CancelError") {
          return {
            error: `Request timed out after ${self.timeout} milliseconds for url: '${url}'`,
            data: error,
          };
        }
        if (error.name === "HTTPError") {
          return {
            error: `HTTP Error occurred: ${error.message}`,
            data: error,
          };
        }
        return {
          error: `Error occurred: ${error.name}`,
          data: error,
        };
      } finally {
        // Always release the cancellation timer, so a direct parse() call does
        // not leave it pending after the request has settled.
        clearTimeout(self.timeoutTable[url]);
      }
    })();
  }

  /**
   * Arms a timer that cancels the request after `this.timeout` milliseconds
   * and stores the timer handle in `this.timeoutTable` under the URL.
   *
   * @param {string} url - Key for the timer handle.
   * @param {{cancel: Function}} requester - In-flight request to cancel.
   * @returns {void}
   */
  initializeTimeout(url, requester) {
    this.timeoutTable[url] = setTimeout(() => requester.cancel(), this.timeout);
  }

  /**
   * Parses a sitemap and, for sitemap index files, recursively crawls the
   * listed child sitemaps. Retries up to `this.retries` times on failure.
   *
   * Always resolves with a `{sites, errors}` object, including when an
   * unexpected exception is raised while processing.
   *
   * @param {string} url - Sitemap URL to crawl.
   * @param {number} [retryIndex=0] - Number of attempts already made.
   * @returns {Promise<{sites: Array, errors: Array}>}
   */
  crawl(url, retryIndex = 0) {
    const self = this;
    return _asyncToGenerator(function* () {
      try {
        const { error, data } = yield self.parse(url);
        clearTimeout(self.timeoutTable[url]);

        // `data` can be undefined (parse()'s "No response" path); guard the lookup.
        const dataName = data == null ? undefined : data.name;

        if (error) {
          if (retryIndex < self.retries) {
            if (self.debug) {
              console.log(
                `(Retry attempt: ${retryIndex + 1} / ${self.retries}) ${url} due to ${dataName} on previous request`
              );
            }
            return self.crawl(url, retryIndex + 1);
          }

          if (self.debug) {
            console.error(`Error occurred during "crawl('${url}')":\n\r Error: ${error}`);
          }
          return {
            sites: [],
            errors: [{ type: dataName, message: error, url, retries: retryIndex }],
          };
        }

        if (data && data.urlset && data.urlset.url) {
          if (self.debug) {
            console.debug(`Urlset found during "crawl('${url}')"`);
          }

          const entries = Array.isArray(data.urlset.url) ? data.urlset.url : [data.urlset.url];
          const sites = entries
            .filter((entry) => {
              if (self.lastmod === 0) {
                return true;
              }
              if (entry.lastmod === undefined) {
                return false;
              }
              return new Date(entry.lastmod).getTime() >= self.lastmod;
            })
            .filter((entry) => !self.isExcluded(entry.loc))
            .map((entry) => {
              if (!self.fields) {
                return entry.loc;
              }
              const selected = {};
              if (self.fields.sitemap) {
                selected.sitemap = url;
              }
              for (const [field, enabled] of Object.entries(self.fields)) {
                if (enabled && entry[field]) {
                  selected[field] = entry[field];
                }
              }
              return selected;
            });

          return { sites, errors: [] };
        }

        if (data && data.sitemapindex) {
          if (self.debug) {
            console.debug(`Additional sitemap found during "crawl('${url}')"`);
          }

          const childUrls = data.sitemapindex.sitemap
            .map((child) => child.loc)
            .filter((childUrl) => !self.isExcluded(childUrl));
          const limit = pLimit(self.concurrency);
          const pending = childUrls.map((childUrl) => limit(() => self.crawl(childUrl)));
          const results = yield Promise.all(pending);

          // NOTE(review): a child result that carries any error contributes only
          // its errors; its sites are dropped. Preserved as-is — confirm intent.
          const sites = [];
          const errors = [];
          for (const result of results) {
            if (result.errors.length === 0) {
              sites.push(...result.sites);
            } else {
              errors.push(...result.errors);
            }
          }
          return { sites, errors };
        }

        // Parsed successfully but is neither a urlset nor a sitemap index.
        if (retryIndex < self.retries) {
          if (self.debug) {
            console.log(
              `(Retry attempt: ${retryIndex + 1} / ${self.retries}) ${url} due to ${dataName} on previous request`
            );
          }
          return self.crawl(url, retryIndex + 1);
        }

        if (self.debug) {
          console.error(`Unknown state during "crawl('${url})'":`, error, data);
        }
        return {
          sites: [],
          errors: [
            {
              url,
              type: dataName || "UnknownStateError",
              message: "An unknown error occurred.",
              retries: retryIndex,
            },
          ],
        };
      } catch (exception) {
        if (self.debug) {
          console.error(exception);
        }
        // Report the failure instead of resolving with undefined, which made
        // callers (fetch() and parent index crawls) throw on property access.
        return {
          sites: [],
          errors: [
            {
              url,
              type: (exception && exception.name) || "Error",
              message: (exception && exception.message) || String(exception),
              retries: retryIndex,
            },
          ],
        };
      }
    })();
  }

  /**
   * Deprecated callback-style wrapper around `fetch()`.
   *
   * @deprecated Use `fetch()` instead.
   * @param {string} [url=this.url] - Sitemap URL to crawl.
   * @param {Function} callback - Invoked as `callback(err, sites)`; `err` is an
   *   empty object when `fetch()` did not reject.
   * @returns {Promise<*>} Resolves with whatever the callback returns.
   */
  getSites(url = this.url, callback) {
    const self = this;
    return _asyncToGenerator(function* () {
      console.warn(
        "\r\nWarning:",
        "function .getSites() is deprecated, please use the function .fetch()\r\n"
      );

      let err = {};
      let sites = [];
      try {
        const response = yield self.fetch(url);
        sites = response.sites;
      } catch (error) {
        err = error;
      }
      return callback(err, sites);
    })();
  }

  /**
   * Reports whether a URL matches any configured exclusion pattern.
   *
   * @param {string} url - URL to check.
   * @returns {boolean} `true` when at least one exclusion's `.test(url)` passes.
   */
  isExcluded(url) {
    if (this.exclusions.length === 0) {
      return false;
    }
    return this.exclusions.some((pattern) => {
      // Global/sticky regexes keep `lastIndex` between .test() calls, which
      // would make matching depend on previously tested URLs.
      if (pattern instanceof RegExp && (pattern.global || pattern.sticky)) {
        pattern.lastIndex = 0;
      }
      return pattern.test(url);
    });
  }
}
//# sourceMappingURL=sitemapper.js.map