UNPKG

spiderable-middleware

Version:

Boost organic traffic from SERPs, AI Chat Bots, and GPT

499 lines (427 loc) 21.1 kB
var https = require('https'); var URL = require('url').URL; if (typeof window !== 'undefined') { throw new Error('Running `spiderable-middleware` in Browser environment isn\'t allowed! Please make sure `spiderable-middleware` NPM package is imported and used only in Node.js environment.'); } var keepAliveAgent = new https.Agent({ keepAlive: true, timeout: 190000, }); var nullBuf = typeof Buffer.from === 'function' ? Buffer.from('') : new Buffer(''); var STRS = { authBasic: 'Basic ', caseInsensitive: 'i', empty: '', escapedFragment: '_escaped_fragment_', get: 'get', head: 'head', pipe: '|', semicolon: ':', slash: '/', string: 'string', ua: 'user-agent', objs: { array: '[object Array]', regexp: '[object RegExp]', object: '[object Object]', string: '[object String]' }, queryArgs: { url: '/?url=', bot: '&___bot=', }, enc: { base64: 'base64', }, }; var packageDetails = { version: '2.3.0', name: 'SPIDERABLE_MIDDLEWARE', }; var RE = { newLine: /\r|\n/, proto: /^https?:\/\//i, trailingSlash: /\/$/, beginningSlash: /^\//, digit: /\d+/, staticExt: /\.(?:3ds|3g2|3gp|3gpp|7z|a|aac|aaf|adp|ai|aif|aiff|alz|ape|apk|appcache|ar|arj|asf|asx|atom|au|avchd|avi|bak|bbaw|bh|bin|bk|bmp|btif|bz2|bzip2|cab|caf|cco|cgm|class|cmx|cpio|cr2|crt|crx|css|csv|cur|dat|deb|der|dex|djvu|dll|dmg|dng|doc|docm|docx|dot|dotm|dra|drc|DS_Store|dsk|dts|dtshd|dvb|dwg|dxf|ear|ecelp4800|ecelp7470|ecelp9600|egg|eol|eot|eps|epub|exe|f4a|f4b|f4p|f4v|fbs|fh|fla|flac|fli|flv|fpx|fst|fvt|g3|geojson|gif|graffle|gz|gzip|h261|h263|h264|hqx|htc|ico|ief|img|ipa|iso|jad|jar|jardiff|jng|jnlp|jpeg|jpg|jpgv|jpm|js|jxr|key|kml|kmz|ktx|less|lha|lvp|lz|lzh|lzma|lzo|m2v|m3u|m4a|m4p|m4v|map|manifest|mar|markdown|md|mdi|mdown|mdwn|mht|mid|midi|mj2|mka|mkd|mkdn|mkdown|mkv|mml|mmr|mng|mobi|mov|movie|mp2|mp3|mp4|mp4a|mpe|mpeg|mpg|mpga|mpv|msi|msm|msp|mxf|mxu|nef|npx|nsv|numbers|o|oex|oga|ogg|ogv|opus|otf|pages|pbm|pcx|pdb|pdf|pea|pem|pgm|pic|pl|pm|png|pnm|pot|potm|potx|ppa|ppam|ppm|pps|ppsm|ppsx|ppt|pptm|pptx|prc|ps|psd|pya|pyc|pyo|pyv|qt|ra|rar|ras|raw|rdf|rgb|rip|rlc|rm|rmf|rmvb|ron|roq|rpm|rss|rtf|run|rz|s3m|s7z|safariextz|scpt|sea|sgi|shar|sil|sit|slk|smv|so|sub|svg|svgz|svi|swf|tar|tbz|tbz2|tcl|tga|tgz|thmx|tif|tiff|tk|tlz|topojson|torrent|ttc|ttf|txt|txz|udf|uvh|uvi|uvm|uvp|uvs|uvu|vcard|vcf|viv|vob|vtt|war|wav|wax|wbmp|wdp|weba|webapp|webm|webmanifest|webp|whl|wim|wm|wma|wml|wmlc|wmv|wmx|woff|woff2|wvx|xbm|xif|xla|xlam|xloc|xls|xlsb|xlsm|xlsx|xlt|xltm|xltx|xm|xmind|xml|xpi|xpm|xsl|xwd|xz|yuv|z|zip|zipx)$/i }; var isObject = function (obj) { var type = typeof obj; return type === 'function' || type === 'object' && !!obj; }; var _warn = function warn(...args) { console.warn.call(console, '[WARN] [Spiderable-Middleware]', ...args); }; var _info = function info(...args) { console.info.call(console, '[INFO] [Spiderable-Middleware]', ...args); }; /** Class representing a Spiderable */ module.exports = (function () { /** * Create a Spiderable instance * @param {object} _opts - configuration object * @param {string} _opts.serviceURL * @param {string} _opts.rootURL * @param {string} _opts.auth * @param {boolean} _opts.sanitizeUrls * @param {[string]} _opts.botsUA * @param {[string]} _opts.ignoredHeaders * @param {[string]} _opts.ignore * @param {[string|RegExp]} _opts.only * @param {RegExp} _opts.onlyRE * @param {number} _opts.timeout * @param {object} _opts.requestOptions * @param {boolean} _opts.debug */ function Spiderable(_opts) { var opts = {}; if (_opts && Object.prototype.toString.call(_opts) === STRS.objs.object) { opts = _opts; } this.NAME = packageDetails.name; this.userAgent = 'spiderable-middleware/' + packageDetails.version; this.auth = opts.auth; this.debug = opts.debug || process.env.DEBUG === 'true' || process.env.DEBUG === true || false; var ignore = opts.ignore || false; this.only = opts.only || false; this.onlyRE = opts.onlyRE || false; this.botsUA = opts.botsUA || Spiderable.prototype.botsUA; this.rootURL = opts.rootURL || process.env.ROOT_URL; this.timeout = opts.timeout || 180000; this.staticExt = opts.staticExt || RE.staticExt; this.serviceURL = opts.serviceURL || process.env.SPIDERABLE_SERVICE_URL || process.env.PRERENDER_SERVICE_URL || 'https://render.ostr.io'; this.sanitizeUrls = opts.sanitizeUrls || false; this.ignoredHeaders = opts.ignoredHeaders || Spiderable.prototype.ignoredHeaders; this.requestOptions = opts.requestOptions || {}; if (Object.prototype.toString.call(this.staticExt) !== STRS.objs.regexp) { _warn('`opts.staticExt` must be instance of RegExp, falling back to defaults.'); this.staticExt = RE.staticExt; } if (this.onlyRE && Object.prototype.toString.call(this.onlyRE) !== STRS.objs.regexp) { _warn('`opts.onlyRE` must be instance of RegExp, rules are ignored!'); this.onlyRE = false; } if (Object.prototype.toString.call(this.botsUA) !== STRS.objs.array) { _warn('`opts.botsUA` must be instance of Array, falling back to defaults.'); this.botsUA = Spiderable.prototype.botsUA; } if (Object.prototype.toString.call(this.ignoredHeaders) !== STRS.objs.array) { _warn('`opts.ignoredHeaders` must be instance of Array, falling back to defaults.'); this.ignoredHeaders = Spiderable.prototype.ignoredHeaders; } if (this.only && Object.prototype.toString.call(this.only) !== STRS.objs.array) { _warn('`opts.only` must be instance of Array, rules are ignored!'); this.only = false; } this._debug = function debug(...args) { if (this.debug) { _info('[DEBUG]', ...args); } }; if (!this.handler) { this.handler = this.middleware.bind(this); } if (!this.handle) { this.handle = (...args) => { this.middleware.call(this, ...args); return void 0; }; } this.headersRE = new RegExp('^(' + this.ignoredHeaders.join(STRS.pipe) + ')$', STRS.caseInsensitive); this.botsRE = new RegExp(this.botsUA.join(STRS.pipe), STRS.caseInsensitive); if (!this.auth) { this.auth = process.env.SPIDERABLE_SERVICE_AUTH || process.env.PRERENDER_SERVICE_AUTH || STRS.empty; } if (ignore && Object.prototype.toString.call(ignore) !== STRS.objs.array) { _warn('`opts.ignore` must be instance of Array, rules are ignored!'); ignore = false; } if (!this.rootURL) { throw new Error('{rootURL} or env variable ROOT_URL is not detected! But must be specified!'); } if (!this.serviceURL) { throw new Error('{serviceURL} or env variable SPIDERABLE_SERVICE_URL or PRERENDER_SERVICE_URL is not detected! But must be specified!'); } if (!RE.proto.test(this.rootURL)) { throw new Error('{rootURL} is malformed! Must start with http or https protocol'); } if (!RE.proto.test(this.serviceURL)) { throw new Error('{serviceURL} is malformed! Must start with http or https protocol'); } this.rootURL = this.rootURL.replace(RE.trailingSlash, STRS.empty).replace(RE.beginningSlash, STRS.empty); this.serviceURL = this.serviceURL.replace(RE.trailingSlash, STRS.empty).replace(RE.beginningSlash, STRS.empty); if (ignore) { this.ignoreRE = new RegExp(ignore.join(STRS.pipe), STRS.empty); } else { this.ignoreRE = false; } this._debug('Spiderable class initiated', this); } /** * @memberOf Spiderable * Array of bots and crawlers user agents * @name botsUA * @type {string[]} */ Spiderable.prototype.botsUA = ['\\.net crawler', '360spider', '50\\.nu', '8bo crawler bot', 'aboundex', 'accoona', 'adldxbot', 'adsbot-google', 'ahrefsbot', 'altavista', 'appengine-google', 'applebot', 'archiver', 'arielisbot', 'ask jeeves', 'auskunftbot', 'baidumobaider', 'baiduspider', 'becomebot', 'bingbot', 'bingpreview', 'bitbot', 'bitlybot', 'blitzbot', 'blogbridge', 'boardreader', 'botseer', 'catchbot', 'catchpoint bot', 'charlotte', 'checklinks', 'cliqzbot', 'clumboot', 'coccocbot', 'converacrawler', 'crawl-e', 'crawlconvera', 'dataparksearch', 'daum', 'deusu', 'developers\\.google\\.com/+/web/snippet', 'discordbot', 'dotbot', 'duckduckbot', 'elefent', 'embedly', 'evernote', 'exabot', 'facebookbot', 'facebookexternalhit', 'fatbot', 'fdse robot', 'feed seeker bot', 'feedfetcher', 'femtosearchbot', 'findlinks', 'flamingo_searchengine', 'flipboard', 'followsite bot', 'furlbot', 'fyberspider', 'gaisbot', 'galaxybot', 'geniebot', 'genieo', 'gigablast', 'gigabot', 'girafabot', 'gomezagent', 'gonzo1', 'google sketchup', 'google-structured-data-testing-tool', 'googlebot', 'haosouspider', 'heritrix', 'holmes', 'hoowwwer', 'htdig', 'ia_archiver', 'idbot', 'infuzapp', 'innovazion crawler', 'internetarchive', 'iqdb', 'iskanie', 'istellabot', 'izsearch\\.com', 'kaloogabot', 'kaz\\.kz_bot', 'kd bot', 'konqueror', 'kraken', 'kurzor', 'larbin', 'leia', 'lesnikbot', 'linguee bot', 'linkaider', 'linkapediabot', 'linkedinbot', 'lite bot', 'llaut', 'lookseek', 'lycos', 'mail\\.ru_bot', 'masidani_bot', 'masscan', 'mediapartners-google', 'metajobbot', 'mj12bot', 'mnogosearch', 'mogimogi', 'mojeekbot', 'motominerbot', 'mozdex', 'msiecrawler', 'msnbot', 'msrbot', 'netpursual', 'netresearch', 'netvibes', 'newsgator', 'ng-search', 'nicebot', 'nutchcvs', 'nuzzel', 'nymesis', 'objectssearch', 'odklbot', 'omgili', 'oovoo', 'oozbot', 'openfosbot', 'orangebot', 'orbiter', 'org_bot', 'outbrain', 'pagepeeker', 'pagesinventory', 'parsijoobot', 'paxleframework', 'peeplo screenshot bot', 'pinterest', 'plantynet_webrobot', 'plukkie', 'pompos', 'psbot', 'quora link preview', 'qwantify', 'read%20later', 'reaper', 'redcarpet', 'redditbot', 'retreiver', 'riddler', 'rival iq', 'rogerbot', 'saucenao', 'scooter', 'scrapy', 'scrubby', 'searchie', 'searchsight', 'seekbot', 'semanticdiscovery', 'seznambot', 'showyoubot', 'simplepie', 'simpy', 'sitelockspider', 'skypeuripreview', 'slack-imgproxy', 'slackbot', 'slurp', 'snappy', 'sogou', 'solofield', 'speedy spider', 'speedyspider', 'sputnikbot', 'stackrambler', 'teeraidbot', 'teoma', 'theusefulbot', 'thumbshots\\.ru', 'thumbshotsbot', 'tineye', 'toweya\\.com', 'toweyabot', 'tumblr', 'tweetedtimes', 'tweetmemebot', 'twitterbot', 'url2png', 'vagabondo', 'vebidoobot', 'viber', 'visionutils', 'vkshare', 'voilabot', 'vortex', 'votay bot', 'voyager', 'w3c_validator', 'wasalive\\.bot', 'web-sniffer', 'websquash\\.com', 'webthumb', 'whatsapp', 'whatweb', 'wire', 'wotbox', 'yacybot', 'yahoo', 'yandex', 'yeti', 'yisouspider', 'yodaobot', 'yooglifetchagent', 'yoozbot', 'yottaamonitor', 'yowedo', 'zao-crawler', 'zebot_www\\.ze\\.bz', 'zooshot', 'zyborg', 'ai2bot', 'amazonbot', 'anthropic\\.com', 'bard', 'bytespider', 'ccbot', 'chatgpt-user', 'claude-web', 'claudebot', 'cohere-ai', 'deepseek', 'diffbot', 'duckassistbot', 'gemini', 'google-extended', 'gptbot', 'grok', 'meta-external', 'mistralai', 'oai-searchbot', 'omgili', 'openai\\.com', 'perplexity\\.ai', 'perplexitybot', 'xai', 'youbot']; /** * @memberOf Spiderable * Array of ignored headers * @name ignoredHeaders * @type {string[]} */ Spiderable.prototype.ignoredHeaders = ['age', 'alt-svc', 'cache-status', 'cf-connecting-ip', 'cf-ipcountry', 'cf-cache-status', 'cf-ray', 'cf-request-id', 'cnection', 'cneonction', 'connection', 'content-encoding', 'content-length', 'date', 'etag', 'expect-ct', 'expires', 'keep-alive', 'last-modified', 'link', 'nel', 'nncoection', 'pragma', 'server', 'set-cookie', 'status', 'transfer-encoding', 'report-to', 'vary', 'via', 'www-authenticate', 'x-accel-buffering', 'x-accel-charset', 'x-accel-expires', 'x-accel-limit-rate', 'x-accel-redirect', 'x-ostrio-domain', 'x-powered-by', 'x-preprender-status', 'x-prerender-status', 'x-real-ip', 'x-runtime']; /** * @memberOf Spiderable * Get complete URL to send request to rendering endpoint * @name getServiceURL * @param {URL} urlObj - URL instance returned from `getRequestURL()` method * @param {string|undefined} bua - Bot User Agent string from request headers * @returns {string} */ Spiderable.prototype.getServiceURL = function (urlObj, bua) { var botUA = bua ? bua : ''; var reqUrl = this.rootURL; reqUrl += STRS.slash + urlObj.pathname.replace(RE.beginningSlash, STRS.empty) + urlObj.search; reqUrl = (this.serviceURL + STRS.queryArgs.url + encodeURIComponent(reqUrl)); if (typeof botUA === STRS.string && botUA.length) { reqUrl += STRS.queryArgs.bot + encodeURIComponent(botUA); } this._debug('[getServiceURL] URL:', reqUrl); return reqUrl; }; /** * @memberOf Spiderable * Check request and return complete origin URL if valid * @name getRequestURL * @param {IncomingMessage} req - Original IncomingMessage of node.js server * @returns {string|false} */ Spiderable.prototype.getRequestURL = function (req) { var path = req.url; if (this.sanitizeUrls) { path = path.replace(/\/+/g, STRS.slash); } var urlObj; try { urlObj = new URL(path, this.rootURL); } catch (e) { // BAD URL IS PASSED! // IGNORING AND PASSING DOWN TO THE APP this._debug('[getRequestURL] [bad url]', path, req.url, e); return false; } urlObj.pathname = urlObj.pathname.replace(RE.beginningSlash, STRS.empty); var escapedFragment = urlObj.searchParams.has(STRS.escapedFragment) ? urlObj.searchParams.get(STRS.escapedFragment) : false; if (escapedFragment && typeof escapedFragment === STRS.string) { urlObj.searchParams.delete(STRS.escapedFragment); if (escapedFragment.length) { urlObj.pathname += STRS.slash + (escapedFragment.replace(RE.beginningSlash, STRS.empty)); } } this._debug('[getRequestURL] URL:', urlObj.toString()); if (!this.botsRE.test(req.headers[STRS.ua] || STRS.empty) && escapedFragment === false) { return false; } return urlObj; }; /** * @memberOf Spiderable * Check request and return complete origin URL if valid * @name middleware * @param {IncomingMessage} req - Original IncomingMessage of node.js server * @param {ServerResponse} res - Writable ServerResponse * @param {function} next - Function that will be called to skip this middleware and process the next one * @returns {boolean} */ Spiderable.prototype.middleware = function (req, res, next) { if (this.NAME !== packageDetails.name) { _warn('middleware has lost its context, ensure it\'s binds back, for example: `spiderable.handle.bind(spiderable)`'); return false; } var method = req.method.toLowerCase(); var self = this; if (method !== STRS.get && method !== STRS.head) { next(); return false; } var urlObj = this.getRequestURL(req); if (!urlObj) { next(); return false; } var hasIgnored = false; var hasOnly = false; if (this.staticExt.test(urlObj.pathname)) { next(); return false; } if (this.onlyRE) { hasOnly = this.onlyRE.test(urlObj.pathname); hasIgnored = !hasOnly; } if (!hasOnly && this.only) { hasIgnored = true; for (var i = 0; i < this.only.length; i++) { if (Object.prototype.toString.call(this.only[i]) === STRS.objs.string) { if (this.only[i] === urlObj.pathname) { hasIgnored = false; hasOnly = true; break; } } else if (Object.prototype.toString.call(this.only[i]) === STRS.objs.regexp) { if (this.only[i].test(urlObj.pathname)) { hasIgnored = false; hasOnly = true; break; } } else { _warn('`opts.only` {' + this.only[i] + '} rule isn\'t instance of {String} nor {RegExp}, rule ignored!'); } } } if (this.ignoreRE && this.ignoreRE.test(urlObj.pathname)) { hasIgnored = true; } if (hasIgnored) { this._debug('[middleware] [hasIgnored]', urlObj.pathname); next(); return false; } var reqUrl = this.getServiceURL(urlObj, req.headers[STRS.ua]); var reqHeaders = { 'User-Agent': this.userAgent, Accept: '*/*', }; if (this.auth) { reqHeaders.Authorization = STRS.authBasic + Buffer.from(this.auth).toString(STRS.enc.base64); } var payload = { method: method.toUpperCase(), headers: reqHeaders, agent: keepAliveAgent, }; var requestOptionsKeys = Object.keys(this.requestOptions); if (requestOptionsKeys.length > 0) { for (var ii = 0; ii < requestOptionsKeys.length; ii++) { if (isObject(payload[requestOptionsKeys[ii]]) && isObject(this.requestOptions[requestOptionsKeys[ii]])) { payload[requestOptionsKeys[ii]] = Object.assign({}, payload[requestOptionsKeys[ii]], this.requestOptions[requestOptionsKeys[ii]]); } else { payload[requestOptionsKeys[ii]] = this.requestOptions[requestOptionsKeys[ii]]; } } } try { var _headersRE = this.headersRE; var url = new URL(reqUrl); this._debug('[middleware] [requesting]', url.toString()); var serviceReq = https.request(url, payload, function (resp) { for (var _hName in resp.headers) { if (resp.headers[_hName]) { var hName = _hName.toLowerCase(); if (!res.headersSent && !_headersRE.test(hName)) { res.setHeader(hName, resp.headers[_hName]); } } } if (resp.statusCode === 401 || resp.statusCode === 403) { _warn('Can\'t authenticate! Please check "auth" parameter and other settings'); } if (method === STRS.head) { self._debug('[middleware] [request] [HEAD] [received and end]', resp.statusCode); res.writeHead(resp.statusCode); res.end(); return; } if (!res.headersSent) { self._debug('[middleware] [request] [writeHead]', resp.statusCode); res.writeHead(resp.statusCode); } res.write(nullBuf); resp.on('data', function (data) { self._debug('[middleware] [response] [data received]', res.finished, res.writableEnded); if (!res.finished && !res.writableEnded) { res.write(data); } }); resp.on('end', function (data) { self._debug('[middleware] [response] [end successfully]', res.finished, res.writableEnded); if (!res.finished && !res.writableEnded) { res.end(data); } }); }); var isEnded = false; var onEnd = function (error) { self._debug('[middleware] [onEnd]', error); if (isEnded) { return; } if (error) { // DO NOT THROW AN ERROR ABOUT ABORTED REQUESTS if (!req.writableEnded && !req.aborted && !req.destroyed && error.statusCode !== 499) { _warn('Error while connecting to external service:', error); next(); return; } } if (!res.headersSent) { res.writeHead(200); } if (!res.finished && !res.writableEnded) { res.end(); } if (!serviceReq.writableEnded && !serviceReq.aborted && !serviceReq.destroyed) { serviceReq.end(); } isEnded = true; }; serviceReq.on('abort', onEnd); serviceReq.on('error', onEnd); serviceReq.on('timeout', onEnd); serviceReq.setNoDelay(true); serviceReq.setTimeout(this.timeout, onEnd); req.on('error', function (error) { _warn('[REQ] ["error" event] Unexpected error:', error); if (!req.aborted) { serviceReq.destroy(); next(); } }); res.on('error', function (error) { _warn('[RES] ["error" event] Unexpected error:', error); serviceReq.destroy(); next(); }); req.on('aborted', function () { self._debug('[middleware] [req.aborted]', arguments); // No need to log this event as nothing bad happened // this simply means host which sent this request // has aborted the connection or got disconnected req.aborted = true; serviceReq.destroy(); try { res.end(); } catch (_e) { // We assume res.end() to throw an error // but still will call for it, as we'd like // to make sure memory, socket and session // are freed-up and closed } }); // SET TIMEOUT AS A PRECAUTION res.setTimeout(this.timeout, onEnd); req.setTimeout(this.timeout, onEnd); serviceReq.setTimeout(this.timeout, onEnd); // SEND REQUEST TO PRERENDERING ENDPOINT serviceReq.end(); } catch (e) { _warn('Exception while connecting to external service:', e); next(); return false; } return true; }; return Spiderable; })();