/*
 * nurlresolver
 * Version: (not specified)
 * URL resolver for Node.js
 * 173 lines • 6.73 kB
 * JavaScript
 */
import got, { HTTPError } from 'got';
import scrapeIt from "scrape-it";
import * as helper from './utils/helper.js';
import { CookieJar } from 'tough-cookie';
import _debug from 'debug';
import { URL } from 'url';
import { performance } from "perf_hooks";
// Module-level logger created with the `debug` package under the 'nurl:BaseUrlResolver' namespace.
const logger = _debug('nurl:BaseUrlResolver');
/**
 * Base class for URL resolvers.
 *
 * A concrete resolver supplies `resolveInner(urlToResolve)`; this class wraps
 * it with a domain check, HTTP client setup, result post-processing, optional
 * metadata lookup and timing/log output. Scraping helpers from
 * `./utils/helper.js` are exposed as instance members for use by subclasses.
 */
export class BaseUrlResolver {
    /** Patterns checked by `canResolve`; each entry must expose `test(url)`. */
    domains;
    /** HTTP client; replaced by a `got.extend(...)` instance in `setupEnvironment`. */
    gotInstance = got;
    /** Value copied onto every playable result as `speedRank`. */
    _speedRank;
    /** When truthy, `setupEnvironment` attaches a new cookie jar to the HTTP client. */
    useCookies;
    scrapeItAsync = scrapeIt;
    scrapeHtml = scrapeIt.scrapeHTML;
    _cookieJar;
    defaultUserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36';
    /** Options passed to the most recent `resolve` call (may be undefined). */
    _resolverOptions;
    /** Context passed to the most recent `resolve` call. */
    _context;
    // Helper functions re-exported as instance members.
    getSecondLevelDomain = helper.getSecondLevelDomain;
    isValidHttpUrl = helper.isValidHttpUrl;
    getHiddenForm = helper.parseHiddenFormV2;
    scrapeLinkHref = helper.scrapeLinkHref;
    scrapePageTitle = helper.scrapePageTitle;
    scrapeInnerText = helper.scrapeInnerText;
    parseAllLinks = helper.parseAllLinks;
    parseElementAttributes = helper.parseElementAttributes;
    extractFileNameFromUrl = helper.extractFileNameFromUrl;
    wait = helper.wait;
    nodeatob = helper.nodeatob;
    nodebtoa = helper.nodebtoa;
    getServerPublicIp = helper.getServerPublicIp;
    parseScripts = helper.parseScripts;
    /**
     * Runs `helper.parseAllLinks` and keeps only `link` and `title` of each entry.
     * @param html passed through to `helper.parseAllLinks`
     * @param context passed through to `helper.parseAllLinks`
     * @param {string} [baseUrl=''] passed through to `helper.parseAllLinks`
     * @returns {{link: *, title: *}[]}
     */
    scrapeAllLinks(html, context, baseUrl = '') {
        return helper.parseAllLinks(html, context, baseUrl)
            .map(({ link, title }) => ({ link, title }));
    }
    /**
     * @param {object} options
     * @param options.domains patterns used by `canResolve`
     * @param {boolean} [options.useCookies] falsy values become `false`
     * @param {number} [options.speedRank] falsy values (including 0) become 30
     */
    constructor(options) {
        this.domains = options.domains;
        this.useCookies = options.useCookies || false;
        this._speedRank = options.speedRank || 30;
    }
    /**
     * Tells whether this resolver handles the given URL. The default
     * implementation tests the URL against every entry of `domains`.
     * Override this method to implement a custom check.
     * @param {string} urlToResolve
     * @returns {Promise<boolean>}
     */
    async canResolve(urlToResolve) {
        return this.domains.some((pattern) => pattern.test(urlToResolve));
    }
    /**
     * Resolves a URL through `resolveInner` and post-processes the results.
     *
     * Never rejects: if `canResolve` is false or throws, or if anything in the
     * resolution pipeline throws, the failure is logged and `[]` is returned.
     *
     * @param {string} urlToResolve
     * @param {object} [options] stored on the instance; `timeout` is read by
     *   `setupEnvironment`, `extractMetaInformation` enables the metadata lookup
     * @param [context] stored on the instance as `_context`
     * @returns {Promise<object[]>} resolved items, or `[]`
     */
    async resolve(urlToResolve, options, context) {
        this._resolverOptions = options;
        this._context = context;
        let canResolve = false;
        try {
            canResolve = await this.canResolve(urlToResolve);
        }
        catch (error) {
            logger(`Error occurred while calling canResolve BaseResolver: ${error}`);
        }
        if (!canResolve) {
            return [];
        }
        let status = 'ERROR';
        const startTime = performance.now();
        try {
            this.setupEnvironment();
            // concat (not spread) so that a single non-array return value is accepted too.
            const resolveResults = [].concat(await this.resolveInner(urlToResolve));
            for (const item of resolveResults) {
                item.parent = item.parent || urlToResolve;
            }
            const playableItems = resolveResults.filter((item) => item.isPlayable);
            for (const item of playableItems) {
                item.speedRank = this._speedRank;
            }
            // `options` is optional (setupEnvironment already treats it that way);
            // an unguarded read here used to throw and discard every result.
            if (options?.extractMetaInformation) {
                await Promise.all(playableItems.map((item) => this.fillMetaInfoInner(item)));
            }
            const result = this.massageResolveResults(resolveResults);
            status = result.length > 0 ? 'OK' : 'NOT FOUND';
            return result;
        }
        catch (error) {
            if (error instanceof HTTPError) {
                logger('http error %s %s', urlToResolve, error.message);
            }
            else if (error instanceof Error) {
                logger('unknown error %s %s', urlToResolve, error.message);
            }
            else {
                // Non-Error throwables were previously dropped without any trace.
                logger('unknown error %s %s', urlToResolve, String(error));
            }
        }
        finally {
            const timeTook = (performance.now() - startTime);
            logger('%s %s %sms', status, urlToResolve, timeTook.toFixed(0));
        }
        return [];
    }
    /**
     * Normalizes the `link` of every item in place by round-tripping it through
     * `URL` (e.g. spaces become escaped). Throws if a link is not a valid
     * absolute URL.
     * @param {object[]} resolveResults
     * @returns {object[]} a new array holding the same (mutated) items
     */
    massageResolveResults(resolveResults) {
        return resolveResults.map((item) => {
            item.link = new URL(item.link).href;
            return item;
        });
    }
    /**
     * Delegates to `fillMetaInfo`.
     * @param {object} resolveMediaItem
     */
    async fillMetaInfoInner(resolveMediaItem) {
        await this.fillMetaInfo(resolveMediaItem);
    }
    /**
     * Issues a HEAD request for the item's link (sending `item.headers` when
     * present) and copies the `content-length`, `last-modified` and
     * `content-type` response headers onto the item as `size`, `lastModified`
     * and `contentType`.
     * @param {object} resolveMediaItem
     */
    async fillMetaInfo(resolveMediaItem) {
        const { headers } = await this.gotInstance.head(resolveMediaItem.link, {
            headers: resolveMediaItem.headers || {}
        });
        resolveMediaItem.size = headers['content-length'];
        resolveMediaItem.lastModified = headers['last-modified'];
        resolveMediaItem.contentType = headers['content-type'];
    }
    /**
     * Builds the `got` instance used for this resolution: default user agent,
     * per-request timeout (`options.timeout` seconds, 20 when unset or falsy),
     * no retries, and a new cookie jar when `useCookies` is set.
     */
    setupEnvironment() {
        // NOTE(review): certificate verification is switched on only when
        // NODE_TLS_REJECT_UNAUTHORIZED is exactly '1'; with the variable unset
        // requests are sent with rejectUnauthorized: false. Confirm this is intended.
        const rejectUnauthorized = process.env.NODE_TLS_REJECT_UNAUTHORIZED;
        const gotOptions = {
            https: {
                rejectUnauthorized: rejectUnauthorized === '1'
            },
            headers: {
                'User-Agent': this.defaultUserAgent
            },
            timeout: {
                // by default let every individual request time out after 20 seconds
                request: (this._resolverOptions?.timeout || 20) * 1000
            },
            retry: {
                limit: 0
            }
        };
        if (this.useCookies) {
            this._cookieJar = new CookieJar();
            gotOptions.cookieJar = this._cookieJar;
        }
        this.gotInstance = got.extend(gotOptions);
    }
    /**
     * Extracts a hidden form from `page` via `getHiddenForm(page, ix)` and
     * POSTs it to `urlToPost`, using that same URL as the Referer.
     * @param {string} urlToPost
     * @param page markup handed to `getHiddenForm`
     * @param ix form selector/index handed to `getHiddenForm`
     * @param {boolean} [resolveBody=true] return the response body instead of the response
     * @param {boolean} [followRedirect] falsy values disable redirect following
     * @returns the response body or the full response, depending on `resolveBody`
     * @throws {Error} when no form is found
     */
    async postHiddenForm(urlToPost, page, ix, resolveBody = true, followRedirect) {
        const form = this.getHiddenForm(page, ix);
        if (!form) {
            throw new Error('No form found to post.');
        }
        const response = await this.gotInstance.post(urlToPost, {
            form,
            headers: {
                Referer: urlToPost
            },
            // following redirects can raise some unhandled error which can
            // potentially cause whole application shutdown, so it is off by default.
            followRedirect: followRedirect || false
        });
        return resolveBody ? response.body : response;
    }
}
/**
 * Resolver for hosts where the media link becomes available after submitting
 * a hidden form found on the landing page.
 *
 * Flow: fetch the page, post the hidden form (picked by `formIx`) to the URL
 * reported by the response, then read the link href matched by `selector`
 * from the returned markup.
 */
export class GenericFormBasedResolver extends BaseUrlResolver {
    /** Selector handed to `scrapeLinkHref` to locate the final link. */
    _selector;
    /** Form index handed to `postHiddenForm`; 0 when not supplied. */
    _formIx;
    /**
     * @param {object} options forwarded to `BaseUrlResolver`
     * @param selector selector for the link in the post-submit page
     * @param [formIx] index of the hidden form; falsy values become 0
     */
    constructor(options, selector, formIx) {
        super(options);
        this._selector = selector;
        this._formIx = formIx ? formIx : 0;
    }
    /**
     * @param {string} _urlToResolve
     * @returns {Promise<object[]>} a single playable item with `link` and `title`
     */
    async resolveInner(_urlToResolve) {
        const { url: landingUrl, body: landingMarkup } = await this.gotInstance(_urlToResolve);
        const submittedMarkup = await this.postHiddenForm(landingUrl, landingMarkup, this._formIx);
        const link = this.scrapeLinkHref(submittedMarkup, this._selector);
        return [{
            link,
            title: this.extractFileNameFromUrl(link),
            isPlayable: true
        }];
    }
}
//# sourceMappingURL=BaseResolver.js.map