unprint
Version:
Simplify common web scraping tasks while staying in control of the data.
1,142 lines (888 loc) • 26.5 kB
JavaScript
const { JSDOM, VirtualConsole } = require('jsdom');
const EventEmitter = require('events');
const http = require('http');
const https = require('https');
const tunnel = require('tunnel');
const axios = require('axios').default;
const Bottleneck = require('bottleneck');
const moment = require('moment-timezone');
const merge = require('deepmerge');
const settings = {
throwErrors: false,
logErrors: true,
requestTimeout: 30000,
limits: {
default: {
interval: 10,
concurrency: 10,
},
},
};
const virtualConsole = new VirtualConsole();
const { window: globalWindow } = new JSDOM('', { virtualConsole });
let globalOptions = {
...settings,
};
const events = new EventEmitter();
function configure(newOptions) {
globalOptions = merge(globalOptions, newOptions);
}
function handleError(error, code) {
if (globalOptions.logErrors) {
console.error(`unprint encountered an error (${code}): ${error.message}`);
}
if (globalOptions.throwErrors) {
throw Object.assign(error, { code });
}
return null;
}
virtualConsole.on('error', (message) => handleError(message, 'JSDOM'));
virtualConsole.on('jsdomError', (message) => handleError(message, 'JSDOM'));
function trim(string) {
if (typeof string === 'string') {
return string.trim().replace(/\s+/g, ' ');
}
return string;
}
function iterateXpathResult(iterator, results = []) {
const element = iterator.iterateNext();
if (element) {
return iterateXpathResult(iterator, results.concat(element));
}
return results;
}
function getElements(context, selector, firstOnly = false) {
if (!selector) {
return context.element;
}
if (/^\//.test(selector)) {
// XPath selector
const iterator = globalWindow.document.evaluate(`.${selector}`, context.element, null, globalWindow.XPathResult.ORDERED_NODE_ITERATOR_TYPE, null);
if (firstOnly) {
return iterator.iterateNext();
}
return iterateXpathResult(iterator);
}
if (firstOnly) {
return context.element.querySelector(selector);
}
return Array.from(context.element.querySelectorAll(selector));
}
function queryElement(context, selectors, _customOptions) {
if (!selectors && context.element.nodeName === '#document') {
return null;
}
const target = [].concat(selectors).reduce((acc, selector) => acc || getElements(context, selector, true), null);
return target || null;
}
function queryElements(context, selectors, _customOptions) {
if (!selectors) {
return context.element;
}
const targets = [].concat(selectors).reduce((acc, selector) => acc || getElements(context, selector, false), null);
return targets || [];
}
function queryExistence(context, selector, customOptions) {
return !!queryElement(context, selector, customOptions);
}
function queryCount(context, selector, customOptions) {
return queryElements(context, selector, customOptions)?.length || 0;
}
function getAttributeKey(options) {
if (!options) {
return null;
}
if (Object.hasOwn(options, 'attr')) {
return options.attr;
}
if (Object.hasOwn(options, 'attribute')) {
return options.attribute;
}
return null;
}
function extractContent(element, options) {
if (!element) {
return null;
}
const attributeKey = getAttributeKey(options);
if (attributeKey) {
// handle attribute extraction in content method so all methods can easily optionally query a specific attribute
const attribute = options.forceGetAttribute
? element.getAttribute(attributeKey)
: element[attributeKey] || element.getAttribute(attributeKey);
if (attribute && options.trim) {
return trim(attribute);
}
return attribute;
}
if (options.trim) {
return trim(element.textContent);
}
return element.textContent;
}
function queryContent(context, selector, customOptions) {
const options = {
...context.options,
trim: true,
...customOptions,
};
const target = queryElement(context, selector, options);
return extractContent(target, options);
}
function queryContents(context, selector, customOptions) {
const options = { ...context.options, ...customOptions };
const targets = queryElements(context, selector, options);
return targets.map((target) => extractContent(target, options)).filter(Boolean);
}
function queryAttribute(context, selector, attribute, customOptions) {
return queryContent(context, selector, {
...customOptions,
attribute,
});
}
function queryAttributes(context, selector, attribute, customOptions) {
return queryContents(context, selector, {
...customOptions,
attribute,
});
}
function queryDataset(context, selector, dataAttribute, customOptions) {
const target = queryElement(context, selector, customOptions);
if (target) {
return target.dataset[dataAttribute];
}
return null;
}
function queryDatasets(context, selector, dataAttribute, customOptions) {
const targets = queryElements(context, selector, customOptions);
return targets.map((target) => target.dataset[dataAttribute]);
}
const defaultNumberRegexp = /\d+([.,]\d+)?/;
function extractNumber(rawNumberString, customOptions) {
if (!rawNumberString) {
return null;
}
const options = {
match: defaultNumberRegexp,
matchIndex: 0,
separator: '.',
...customOptions,
};
const numberString = options.separator === ','
? rawNumberString.replace(',', '.')
: rawNumberString.replace(',', '');
if (numberString && options.match) {
const number = Number(numberString.match(options.match)?.[options.matchIndex]);
if (Number.isNaN(number)) {
return null;
}
return number;
}
if (numberString) {
const number = Number(numberString);
if (Number.isNaN(number)) {
return null;
}
return number;
}
return null;
}
function queryNumber(context, selector, customOptions) {
const numberString = queryContent(context, selector, customOptions);
return extractNumber(numberString, customOptions);
}
function queryNumbers(context, selector, customOptions) {
const numberStrings = queryContents(context, selector, customOptions);
if (!numberStrings) {
return null;
}
return numberStrings
.map((numberString) => extractNumber(numberString, customOptions))
.filter(Boolean);
}
function queryHtml(context, selector, customOptions) {
const target = queryElement(context, selector, customOptions);
if (target) {
return trim(target.innerHTML);
}
return null;
}
function queryHtmls(context, selector, customOptions) {
const targets = queryElements(context, selector, customOptions);
return targets.map((target) => trim(target.innerHTML));
}
function extractText(target, customOptions) {
const options = {
filter: true,
trim: true,
join: true,
...customOptions,
};
const nodes = Array.from(target.childNodes)
.filter((node) => node.nodeName === '#text')
.map((node) => (options.trim ? trim(node.textContent) : node.textContent));
const filteredNodes = options.filter
? nodes.filter(Boolean)
: nodes;
if (options.join) {
const text = filteredNodes.join(typeof options.join === 'string' ? options.join : ' ');
if (options.trim) {
return text.trim();
}
return text;
}
return filteredNodes;
}
function queryText(context, selector, customOptions) {
const target = queryElement(context, selector, customOptions);
if (!target) {
return null;
}
return extractText(target, customOptions);
}
function queryTexts(context, selector, customOptions) {
const targets = queryElements(context, selector, customOptions);
return targets.map((target) => extractText(target, customOptions));
}
function prefixUrl(urlPath, originUrl, customOptions) {
if (!urlPath) {
return null;
}
const options = {
protocol: 'https',
...customOptions,
};
if (/^http/.test(urlPath)) {
// this is already a complete URL
return urlPath;
}
if (options.protocol && /^\/\//.test(urlPath)) {
return `${options.protocol.replace(/:$/, '')}:${urlPath}`; // allow protocol to be defined either as 'https' or 'https:'
}
if (!originUrl) {
return urlPath;
}
const { origin, protocol } = new URL(originUrl);
if (protocol && /^\/\//.test(urlPath)) {
return `${protocol}${urlPath}`;
}
if (/^\//.test(urlPath)) {
return `${origin}${urlPath}`;
}
if (/^\.\//.test(urlPath)) {
return `${originUrl.replace(/\/+$/, '')}${urlPath.slice(1)}`;
}
return `${origin}/${urlPath}`;
}
function queryUrl(context, selector = 'a', customOptions) {
const options = {
...context.options,
attribute: 'href',
forceGetAttribute: true, // don't get origin URL when empty
...customOptions,
};
const url = queryContent(context, selector, options);
return prefixUrl(url, options.origin, customOptions);
}
function queryUrls(context, selector = 'a', customOptions) {
const options = {
...context.options,
attribute: 'href',
forceGetAttribute: true, // don't get origin URL when empty
...customOptions,
};
const urls = queryContents(context, selector, options);
return urls.map((url) => prefixUrl(url, options.origin, customOptions));
}
function getImageUrl(context, selector, options) {
const attributeKey = getAttributeKey(options);
if (attributeKey) {
return queryAttribute(context, selector, attributeKey, options);
}
return queryAttribute(context, selector, 'data-src', options)
|| queryAttribute(context, selector, 'src', options);
}
function getImageUrls(context, selector, options) {
const attributeKey = getAttributeKey(options);
if (attributeKey) {
return queryAttributes(context, selector, attributeKey, options);
}
const dataLinks = queryAttributes(context, selector, 'data-src', options);
if (dataLinks.lenght > 0) {
return dataLinks;
}
return queryAttributes(context, selector, 'src', options);
}
function queryImage(context, selector = 'img', customOptions) {
const options = {
...context.options,
...customOptions,
forceGetAttribute: true, // don't get origin URL when empty
};
const imageUrl = getImageUrl(context, selector, options);
return prefixUrl(imageUrl, options.origin, options);
}
function queryImages(context, selector = 'img', customOptions) {
const options = {
...context.options,
...customOptions,
forceGetAttribute: true, // don't get origin URL when empty
};
const imageUrls = getImageUrls(context, selector, options);
return imageUrls.map((imageUrl) => prefixUrl(imageUrl, options.origin, options));
}
function extractSourceSet(sourceSet, customOptions) {
if (!sourceSet) {
return null;
}
const sources = sourceSet
.split(/\s*,\s+/)
.map((source) => {
const [link, descriptor] = source.trim().split(' ');
if (link) {
return {
descriptor: descriptor || 'fallback',
url: prefixUrl(link, customOptions.origin, customOptions.protocol),
};
}
return null;
})
.filter(Boolean)
.sort((sourceA, sourceB) => {
if (sourceB.descriptor === 'fallback' || parseInt(sourceA.descriptor, 10) > parseInt(sourceB.descriptor, 10)) {
return -1;
}
if (parseInt(sourceA.descriptor, 10) < parseInt(sourceB.descriptor, 10)) {
return 1;
}
return 0;
});
if (customOptions.includeDescriptor) {
return sources.map((source) => ({
descriptor: source.descriptor,
url: prefixUrl(source.url),
}));
}
return sources.map((source) => prefixUrl(source.url));
}
function querySourceSet(context, selector, attr = 'srcset', customOptions = {}) {
const sourceSet = queryAttribute(context, selector, attr, customOptions);
return extractSourceSet(sourceSet, customOptions);
}
function querySourceSets(context, selector, attr = 'srcset', customOptions = {}) {
const sourceSets = queryAttributes(context, selector, attr, customOptions);
return sourceSets.map((sourceSet) => extractSourceSet(sourceSet, customOptions));
}
function removeStyleFunctionSpaces(el) {
// jsdom appears to have a bug where it ignores inline CSS attributes set to a function() containing spaces, e.g. url( image.png )
el.setAttribute('style', el.getAttribute('style')
.replace(/\(\s+(.*)\s+\)/g, (match, cssArgs) => `(${cssArgs})`)
.replace(/\)[\w\s/-]+;/g, ');'));
}
function queryStyle(context, selector, customOptions) {
const options = {
attribute: 'style',
attemptBugfix: true,
...customOptions,
};
const element = queryElement(context, selector, options);
if (element) {
if (options.attemptBugfix) {
removeStyleFunctionSpaces(element, options);
}
if (element.style) {
return options.styleAttribute
? element.style.getPropertyValue(options.styleAttribute)
: element.style._values;
}
}
return null;
}
function queryStyles(context, selector, customOptions) {
const options = {
...customOptions,
attribute: 'style',
};
const elStyles = queryElements(context, selector, options).map((element) => {
removeStyleFunctionSpaces(element);
if (element.style) {
return options.styleAttribute
? element.style.getPropertyValue(options.styleAttribute)
: element.style._values;
}
return null;
});
return elStyles.filter(Boolean);
}
function queryStyleUrl(context, selector, styleAttribute, customOptions) {
const options = {
styleAttribute,
...customOptions,
};
const style = queryStyle(context, selector, options);
if (!style) {
return null;
}
const url = style.match(/url\(['"]?(.*)['"]?\)/)?.[1];
return url;
}
function queryStyleUrls(context, selector, styleAttribute, customOptions) {
const options = {
styleAttribute,
...customOptions,
};
const styles = queryStyles(context, selector, options);
const urls = styles.map((style) => style.match(/url\(['"]?(.*)['"]?\)/)?.[1])?.filter(Boolean);
return urls;
}
function queryBackground(context, selector, customOptions) {
return queryStyleUrl(context, selector, 'background-image', customOptions);
}
function queryBackgrounds(context, selector, customOptions) {
return queryStyleUrls(context, selector, 'background-image', customOptions);
}
function queryVideo(context, selector = 'video source', customOptions) {
const options = {
...context.options,
attribute: 'src',
forceGetAttribute: true, // don't get origin URL when empty
...customOptions,
};
const videoUrl = queryContent(context, selector, options);
return prefixUrl(videoUrl, options.origin, options);
}
function queryVideos(context, selector = 'video source', customOptions) {
const options = {
...context.options,
attribute: 'src',
forceGetAttribute: true, // don't get origin URL when empty
...customOptions,
};
const videoUrls = queryContents(context, selector, options);
return videoUrls.map((videoUrl) => prefixUrl(videoUrl, options.origin, options));
}
function queryPoster(context, selector = 'video', customOptions) {
const options = {
attribute: 'poster',
...customOptions,
};
const posterUrl = queryContent(context, selector, options);
return prefixUrl(posterUrl, options.origin, options);
}
function queryPosters(context, selector = 'video', customOptions) {
const options = {
attribute: 'poster',
...customOptions,
};
const posterUrls = queryContents(context, selector, options);
return posterUrls.map((posterUrl) => prefixUrl(posterUrl, options.origin, options));
}
function extractJson(dataString) {
if (!dataString) {
return null;
}
try {
return JSON.parse(dataString);
} catch (error) {
return null;
}
}
function queryJson(context, selector, customOptions) {
const dataString = queryContent(context, selector, customOptions);
return extractJson(dataString);
}
function queryJsons(context, selector, customOptions) {
const dataStrings = queryContents(context, selector, customOptions);
return dataStrings.map((dataString) => extractJson(dataString)).filter(Boolean);
}
function extractDate(dateString, format, customOptions) {
if (!dateString) {
return null;
}
if (!format) {
return handleError(new Error('Missing required date format parameter'), 'NO_DATE_FORMAT');
}
const options = {
match: /((\d{1,4}[/-]\d{1,2}[/-]\d{1,4})|(\w+\s+\d{1,2},?\s+\d{4}))((T|\s+)\d{1,2}:\d{2}(:\d{2})?)?/g, // matches any of 01-01-1970, 1970-01-01 and January 1, 1970 with optional 00:00[:00] time
matchIndex: 0,
timezone: 'UTC',
...customOptions,
};
const dateStamp = options.match
? trim(dateString).match(options.match)
: trim(dateString);
if (dateStamp) {
const dateValue = moment.tz(options.match ? dateStamp[options.matchIndex] : dateStamp, format, options.timezone);
if (dateValue.isValid()) {
return dateValue.toDate();
}
}
return null;
}
function queryDate(context, selector, format, customOptions) {
const dateString = queryContent(context, selector, customOptions);
return extractDate(dateString, format, {
...context.options,
...customOptions,
});
}
function queryDates(context, selector, format, customOptions) {
const dateStrings = queryContents(context, selector, customOptions);
return dateStrings.map((dateString) => extractDate(dateString, format, {
...context.options,
customOptions,
}));
}
function formatDate(dateValue, format, inputFormat) {
if (inputFormat) {
return moment(dateValue, inputFormat).format(format);
}
return moment(dateValue).format(format);
}
function extractDuration(durationString, match) {
const durationMatch = durationString?.match(match || /(\d+:)?\d+:\d+/);
if (durationMatch) {
const segments = ['00'].concat(durationMatch[0].split(/[:hm]/)).slice(-3);
return moment.duration(segments.join(':')).asSeconds();
}
return null;
}
function extractTimestamp(durationString) {
const timestampMatch = durationString?.match(/(\d+\s*H)?.*(\d+\s*M)?.*(\d+\s*(S|$))?/i)?.[0];
if (timestampMatch) {
const hours = timestampMatch.match(/(\d+)\s*H/i)?.[1] || 0;
const minutes = timestampMatch.match(/(\d+)\s*M/i)?.[1] || 0;
const seconds = timestampMatch.match(/(\d+)\s*(s(ec\w*)?)|$/i)?.[1] || 0;
return (Number(hours) * 3600) + (Number(minutes) * 60) + Number(seconds);
}
return null;
}
function queryDuration(context, selector, customOptions) {
const options = { ...customOptions };
const durationString = queryContent(context, selector, customOptions);
if (!durationString) {
return null;
}
if (options.match) {
return extractDuration(durationString, options.match);
}
return extractDuration(durationString)
|| extractTimestamp(durationString)
|| null;
}
const queryFns = {
element: queryElement,
elements: queryElements,
el: queryElement,
els: queryElements,
all: queryElements,
content: queryContent,
contents: queryContents,
attribute: queryAttribute,
attributes: queryAttributes,
attr: queryAttribute,
attrs: queryAttributes,
background: queryBackground,
backgrounds: queryBackgrounds,
dataset: queryDataset,
datasets: queryDatasets,
data: queryDataset,
datas: queryDatasets,
exists: queryExistence,
count: queryCount,
html: queryHtml,
htmls: queryHtmls,
image: queryImage,
images: queryImages,
img: queryImage,
imgs: queryImages,
json: queryJson,
jsons: queryJsons,
style: queryStyle,
styles: queryStyles,
styleUrl: queryStyleUrl,
styleUrls: queryStyleUrls,
number: queryNumber,
num: queryNumber,
numbers: queryNumbers,
nums: queryNumbers,
poster: queryPoster,
posters: queryPosters,
date: queryDate,
dates: queryDates,
duration: queryDuration,
dur: queryDuration,
sourceSet: querySourceSet,
sourceSets: querySourceSets,
srcSet: querySourceSet,
srcSets: querySourceSets,
text: queryText,
texts: queryTexts,
url: queryUrl,
urls: queryUrls,
video: queryVideo,
videos: queryVideos,
};
function isDomObject(element) {
if (!element) {
return false;
}
return typeof element.nodeType !== 'undefined';
}
function initQueryFns(fns, context) {
if (context) {
return Object.fromEntries(Object.entries(fns).map(([key, fn]) => [key, (...args) => {
events.emit('query', {
key,
args,
origin: context.options.origin,
});
return fn(context, ...args);
}]));
}
// context is passed directly to query method
return Object.fromEntries(Object.entries(fns).map(([key, fn]) => [key, (...args) => {
// first argument is already an unprint context. this seems like a convoluted approach, but there is little reason not to allow it
if (args[0]?.isUnprint) {
events.emit('query', {
key,
args,
});
return fn(...args);
}
// most common usage is to pass an element directly, convert to context
if (isDomObject(args[0])) {
const element = args[0];
events.emit('query', {
key,
args,
});
return fn({
element,
html: element.outerHTML || element.body?.outerHTML,
isUnprint: true,
}, ...args.slice(1));
}
return handleError(new Error('Context is not provided or initialized'), 'INVALID_CONTEXT');
}]));
}
function init(elementOrHtml, selector, options = {}) {
if (!elementOrHtml) {
return null;
}
if (typeof elementOrHtml === 'string') {
// the context should be raw HTML
const { window } = new JSDOM(elementOrHtml, { virtualConsole, ...options.parser });
return init(window.document, selector, { ...options, window });
}
if (!isDomObject(elementOrHtml)) {
// the context is not a valid
return handleError(new Error('Init context is not a DOM element, HTML or an array'), 'INVALID_CONTEXT');
}
const element = selector
? queryElement({ element: elementOrHtml }, selector)
: elementOrHtml;
if (!element) {
return null;
}
const context = {
element,
html: element.outerHTML || element.body?.outerHTML,
...(options.window && {
window: options.window,
document: options.window.document,
}),
options,
isUnprint: true,
};
context.query = initQueryFns(queryFns, context);
return context;
}
function initAll(context, selector, options = {}) {
if (Array.isArray(context)) {
return context.map((element) => init(element, selector, options));
}
if (typeof context === 'string') {
// the context should be raw HTML
const { window } = new JSDOM(context, { virtualConsole, ...options.parser });
return initAll(window.document, selector, { ...options, window });
}
if (!isDomObject(context)) {
// the context is not a valid
return handleError(new Error('Init context is not a DOM element, HTML or an array'), 'INVALID_CONTEXT');
}
return queryElements({ element: context }, selector)
.map((element) => init(element, null, options));
}
const limiters = {
default: new Bottleneck(),
};
function getLimiterValue(prop, options, hostname) {
if (options[prop] !== undefined) {
return options[prop];
}
if (options.limits[hostname]?.enable !== false && options.limits[hostname]?.[prop] !== undefined) {
return options.limits[hostname][prop];
}
return options.limits.default[prop];
}
function getLimiter(url, options) {
const { hostname } = new URL(url);
const interval = getLimiterValue('interval', options, hostname);
const concurrency = getLimiterValue('concurrency', options, hostname);
if (!limiters[interval]?.[concurrency]) {
limiters[interval] = limiters[interval] || {};
limiters[interval][concurrency] = new Bottleneck({
minTime: interval,
maxConcurrent: concurrency,
timeout: options.timeout + 10000, // timeout 10 seconds after axious should
});
}
return {
limiter: limiters[interval][concurrency],
interval,
concurrency,
};
}
/* eslint-disable no-param-reassign */
function setProxy(instance, options, url) {
const { hostname } = new URL(url);
if (options.proxy
&& options.proxy.enable !== false
&& options.proxy.use !== false // use is a local override for enable
&& (options.proxy.use
|| options.proxy.hostnames?.includes(hostname))
) {
const proxyAgent = tunnel.httpsOverHttp({
proxy: {
host: options.proxy.host,
port: options.proxy.port,
},
});
instance.defaults.httpAgent = proxyAgent;
instance.defaults.httpsAgent = proxyAgent;
return true;
}
instance.defaults.httpAgent = options.httpsAgent || new http.Agent({ ...options.agent });
instance.defaults.httpsAgent = options.httpsAgent || new https.Agent({ ...options.agent });
return false;
}
/* eslint-enable no-param-reassign */
async function request(url, body, customOptions = {}, method = 'GET') {
const options = merge.all([{
timeout: 1000,
extract: true,
url,
}, globalOptions, customOptions]);
const { limiter, interval, concurrency } = getLimiter(url, options);
const instance = axios.create({
method,
validateStatus: null,
headers: options.headers,
timeout: options.timeout,
signal: options.abortSignal,
// ...options,
// httpAgent: options.httpAgent || new http.Agent({ ...options.agent }),
});
const isProxied = setProxy(instance, options, url);
const feedbackBase = {
url,
method,
interval,
concurrency,
isProxied,
options,
};
events.emit('requestInit', feedbackBase);
const res = await limiter.schedule(async () => instance.request({
url,
data: body,
}));
if (!(res.status >= 200 && res.status < 300)) {
handleError(new Error(`HTTP response from ${url} not OK (${res.status} ${res.statusText}): ${res.data}`), 'HTTP_NOT_OK');
events.emit('requestError', {
...feedbackBase,
status: res.status,
statusText: res.statusText,
});
return {
ok: false,
status: res.status,
statusText: res.statusText,
headers: res.headers,
response: res,
res,
};
}
const base = {
ok: true,
status: res.status,
statusText: res.statusText,
headers: res.headers,
response: res,
res,
};
events.emit('requestSuccess', {
...feedbackBase,
status: res.status,
statusText: res.statusText,
});
if (['application/json', 'application/javascript'].some((type) => res.headers['content-type'].includes(type)) && typeof res.data === 'object') {
return {
...base,
data: res.data,
};
}
if (!options.extract) {
return base;
}
const contextOptions = {
...customOptions,
origin: url,
};
const context = options.selectAll
? initAll(res.data, options.selectAll, contextOptions)
: init(res.data, options.select, contextOptions);
return {
...base,
context,
};
}
async function get(url, options) {
return request(url, null, options, 'GET');
}
async function post(url, body, options) {
return request(url, body, options, 'POST');
}
function on(trigger, fn) {
events.on(trigger, fn);
}
function off(trigger, fn) {
events.off(trigger, fn);
}
module.exports = {
configure,
on,
off,
events,
get,
post,
request,
initialize: init,
initializeAll: initAll,
init,
initAll,
extractDate,
extractDuration,
extractNumber,
extractTimestamp,
formatDate,
dateConstants: {
ISO_8601: moment.ISO_8601,
...moment.HTML5_FMT,
},
prefixUrl,
options: configure,
query: initQueryFns(queryFns),
};
;