UNPKG

node-scrapy

Version:

Simple, lightweight and expressive web scraping with Node.js

109 lines (90 loc) 3.65 kB
const { ModelError, UnknownFilterError } = require('./errors') const { parseQuery } = require('./query-parser') const { has, isArray, isObject, isString } = require('./utils') /** * Given a `dom`, traverse it to get the desired item * @static * @private * @param {Object} dom DOM node * @param {(string|Array|Object)} item Data to extract * @param {Object} selectorEngine Selector engine * @param {Object} getters A collection of getter functions * @param {Object} filters A collection of filter functions * @return {string} A string or an array of strings with the extracted data */ function extractItem(dom, item, selectorEngine, getters, filters) { if (isArray(item)) { const queryAST = parseQuery(item[0]) const matches = queryAST.selector ? selectorEngine.selectAll(queryAST.selector, dom) : [dom] if (!matches || !matches.length) return null if (isArray(item[1]) || isObject(item[1])) { return matches.map((context) => extractItem(context, item[1], selectorEngine, getters, filters) ) } return matches.map((node) => { const data = resolveGetter(getters, queryAST)(node) return applyFilters(filters, queryAST, data) }) } if (isObject(item)) { return Object.keys(item).reduce((acc, key) => { acc[key] = extractItem(dom, item[key], selectorEngine, getters, filters) return acc }, {}) } if (isString(item)) { const queryAST = parseQuery(item) const match = queryAST.selector ? selectorEngine.selectOne(queryAST.selector, dom) : dom if (!match) return null const data = resolveGetter(getters, queryAST)(match) return applyFilters(filters, queryAST, data) } const unsupportedType = item === null ? 'null' : typeof item throw new ModelError( `The model has to be a string, an Object or an Array; got ${unsupportedType} instead.` ) } /** * Given a getters collection and a query, decide what getter function to use * @static * @private * @param {Object} collection Simple collection of getter functions * @param {Object} query Query object, containing a getter property * @return {Function} A getter function */ function resolveGetter(collection, query) { // Default to $textContent if no getter was provided in query if (query.getter === null) return collection.$textContent // Use requested getter function if it is part of the getters collection if (has(collection, query.getter)) return collection[query.getter] // Simply return node attribute if a getter was requested in query // but doesn't exist in the getters collection return (el) => collection.attribute(el, query.getter) } /** * Given a filters collection, a query object, and data to process, apply all * requested filters by query over the data * @static * @private * @param {Object} collection Simple collection of filter functions * @param {Object} query Query object, containing a filters array property * @param {string} data Data to be transformed * @return {string} Data after all filters have been applied in order */ function applyFilters(collection, query, data) { // Apply each filter declared in the query, passing the result of the last as // the argument of the next return query.filters.reduce((result, filter) => { // Check if the requested filter exists in the filters collection if (has(collection, filter.name)) { // if so, call it return collection[filter.name].apply(null, [result, ...filter.args]) } // if not, throw error throw new UnknownFilterError(`Filter ${filter.name} does not exist.`) }, data) } module.exports = { extractItem, }