/*
 * modelscrape
 * Version:
 * Cheerio-based scraping helper to scrape websites based on models/templates.
 * 183 lines (158 loc) • 6.24 kB
 * JavaScript
 */
const trae = require("trae");
const cheerio = require("cheerio");
const { makeUrl } = require("./config");
const { Entity, Prop, Page } = require("./types");
const { validateParam, validateQuery } = require("./utils/validators");
const { AttributeError } = require("./errors");
/**
 * Build a page loader bound to one domain.
 *
 * `domain` is handed to `makeUrl`, and the function it returns is later
 * called with each query's `endpoint` to obtain the URL to request.
 *
 * @param {*} domain Value forwarded to `makeUrl`
 * @return {Function} Async loader: takes a query and resolves to a `Page`
 */
function fetchHtmlAndAddCheerio(domain) {
  const resolveEndpoint = makeUrl(domain);

  /**
   * Download the document for one query and wrap it in a Page.
   *
   * @param {Query} query Its `endpoint` is resolved to the URL to fetch;
   *   the whole query is also passed to the `Page` constructor
   * @return {Promise<Page>} Page whose Cheerio handle was set from the
   *   response `data`
   */
  return async function(query) {
    const target = resolveEndpoint(query.endpoint);
    const { data: html } = await trae.get(target);

    // The page is only created once the download has succeeded.
    const loadedPage = new Page(query);
    const $ = cheerio.load(html);
    loadedPage.setCheerio($);
    return loadedPage;
  };
}
/**
 * Read the attributes a model asks for from a single parsed node.
 *
 * Only an own `attrs` property on the model is honoured. Every requested
 * name appears in the result; names the node does not carry map to `null`.
 *
 * @param {Object} collection Entity or prop model; its optional `attrs`
 *   property must be an array of attribute names
 * @param {Object} $node Single parsed node; its `attribs` map is read when
 *   present (nodes without one yield `null` for every name)
 * @return {Object} Attribute name -> value (or `null`); `{}` when the model
 *   declares no `attrs`
 * @throws The error built by `AttributeError.mustBeArray()` when `attrs` is
 *   defined but is not an array
 */
function selectNodeAttributes(collection, $node) {
  const hasOwn = Object.prototype.hasOwnProperty;

  // Continue only if the model itself declares a usable attrs prop.
  // Borrowing hasOwnProperty keeps this safe for prototype-less models.
  if (!hasOwn.call(collection, "attrs") || collection.attrs === undefined) {
    return {};
  }
  if (!Array.isArray(collection.attrs)) {
    throw AttributeError.mustBeArray();
  }

  // Text and comment nodes have no `attribs` map: report every requested
  // attribute as missing instead of failing on an undefined lookup.
  const available = ($node && $node.attribs) || {};

  return collection.attrs.reduce((attributes, current) => {
    // Own-property check so names such as "toString" or "constructor" are
    // not answered with members inherited from Object.prototype.
    const value = hasOwn.call(available, current)
      ? available[current]
      : undefined;
    attributes[current] = value !== undefined ? value : null;
    return attributes;
  }, {});
}
/**
 * Bind a loaded Cheerio handle once and get back an extractor that can be
 * applied to any number of entity models.
 *
 * @param {Cheerio} $ Cheerio object after loading data
 * @return {Function} Extractor: takes an entity model, returns an array of
 *   `Entity` instances (one per node matched by the model's query)
 */
function produceEntityCollection($) {
  const ELEMENT_NODE = 1;
  const TEXT_NODE = 3;

  // Value of a matched prop node:
  //  - element with children -> data of its direct text children (may be [])
  //  - element without children -> null
  //  - text node -> its data
  //  - anything else -> null
  function readNodeValue(node) {
    if (node.nodeType === ELEMENT_NODE) {
      if (node.children.length > 0) {
        const texts = [];
        node.children.forEach(child => {
          if (child.nodeType !== TEXT_NODE) return;
          const text = child.data;
          if (text === null || text === undefined) return;
          texts.push(text);
        });
        return texts;
      }
      return null;
    }
    return node.nodeType === TEXT_NODE ? node.data : null;
  }

  // One Prop per matched node, carrying its value and requested attributes.
  function buildProp(propModel, propNode) {
    const instance = new Prop({ name: propModel.name });
    instance.value = readNodeValue(propNode);
    instance.attrs = selectNodeAttributes(propModel, propNode);
    return instance;
  }

  // No match -> null, one match -> that Prop, several -> the whole array.
  function collapseMatches(matches) {
    if (matches.length > 1) return matches;
    return matches.length > 0 ? matches[0] : null;
  }

  /**
   * Extract every entity described by the model from the bound document.
   *
   * @param {EntityModel} entityModel Reads `name`, `query`, `props` (and
   *   `attrs` through selectNodeAttributes)
   * @return {Entity[]}
   */
  return function(entityModel) {
    const propModels = entityModel.props;
    const entityQuery = entityModel.query;

    function buildEntity(entityNode) {
      const entity = new Entity({ name: entityModel.name });
      entity.attrs = selectNodeAttributes(entityModel, entityNode);

      // Props are keyed by the name given in each prop model; the prop
      // query is evaluated relative to the current entity node.
      const propsByName = {};
      propModels.forEach(propModel => {
        const matches = $(propModel.query, entityNode)
          // Cheerio's .map (index first), not Array.prototype.map
          .map((index, propNode) => buildProp(propModel, propNode))
          .get();
        propsByName[propModel.name] = collapseMatches(matches);
      });
      entity.props = propsByName;
      return entity;
    }

    return (
      $(entityQuery)
        // Cheerio's .map (index first), not Array.prototype.map
        .map((index, entityNode) => buildEntity(entityNode))
        .get()
    );
  };
}
/**
 * Run every collection model of a page against that page's Cheerio handle.
 *
 * @param {PageTemplate} pageTemplate Object exposing `$` (Cheerio handle)
 *   and `collections` (entity models)
 * @return {Array} One extraction result per entry of `collections`
 */
function populatePageCollections(pageTemplate) {
  // Bind the handle once; the resulting extractor is reused for each model.
  const extractEntities = produceEntityCollection(pageTemplate.$);
  return pageTemplate.collections.map(extractEntities);
}
/**
* Loader for scrapping
*
* @param {object} {url: string, pages: Array} - Url and Pages template collection
* @return Promise
*/
module.exports = async param => {
try {
const { url, queryObjects: queryArray } = validateParam(param);
queryArray.forEach(validateQuery);
const pagesArray = await Promise.all(
queryArray.map(fetchHtmlAndAddCheerio(url))
);
return pagesArray.map(populatePageCollections);
} catch (error) {
return error;
}
};
;