node-scrapy
Version:
Simple, lightweight and expressive web scraping with Node.js
67 lines (56 loc) • 1.9 kB
JavaScript
const nearley = require('nearley')
const grammar = require('../dist/grammar')
const { trim } = require('./utils')
// Build the nearley grammar object once at module load; each parseQuery call creates its own
// parser from it.
const compiledGrammar = nearley.Grammar.fromCompiled(grammar)
// This matches a pair of parentheses and its content at the end of the string, either at the start
// of the string or preceded by a whitespace character. Examples:
// "(whatever content here)"
// "irrelevant text (whatever content here)"
// In both cases, capture group 1 carries the "(whatever content here)" bit; in the latter case the
// full match (index 0) also includes the single leading whitespace character.
// Note that `.` does not match line terminators, so the parenthesised part must sit on one line.
const queryExtensionPattern = /(?:^|\s)(\(.*\))$/
// This matches the opening parenthesis plus any whitespace around it at the start of the string,
// and the closing parenthesis plus any whitespace around it at the end of the string. It is used
// to clean up the text matched above: in both cases above, a String#replace with an empty string
// leaves us with:
// "whatever content here"
const leadingAndTrailingBoundaries = /(^\s*\(\s*)|(\s*\)\s*$)/g
/**
 * Parse node-scrapy query syntax into a handy object.
 *
 * A query is a selector optionally followed by a parenthesised "extension" at the very end of the
 * string, e.g. `.link (href | trim:both)`. The selector is everything before the extension; the
 * text inside the parentheses is fed to a nearley parser built from the compiled grammar, and the
 * first truthy parse result is merged into the returned object.
 *
 * The returned object always has `selector`, `getter` and `filters` keys: when there is no
 * extension, or the parser yields no result for it, `getter` is `null` and `filters` is `[]`.
 *
 * NOTE(review): nearley's `Parser#feed` is expected to throw on input the grammar rejects; that
 * error is deliberately not caught here and propagates to the caller.
 *
 * @static
 * @public
 * @param {string} query Query string
 * @return {Object} Parsed query
 *
 * @example
 * const query = '.link (href | trim:both)'
 *
 * parseQuery(query)
 * // => {
 * //   selector: '.link',
 * //   getter: 'href',
 * //   filters: [{ name: 'trim', args: ['both'] }]
 * // }
 */
function parseQuery(query) {
  const trimmed = trim(query)
  const match = trimmed.match(queryExtensionPattern)

  if (!match) {
    return {
      selector: trimmed,
      getter: null,
      filters: [],
    }
  }

  // Capture group 1 is the "(...)" part without the optional leading whitespace.
  const queryExtension = match[1]

  // Cut the selector at the position where the regex actually matched. Searching for the matched
  // text again with indexOf could hit an earlier, identical occurrence in multi-line input.
  const selector = trim(trimmed.slice(0, match.index))
  const queryExtensionContent = queryExtension.replace(leadingAndTrailingBoundaries, '')

  const parser = new nearley.Parser(compiledGrammar)
  parser.feed(queryExtensionContent)

  // Use the first result. The defaults keep the returned shape consistent with the
  // "no extension" branch above when the parser produced no (truthy) result.
  return {
    selector,
    getter: null,
    filters: [],
    ...parser.results.find(Boolean),
  }
}
// Public API of this module: only the query parser is exported.
module.exports = {
parseQuery,
}