UNPKG

stew-select

Version:

CSS selectors that allow regular expressions. Stew is a meatier soup.

334 lines (309 loc) 16.7 kB
fs               = require 'fs'
path             = require 'path'
HOMEDIR          = path.join(__dirname,'..')
# Prefer the instrumented `lib-cov` directory when it exists, otherwise use `lib`.
LIB_DIR          = if fs.existsSync(path.join(HOMEDIR,'lib-cov')) then path.join(HOMEDIR,'lib-cov') else path.join(HOMEDIR,'lib')
DOMUtil          = require(path.join(LIB_DIR,'dom-util')).DOMUtil
PredicateFactory = require(path.join(LIB_DIR,'predicate-factory')).PredicateFactory

# **Stew** is a DOM selection engine that
# supports the full CSS selector syntax
# as well as CSS selectors extended with
# regular expressions.
#
# Method names that start with `_` are subject
# to change without notice.  Other methods may be
# considered a part of the public API.
class Stew

  # **The Stew constructor** accepts an optional `DOMUtil` instance
  # (allowing callers to configure the `DOMUtil` used by `Stew`).
  # When none is given, a default `DOMUtil` is created.
  constructor:(dom_util)->
    @factory = new PredicateFactory()
    @dom_util = dom_util ? new DOMUtil()

  # **select** selects nodes from the given `dom`
  # that match the given `selector`.
  #
  # If `selector` is a string, it will be parsed as
  # described in the README. Otherwise `selector`
  # is assumed to be a predicate function
  # (like those generated by `PredicateFactory`).
  #
  # If `dom` is a string, it will be parsed as HTML
  # (using `DOMUtil.parse_html`, which see). If `dom`
  # is a single node, the given `selector` will be
  # applied to it. If `dom` is an array of nodes,
  # the given `selector` will be applied to each
  # element in turn.
  #
  # This results in an array of matching nodes.
  #
  # If a `callback` is provided, the resulting array is
  # passed to it (assuming the signature
  # `callback(err,nodeset)`). Otherwise
  # the resulting array is returned by this function.
  #
  # Note that when `dom` is a string, a callback method
  # *must* be provided (since our HTML parsing
  # is asynchronous); an `Error` is thrown otherwise.
  # When `dom` is an object, the callback method is
  # optional (but will be used when present).
  select:(dom,selector,callback)->
    if typeof selector is 'string'
      selector = @_parse_selectors(selector)
    if typeof dom is 'string'
      if callback?
        @dom_util.parse_html dom, (err, parsed)=>
          if err?
            callback(err)
          else
            callback(null,@_unguarded_select(parsed,selector))
      else
        throw new Error('When select is invoked on a string object, the `callback(err,nodeset)` parameter is required.')
    else
      nodeset = @_unguarded_select(dom,selector)
      callback?(null,nodeset)
      return nodeset

  # **_unguarded_select** is the "inner" method
  # for `select`. It assumes `dom` is a node or
  # array of nodes and that `predicate` is a
  # predicate function. It returns an array of
  # matching nodes. (Generally this method
  # will not be directly called by clients.)
  _unguarded_select:(dom,predicate)->
    result = []
    visit = (node,parent,path,siblings,sib_index)->
      if predicate(node,parent,path,siblings,sib_index)
        result.push node
      # Always keep walking: every match is wanted, including nested ones.
      return { 'continue':true, 'visit_children':true }
    @dom_util.walk_dom dom, visit:visit
    return result

  # **select_first** selects the first node in the
  # given `dom` that matches the given `selector`.
  #
  # It behaves exactly like `select` (which see)
  # save that it aborts processing as soon as
  # the first matching node is found, and returns
  # a single node (or `null` when nothing matches)
  # rather than an array of nodes.
  select_first:(dom,selector,callback)->
    if typeof selector is 'string'
      selector = @_parse_selectors(selector)
    if typeof dom is 'string'
      if callback?
        @dom_util.parse_html dom, (err, parsed)=>
          if err?
            callback(err)
          else
            callback(null,@_unguarded_select_first(parsed,selector))
      else
        throw new Error('When select_first is invoked on a string object, the `callback(err,node)` parameter is required.')
    else
      node = @_unguarded_select_first(dom,selector)
      callback?(null,node)
      return node

  # **_unguarded_select_first** is the "inner" method for `select_first`.
  # It returns the first node accepted by `predicate`, or `null`.
  # (Generally this method will not be directly called by clients.)
  _unguarded_select_first:(dom,predicate)->
    result = null
    visit = (node,parent,path,siblings,sib_index)->
      if predicate(node,parent,path,siblings,sib_index)
        result = node
        # Found it: ask the walker to stop.
        return { 'continue':false, 'visit_children':false }
      else
        return { 'continue':true, 'visit_children':true }
    @dom_util.walk_dom dom, visit:visit
    return result

  # **_SPLIT_ON_WS_REGEXP** is a regular expression that is
  # used to split a string of CSS selectors into individual
  # selectors.  It is similar to `str.split(/\s/)`, but:
  #
  #  - treats "quoted phrases" (and `/regular expressions/`) as a single token
  #  - also splits on the CSS "operators" of `>`, `+`, `,` and `~`
  #
  # (Shout-out to
  # http://stackoverflow.com/questions/2817646/javascript-split-string-on-space-or-on-quotes-to-array
  # from which this expression was originally derived.)
  _SPLIT_ON_WS_REGEXP = /([^\"\/\s,\+>]|(\"[^\"]+\")|(\/[^\/]+\/)|(\[[^\]]*\]))+|[,\+~>]/g

  # **_split_on_ws_respecting_quotes** is used to split a string of
  # CSS selectors into individual selectors.  It returns an array of
  # token strings (selectors and operators), in order.
  _split_on_ws_respecting_quotes:(selector)->
    result = []
    # The expression is global (and shared), so always scan from the start.
    _SPLIT_ON_WS_REGEXP.lastIndex = 0
    while (token = _SPLIT_ON_WS_REGEXP.exec(selector))?
      result.push(token[0])
    return result

  # **_parse_selectors** accepts a string containing one
  # or more CSS selectors (or an array of already-split tokens)
  # and returns the corresponding predicate: a boolean-valued
  # function that `_unguarded_select` invokes as
  # `predicate(node,parent,path,siblings,sib_index)`.
  #
  # When no selector tokens are found, an empty array is returned.
  #
  # NOTE(review): each operator (including `,`) is combined only with
  # the predicate immediately before it, so `a b, c d` is assembled as
  # the sequence `a`, `(b or c)`, `d` -- confirm that this is intended.
  _parse_selectors:(selectors)->
    result = []
    if typeof selectors is 'string'
      selectors = @_split_on_ws_respecting_quotes(selectors)
    # TODO there is probably a more elegant way to handle `>`, `+` and `,` here.
    child_operator = false
    adjacent_operator = false
    preceding_sibling_operator = false
    or_operator = false
    for selector in selectors
      if selector is '>'
        child_operator = true
      else if selector is '+'
        adjacent_operator = true
      else if selector is '~'
        preceding_sibling_operator = true
      else if selector is ','
        or_operator = true
      else
        predicate = @_parse_selector(selector)
        # A pending operator joins this predicate with the previous one.
        if child_operator
          result.push( @factory.direct_descendant_predicate( result.pop(), predicate ) )
          child_operator = false
        else if adjacent_operator
          result.push( @factory.adjacent_sibling_predicate( result.pop(), predicate ) )
          adjacent_operator = false
        else if preceding_sibling_operator
          result.push( @factory.preceding_sibling_predicate( result.pop(), predicate ) )
          preceding_sibling_operator = false
        else if or_operator
          result.push( @factory.or_predicate( [ result.pop(), predicate ] ) )
          or_operator = false
        else
          result.push( predicate )
    if result.length > 0
      result = @factory.descendant_predicate(result)
    return result

  # **_CSS_SELECTOR_REGEXP** is a regular expression for parsing an individual CSS selector
  # (which might include a tag name, an ID, one or more classes, one or more attributes and a pseudo class).
  #
  # `"tag#id.class-one.class-two[name~=\"value with spaces\"]".match(_CSS_SELECTOR_REGEXP)`
  #
  # Layout of the expression, left to right (capture-group numbers in parentheses):
  #
  #  - name (1): `*`, a word, or a `/regexp/`
  #  - id (4): `#` followed by a word or a `/regexp/`
  #  - classes (8): zero or more `.word` or `./regexp/` parts, concatenated
  #  - attribute clauses (13): zero or more `[name]` or `[name<op>value]` parts, concatenated
  #  - pseudo-class (34, name alone in 35): `:` followed by a word
  #
  # TODO: Combine the `id` and `class` rules to make them order-independent? (I think CSS specifies the order, but still.)
  _CSS_SELECTOR_REGEXP: /((\/[^\/]*\/[gmi]*)|(\*|[\w-]+))?(\#((\/[^\/]*\/[gmi]*)|([\w-]+)))?((\.((\/[^\/]*\/[gmi]*)|([\w-]+)))*)((\[((\/[^\/]*\/[gmi]*)|([\w-]+))(((=)|(~=)|(\|=)|(\*=)|(\^=)|(\$=))(("(([^\\"]|(\\"))*)")|((\/[^\/]*\/[gmi]*)|([\w- :]+))))?\])*)(:([\w-]+))?/

  # Indices of the important captured groups.
  _NAME = 1
  _ID = 4
  _CLASSES = 8
  _ATTRIBUTES = 13
  _PSEUDO_CLASS = 35

  # **_CLASS_TOKEN_REGEXP** matches one `.class` part of the class list
  # captured by `_CSS_SELECTOR_REGEXP`, using the same grammar as that
  # expression (a word or a `/regexp/`), so that a `.` inside a
  # `/regexp/` class name does not split the name in two.
  _CLASS_TOKEN_REGEXP = /\.((\/[^\/]*\/[gmi]*)|([\w-]+))/g

  # **_ATTRIBUTE_CLAUSE_REGEXP** is a regular expression used to
  # split one or more `[<name> <op> <value>]` expressions
  # into individual components.
  #
  # Layout of the expression (capture-group numbers in parentheses):
  #
  #  - the whole `[...]` clause (1)
  #  - attribute name (2): a word or a `/regexp/`
  #  - optional operator-and-value (5):
  #     - operator (6): one of `=`, `~=`, `|=`, `*=`, `^=`, `$=`
  #     - value (13): either a `"quoted string"` (contents, without the quotes, in 15)
  #       or an unquoted word or `/regexp/` (18)
  _ATTRIBUTE_CLAUSE_REGEXP: /(\[((\/[^\/]*\/[gmi]*)|([\w-]+))(((=)|(~=)|(\|=)|(\*=)|(\^=)|(\$=))(("(([^\\"]|(\\"))*)")|((\/[^\/]*\/[gmi]*)|([\w- :]+))))?\])/g

  # Indices of the important captured groups.
  _ATTR_NAME = 2
  _OPERATOR = 6
  _DEQUOTED_ATTR_VALUE = 15
  _NEVERQUOTED_ATTR_VALUE = 18

  # **_parse_selector** returns a (possibly compound) predicate
  # that matches the provided `selector` (string).
  #
  # When the selector yields no clauses at all, an empty array is returned.
  _parse_selector:(selector)->
    match = @_CSS_SELECTOR_REGEXP.exec(selector)
    clauses = []

    # The name part.
    if match[_NAME]?
      if match[_NAME] is '*'
        clauses.push(@factory.any_tag_predicate())
      else
        clauses.push(@factory.by_tag_predicate(@_to_string_or_regex(match[_NAME])))

    # The ID part (dropping the leading `#`).
    if match[_ID]?
      clauses.push(@factory.by_id_predicate(@_to_string_or_regex(match[_ID].substring(1))))

    # One or more class parts.
    if match[_CLASSES]?.length > 0
      # match[_CLASSES] contains something like `.foo.bar`; tokenize it
      # (rather than splitting on `.`) so `./a.b/` stays in one piece.
      for class_token in (match[_CLASSES].match(_CLASS_TOKEN_REGEXP) ? [])
        clauses.push(@factory.by_class_predicate(@_to_string_or_regex(class_token.substring(1))))

    # One or more attribute parts.
    if match[_ATTRIBUTES]?.length > 0
      # match[_ATTRIBUTES] contains one or more `[name=value]` (or `[name]`) strings.
      # The expression is global and shared via the prototype; reset it so
      # that a scan abandoned by an exception cannot affect this one.
      @_ATTRIBUTE_CLAUSE_REGEXP.lastIndex = 0
      while (attr_match = @_ATTRIBUTE_CLAUSE_REGEXP.exec(match[_ATTRIBUTES]))?
        attr_name = attr_match[_ATTR_NAME]
        operator = attr_match[_OPERATOR]
        raw_value = attr_match[_DEQUOTED_ATTR_VALUE] ? attr_match[_NEVERQUOTED_ATTR_VALUE]
        if attr_name?
          name_matcher = @_to_string_or_regex(attr_name)
          if not operator?
            # `[name]`
            clauses.push(@factory.by_attr_exists_predicate(name_matcher))
          else if raw_value?
            value_matcher = @_to_string_or_regex(raw_value)
            switch operator
              when '|='
                clauses.push(@factory.by_attr_value_pipe_equals(name_matcher,value_matcher))
              when '^=', '$=', '*='
                # starts with / ends with / contains
                clauses.push(@factory.by_attr_value_predicate(name_matcher,@_anchored_value_regexp(value_matcher,operator)))
              when '~='
                # match against the whitespace-delimited parts of the value
                clauses.push(@factory.by_attr_value_predicate(name_matcher,value_matcher,/\s+/))
              else
                # `=`
                clauses.push(@factory.by_attr_value_predicate(name_matcher,value_matcher,null))

    # The pseudo-class part. (Only `first-child` is recognized here.)
    if match[_PSEUDO_CLASS]?
      if match[_PSEUDO_CLASS] is 'first-child'
        clauses.push(@factory.first_child_predicate())

    # Combine them with `and` if needed.
    if clauses.length > 0
      clauses = @factory.and_predicate(clauses)
    return clauses

  # **_anchored_value_regexp** converts the `value` of a `^=` (starts with),
  # `$=` (ends with) or `*=` (contains) attribute clause into the regular
  # expression that implements the given `operator`.
  #
  #  - A string `value` is escaped (via the factory's `_escape_for_regexp`)
  #    and then anchored as the operator requires.
  #  - A regular expression `value` is returned as-is for `*=`.  For `^=`
  #    and `$=` it is wrapped in a non-capturing group before the anchor
  #    is added, so that the anchor applies to every alternative
  #    (`/foo|bar/` becomes `^(?:foo|bar)`, not `^foo|bar`).  The `i` and
  #    `m` flags are carried over.  The `g` flag is not, since it has no
  #    bearing on whether a value matches and would make repeated
  #    `test`/`exec` calls on the expression stateful.
  _anchored_value_regexp:(value,operator)->
    flags = ''
    if typeof value is 'string'
      source = @factory._escape_for_regexp(value)
    else
      return value if operator is '*='
      source = "(?:#{value.source})"
      flags += 'i' if value.ignoreCase
      flags += 'm' if value.multiline
    switch operator
      when '^=' then return new RegExp("^#{source}",flags)
      when '$=' then return new RegExp("#{source}$",flags)
      else return new RegExp(source,flags)

  # **_to_string_or_regex** converts a string that starts and ends with `/`
  # (with an optional `g`, `m` or `i` suffix) into a regular expression,
  # and otherwise returns the original `str` value.
  _to_string_or_regex:(str)->
    match = str.match /^\/(.*)\/([gmi]*)$/
    if match?[1]?
      return new RegExp(match[1],match[2])
    else
      return str

# Public API includes `Stew` and `DOMUtil`
exports = exports ? this
exports.Stew = Stew
exports.DOMUtil = DOMUtil