stew-select
Version:
CSS selectors that allow regular expressions. Stew is a meatier soup.
334 lines (309 loc) • 16.7 kB
text/coffeescript
fs = require 'fs'
path = require 'path'
# Resolve the directory that holds the library modules. When a `lib-cov`
# directory exists next to `lib` it is used in its place.
HOMEDIR = path.join(__dirname,'..')
LIB_COV_DIR = path.join(HOMEDIR,'lib-cov')
LIB_DIR = if fs.existsSync(LIB_COV_DIR) then LIB_COV_DIR else path.join(HOMEDIR,'lib')
# Pull the two collaborating classes out of the resolved directory.
{DOMUtil} = require(path.join(LIB_DIR,'dom-util'))
{PredicateFactory} = require(path.join(LIB_DIR,'predicate-factory'))
# **Stew** is a DOM selection engine that
# supports the full CSS selector syntax
# as well as CSS selectors extended with
# regular expressions.
#
# Method names that start with `_` are subject
# to change without notice. Other methods may be
# considered a part of the public API.
class Stew

  # **The Stew constructor** accepts an optional `DOMUtil` instance
  # (allowing callers to configure the `DOMUtil` used by `Stew`).
  # When `dom_util` is `null` or `undefined` a new `DOMUtil` is created.
  constructor:(dom_util)->
    @factory = new PredicateFactory()
    @dom_util = dom_util ? new DOMUtil()

  # **select** selects nodes from the given `dom`
  # that match the given `selector`.
  #
  # If `selector` is a string, it will be parsed as
  # described in the README. Otherwise `selector`
  # is assumed to be a predicate function
  # (like those generated by `PredicateFactory`).
  #
  # If `dom` is a string, it will be parsed as HTML
  # (using `DOMUtil.parse_html`, which see). If `dom`
  # is a single node, the given `selector` will be
  # applied to it. If `dom` is an array of nodes,
  # the given `selector` will be applied to each
  # element in turn.
  #
  # This results in an array of matching nodes.
  #
  # If a `callback` is provided, the resulting array is
  # passed to it (assuming the signature
  # `callback(err,nodeset)`). Otherwise
  # the resulting array is returned by this function.
  #
  # Note that when `dom` is a string, a callback method
  # *must* be provided (since our HTML parsing
  # is asynchronous); an `Error` is thrown otherwise.
  # When `dom` is an object, the callback method is
  # optional (but will be used when present), and the
  # nodeset is returned in either case.
  select:(dom,selector,callback)->
    if typeof selector is 'string'
      selector = @_parse_selectors(selector)
    if typeof dom is 'string'
      if callback?
        @dom_util.parse_html dom, (err, dom)=>
          if err?
            callback(err)
          else
            callback(null,@_unguarded_select(dom,selector))
      else
        throw new Error('When select is invoked on a string object, the `callback(err,nodeset)` parameter is required.')
    else
      nodeset = @_unguarded_select(dom,selector)
      callback?(null,nodeset)
      return nodeset

  # **_unguarded_select** is the "inner" method
  # for `select`. It assumes `dom` is a node or
  # array of nodes and that `predicate` is a
  # predicate function. It returns an array of
  # matching nodes. (Generally this method
  # will not be directly called by clients.)
  _unguarded_select:(dom,predicate)->
    result = []
    # Collect every node accepted by `predicate`; always keep walking.
    visit = (node,parent,path,siblings,sib_index)->
      if predicate(node,parent,path,siblings,sib_index)
        result.push node
      return { 'continue':true, 'visit_children':true }
    @dom_util.walk_dom dom, visit:visit
    return result

  # **select_first** selects the first node in the
  # given `dom` that matches the given `selector`.
  #
  # It behaves exactly like `select` (which see)
  # save that it aborts processing as soon as
  # the first matching node is found, and returns
  # a single node rather than an array of nodes
  # (or `null` when nothing matches).
  select_first:(dom,selector,callback)->
    if typeof selector is 'string'
      selector = @_parse_selectors(selector)
    if typeof dom is 'string'
      if callback?
        @dom_util.parse_html dom, (err, dom)=>
          if err?
            callback(err)
          else
            callback(null,@_unguarded_select_first(dom,selector))
      else
        throw new Error('When select_first is invoked on a string object, the `callback(err,node)` parameter is required.')
    else
      node = @_unguarded_select_first(dom,selector)
      callback?(null,node)
      return node

  # **_unguarded_select_first** is the "inner" method for `select_first`.
  # It returns the first node accepted by `predicate`, or `null`.
  # (Generally this method will not be directly called by clients.)
  _unguarded_select_first:(dom,predicate)->
    result = null
    visit = (node,parent,path,siblings,sib_index)->
      if predicate(node,parent,path,siblings,sib_index)
        result = node
        # Found it: tell the walker to stop.
        return { 'continue':false, 'visit_children':false }
      else
        return { 'continue':true, 'visit_children':true }
    @dom_util.walk_dom dom, visit:visit
    return result

  # **_SPLIT_ON_WS_REGEXP** is a regular expression that is
  # used to split a string of CSS selectors into individual
  # selectors. It is similar to `str.split(/\s/)`, but:
  #  - treats "quoted phrases" (and `/regular expressions/`) as a single token
  #  - also splits on the CSS "operators" of `>`, `+`, `,` and `~`
  #    (`~` is only split off when it is not glued to neighbouring
  #    characters, since it also appears inside `[name~=value]`)
  # (Shout-out to
  # http://stackoverflow.com/questions/2817646/javascript-split-string-on-space-or-on-quotes-to-array
  # from which this expression was originally derived.)
  _SPLIT_ON_WS_REGEXP = /([^\"\/\s,\+>]|(\"[^\"]+\")|(\/[^\/]+\/)|(\[[^\]]*\]))+|[,\+~>]/g

  # **_split_on_ws_respecting_quotes** is used to split a string of
  # CSS selectors into individual selectors. It returns an array of
  # token strings (empty when nothing matches).
  _split_on_ws_respecting_quotes:(selector)->
    result = []
    # The regexp is global and shared by every call, so always start
    # scanning from the beginning of the given string.
    _SPLIT_ON_WS_REGEXP.lastIndex = 0
    while true
      token = _SPLIT_ON_WS_REGEXP.exec(selector)
      if token?[0]?
        result.push(token[0])
      else
        break
    return result

  # **_parse_selectors** accepts a string containing one
  # or more CSS selectors (or an array of already-split tokens)
  # and returns the corresponding predicate: a boolean-valued
  # function that `_unguarded_select` invokes as
  # `predicate(node,parent,path,siblings,sib_index)`.
  #
  # When there are no tokens at all, an empty array is returned.
  _parse_selectors:(selectors)->
    result = []
    if typeof selectors is 'string'
      selectors = @_split_on_ws_respecting_quotes(selectors)
    # Each flag records that the matching operator token was seen and
    # is waiting to be combined with the next simple selector.
    child_operator = false # TODO there is probably a more elegant way to handle `>`, `+` and `,` here.
    adjacent_operator = false
    preceding_sibling_operator = false
    or_operator = false
    for selector in selectors
      if selector is '>'
        child_operator = true
      else if selector is '+'
        adjacent_operator = true
      else if selector is '~'
        preceding_sibling_operator = true
      else if selector is ','
        or_operator = true
      else
        predicate = @_parse_selector(selector)
        # A pending operator binds the previously parsed predicate
        # (popped off `result`) to the one just parsed.
        if child_operator
          result.push( @factory.direct_descendant_predicate( result.pop(), predicate ) )
          child_operator = false
        else if adjacent_operator
          result.push( @factory.adjacent_sibling_predicate( result.pop(), predicate ) )
          adjacent_operator = false
        else if preceding_sibling_operator
          result.push( @factory.preceding_sibling_predicate( result.pop(), predicate ) )
          preceding_sibling_operator = false
        else if or_operator
          result.push( @factory.or_predicate( [ result.pop(), predicate ] ) )
          or_operator = false
        else
          result.push( predicate )
    # Whatever is left is a whitespace-separated (descendant) chain.
    if result.length > 0
      result = @factory.descendant_predicate(result)
    return result

  # **_CSS_SELECTOR_REGEXP** is a regular expression for parsing an individual CSS selector
  # (which might include a tag name, an ID, one or more classes, one or more attributes and a pseudo class).
  #
  # `"tag#id.class-one.class-two[name~=\"value with spaces\"]".match(_CSS_SELECTOR_REGEXP)`
  #
  # TODO: Combine the `id` and `class` rules to make them order-independent? (I think CSS specifies the order, but still.)
  #
  # Capture groups, in order of their opening parenthesis:
  #
  #    1      - tag name (`*`, a word, or a `/regexp/`)
  #    4      - `#id`, including the leading `#`
  #    8      - all class parts together (e.g. `.foo.bar`)
  #   13      - all attribute clauses together (`[..][..]...`)
  #   14      - a single attribute clause (`[...]`)
  #   15      - attribute name
  #   18      - operator and value
  #   19      - operator (`=`, `~=`, `|=`, `*=`, `^=` or `$=`)
  #   26      - value (quoted, `/regexp/` or bare)
  #   34 / 35 - pseudo-class with / without the leading `:`
  _CSS_SELECTOR_REGEXP: /((\/[^\/]*\/[gmi]*)|(\*|[\w-]+))?(\#((\/[^\/]*\/[gmi]*)|([\w-]+)))?((\.((\/[^\/]*\/[gmi]*)|([\w-]+)))*)((\[((\/[^\/]*\/[gmi]*)|([\w-]+))(((=)|(~=)|(\|=)|(\*=)|(\^=)|(\$=))(("(([^\\"]|(\\"))*)")|((\/[^\/]*\/[gmi]*)|([\w- :]+))))?\])*)(:([\w-]+))?/

  # Indices of the important captured groups.
  _NAME = 1
  _ID = 4
  _CLASSES = 8
  _ATTRIBUTES = 13
  _PSEUDO_CLASS = 35

  # **_ATTRIBUTE_CLAUSE_REGEXP** is a regular expression used to
  # split one or more `[<name> <op> <value>]` expressions
  # into individual components.
  #
  # Capture groups, in order of their opening parenthesis:
  #
  #    1 - the whole `[...]` clause
  #    2 - attribute name (a word or a `/regexp/`)
  #    5 - operator and value
  #    6 - operator (`=`, `~=`, `|=`, `*=`, `^=` or `$=`)
  #   13 - value
  #   15 - value that was written in double quotes (quotes removed)
  #   18 - value that was written without quotes (a `/regexp/` or bare text)
  _ATTRIBUTE_CLAUSE_REGEXP: /(\[((\/[^\/]*\/[gmi]*)|([\w-]+))(((=)|(~=)|(\|=)|(\*=)|(\^=)|(\$=))(("(([^\\"]|(\\"))*)")|((\/[^\/]*\/[gmi]*)|([\w- :]+))))?\])/g

  # Indices of the important captured groups.
  _ATTR_NAME = 2
  _OPERATOR = 6
  _DEQUOTED_ATTR_VALUE = 15
  _NEVERQUOTED_ATTR_VALUE = 18

  # **_parse_selector** returns a (possibly compound) predicate
  # that matches the provided `selector` (string).
  #
  # The individual clauses (tag, id, classes, attributes, pseudo-class)
  # are combined with `and`. When the selector yields no clauses at all,
  # an empty array is returned.
  _parse_selector:(selector)->
    # Every part of the expression is optional, so `exec` always
    # produces a match (possibly an empty one).
    match = @_CSS_SELECTOR_REGEXP.exec(selector)
    clauses = []
    # The name part.
    if match[_NAME]?
      if match[_NAME] is '*'
        clauses.push(@factory.any_tag_predicate())
      else
        clauses.push(@factory.by_tag_predicate(@_to_string_or_regex(match[_NAME])))
    # The ID part.
    if match[_ID]?
      # `substring(1)` drops the leading `#`.
      clauses.push(@factory.by_id_predicate(@_to_string_or_regex(match[_ID].substring(1))))
    # One or more class parts.
    if match[_CLASSES]?.length > 0      # match[_CLASSES] contains something like `.foo.bar`
      cs = match[_CLASSES].split('.')   # split the string into individual class names
      cs.shift()                        # and skip the first (empty) token that is included
      for c in cs
        clauses.push(@factory.by_class_predicate(@_to_string_or_regex(c)))
    # One or more attribute parts.
    if match[_ATTRIBUTES]?.length > 0   # match[_ATTRIBUTES] contains one or more `[name=value]` (or `[name]`) strings
      # The clause expression is global and shared through the prototype.
      # Rewind it so that an earlier parse that was cut short by an
      # exception cannot make this scan start mid-string.
      @_ATTRIBUTE_CLAUSE_REGEXP.lastIndex = 0
      attr_match = @_ATTRIBUTE_CLAUSE_REGEXP.exec(match[_ATTRIBUTES])
      while attr_match?
        clause = @_attribute_clause_predicate(attr_match)
        clauses.push(clause) if clause?
        attr_match = @_ATTRIBUTE_CLAUSE_REGEXP.exec(match[_ATTRIBUTES])
    # The pseudo-class part. (Only `first-child` is recognized.)
    if match[_PSEUDO_CLASS]?
      if match[_PSEUDO_CLASS] is 'first-child'
        clauses.push(@factory.first_child_predicate())
    # Combine them with `and` if needed.
    if clauses.length > 0
      clauses = @factory.and_predicate(clauses)
    return clauses

  # **_attribute_clause_predicate** converts a single match of
  # `_ATTRIBUTE_CLAUSE_REGEXP` into the corresponding predicate.
  # It returns `null` when the match carries nothing usable.
  _attribute_clause_predicate:(attr_match)->
    return null unless attr_match[_ATTR_NAME]?
    name = @_to_string_or_regex(attr_match[_ATTR_NAME])
    operator = attr_match[_OPERATOR]
    # `[name]` - the attribute merely has to exist.
    unless operator?
      return @factory.by_attr_exists_predicate(name)
    # A quoted value may legitimately be the empty string, hence `?` rather than `or`.
    raw_value = attr_match[_DEQUOTED_ATTR_VALUE] ? attr_match[_NEVERQUOTED_ATTR_VALUE]
    return null unless raw_value?
    value = @_to_string_or_regex(raw_value)
    switch operator
      when '|='
        return @factory.by_attr_value_pipe_equals(name,value)
      when '^=' # starts with
        return @factory.by_attr_value_predicate(name,@_anchored_value_regexp(value,'^',''))
      when '$=' # ends with
        return @factory.by_attr_value_predicate(name,@_anchored_value_regexp(value,'','$'))
      when '*=' # contains
        # A regular expression already means "contains"; only plain strings need converting.
        value = @_anchored_value_regexp(value,'','') if typeof value is 'string'
        return @factory.by_attr_value_predicate(name,value)
      when '~='
        # The attribute value is treated as a whitespace-delimited list.
        return @factory.by_attr_value_predicate(name,value,/\s+/)
      else
        return @factory.by_attr_value_predicate(name,value,null)

  # **_anchored_value_regexp** builds the regular expression used by the
  # `^=`, `$=` and `*=` operators. `value` is either a plain string (which
  # is escaped and matched literally) or a `RegExp` (whose `i`, `g` and `m`
  # flags are carried over). `prefix` and `suffix` are the anchors (`'^'`,
  # `'$'` or `''`) to place around it.
  _anchored_value_regexp:(value,prefix,suffix)->
    if typeof value is 'string'
      return new RegExp("#{prefix}#{@factory._escape_for_regexp(value)}#{suffix}")
    flags = ''
    flags += 'i' if value.ignoreCase
    flags += 'g' if value.global
    flags += 'm' if value.multiline
    # Wrap the source in a non-capturing group so that the anchor applies
    # to the whole expression (e.g. to both sides of `a|b`).
    return new RegExp("#{prefix}(?:#{value.source})#{suffix}",flags)

  # **_to_string_or_regex** converts a string that starts and ends with `/`
  # (with an optional `g`, `m` or `i` suffix) into a regular expression,
  # and otherwise returns the original `str` value.
  _to_string_or_regex:(str)->
    match = str.match /^\/(.*)\/([gmi]*)$/
    if match?[1]?
      return new RegExp(match[1],match[2])
    else
      return str
# Public API includes `Stew` and `DOMUtil`.
# Both are attached to `exports` when it is available, and to `this` otherwise.
public_api = exports ? this
public_api.Stew = Stew
public_api.DOMUtil = DOMUtil