stew-select
Version:
CSS selectors that allow regular expressions. Stew is a meatier soup.
294 lines (280 loc) • 12 kB
text/coffeescript
# **DOMUtil** provides utility functions for working with DOM trees.
#
# DOMUtil is designed to work with the DOM structure generated by
# [Chris Winberry's node-htmlparser](https://github.com/tautologistics/node-htmlparser).
# DOMUtil doesn't have a strict dependency on node-htmlparser, but it
# expects DOM structures compatible with those generated by node-htmlparser.
# See [the htmlparser documentation](https://github.com/tautologistics/node-htmlparser#example-output)
# for more information about that format.
#
# (Currently DOMUtil should work with any DOM format that supports the `type`,
# `name`, `children`, `attribs` and `raw` attributes as used in htmlparser, but
# that may change in future versions.)
#
#
# Method names that start with `_` are subject to change without notice. Other methods may be considered a part of the public API.
class DOMUtil

  # **The DOMUtil constructor.**
  #
  # While the DOMUtil methods are essentially stateless (calls to the same
  # DOMUtil instance can be safely interleaved), DOMUtil is implemented as an
  # instantiable class to allow for alternative configurations.
  #
  # The constructor accepts an optional `params` map. Currently one parameter
  # key is supported:
  #
  #  - The value `params.decode` (optionally) specifies a function to use
  #    when converting HTML text nodes into "plain text" (in the `to_text` and
  #    `inner_text` functions). This method can be used, for example,
  #    to decode HTML entities into their text equivalents.
  #    By default no conversion is made; the text nodes are output in exactly
  #    the same format as they are found in the DOM.
  #
  constructor:(params = {})->
    @decode = params.decode ? (str)->str

  # **parse_html** is a convenience function that parses a given HTML string
  # into one or more DOM trees using the `htmlparser` library (if present).
  # If `htmlparser` is not available, the error raised by `require` is passed
  # to the `callback` function.
  #
  #  - The `html` parameter must be a string containing one or more HTML/XML trees.
  #
  #  - The `options` parameter is optional, and may contain a map of
  #    [options to pass to htmlparser](https://github.com/tautologistics/node-htmlparser/#defaulthandler-options).
  #
  #  - The `callback` parameter should contain a function with the signature
  #    `callback(err,dom)`, where:
  #
  #     - The `err` argument will be a non-`null` value if an error occurs
  #       during the parsing.
  #
  #     - Otherwise the `dom` argument will contain a single DOM object
  #       (when there is at most one root node in the given `html` string) or
  #       an array of DOM objects (when there is more than one HTML/XML
  #       structure in the given `html` string).
  #
  parse_html:(html,options,callback)->
    # The `options` parameter is optional, so swap `options` and `callback` if necessary.
    if typeof options is 'function' and typeof callback isnt 'function'
      [ options, callback ] = [ callback, options ]
    # If we haven't yet loaded `htmlparser`, do so now (the module is cached on the instance).
    unless @htmlparser?
      try
        @htmlparser = require 'htmlparser'
      catch err
        callback(err,null)
    if @htmlparser?
      # Now create a simple handler that invokes the given `callback`...
      handler = new @htmlparser.DefaultHandler (err,domset)->
        if err?
          callback(err,null)
        else if Array.isArray(domset) and domset.length <= 1
          # ...unwrapping a zero- or one-element result into a single value...
          callback(null,domset[0])
        else
          callback(null,domset)
      # ...create the parser...
      parser = new @htmlparser.Parser(handler,options)
      # ...and parse the HTML.
      parser.parseComplete(html)

  # **as_node** returns `nodeset[0]` if `nodeset` is an array, `nodeset` otherwise.
  as_node: (nodeset)->
    if Array.isArray(nodeset)
      return nodeset[0]
    else
      return nodeset

  # **as_nodeset** returns `node` if `node` is an array, `[ node ]` if it is
  # any other non-null value, and `[]` if it is `null` or `undefined`.
  as_nodeset: (node)->
    if Array.isArray(node)
      return node
    else if node?
      return [node]
    else
      return []

  # **_kt** returns `true`. It's the default filter for `to_text`.
  _kt: ()->true

  # **to_text** returns a concatenation of all text nodes found
  # within the given DOM `elt`. Each text node's `raw` value is passed
  # through the `decode` function configured in the constructor.
  #
  # An optional `filter` parameter may contain a function with
  # the signature `filter(node,node_metadata,all_metadata)` that returns
  # `true` if the text found in or beneath the given `node` should be
  # included in the concatenation or `false` if the text found at or
  # below the given `node` should be excluded.
  #
  # E.g., the function:
  #
  # ```javascript
  # var skip_em = function(node) { return node.name != 'em' };
  # ```
  #
  # will cause `to_text` to exclude any text found within an
  # `<em>` tag.
  #
  to_text:(elt,filter = @_kt)->
    buffer = ''
    @walk_dom elt, visit:(node,node_metadata,all_metadata)=>
      # If `node` is acceptable to `filter`, then append any text, and visit its children.
      if(filter(node,node_metadata,all_metadata))
        buffer += @decode(node.raw) if node?.type is 'text' and node?.raw?
        return {'continue':true,'visit_children':true}
      else
        # If `node` is *not* acceptable to `filter`, then skip it and its children.
        return {'continue':true,'visit_children':false}
    return buffer

  # **inner_text** is an alias for `to_text` (which see).
  inner_text:(elt,filter)->@to_text(elt,filter)

  # **to_html** returns an HTML string representation of
  # the given `elt` and its children (if any).
  #
  # Attribute values are written between double quotes; any `"` character
  # inside a value is emitted as `&quot;` so the generated markup stays
  # well-formed. No other escaping is applied.
  #
  # (Currently only `text` and `tag` node types are converted,
  # but that may change in the future.)
  to_html:(elt)->
    buffer = ''
    callbacks =
      # When `visit`ing a node...
      visit:(node)->
        switch node.type
          # ...concat the value of `text` nodes.
          when 'text'
            buffer += node.raw
          # ...concat the name and attributes of `tag` nodes.
          when 'tag'
            buffer += "<#{node.name}"
            if node.attribs?
              for name,value of node.attribs
                # A bare `"` would terminate the attribute early.
                escaped = String(value).replace(/"/g,'&quot;')
                buffer += " #{name}=\"#{escaped}\""
            buffer += ">"
        return true
      # `after_visit`ing a node...
      after_visit:(node)->
        switch node.type
          # ...concat the "end tag" for `tag` nodes.
          when 'tag'
            buffer += "</#{node.name}>"
        return true
    @walk_dom elt, callbacks
    return buffer

  # **inner_html** returns an HTML string representation of
  # the children (if any) of the given `elt`.
  #
  # Returns `null` when `elt` is a single node without a `children`
  # attribute (or is `null`), and a (possibly empty) string when `elt`
  # is an array.
  #
  # (Otherwise it behaves just like `to_html`, which see.)
  inner_html:(elt)->
    buffer = null
    # If `elt` is an array, invoke `to_html` on the children of each element in the array.
    if Array.isArray(elt)
      buffer = ''
      for node in elt
        if node.children?
          buffer += @to_html(node.children)
    # Otherwise invoke `to_html` on the children of `elt`.
    else if elt?.children?
      buffer = @to_html elt.children
    return buffer

  # **walk_dom** performs a depth-first walk of the given DOM tree (or trees),
  # invoking a specified "visit" function for each node.
  #
  # Note that each visited node is annotated in place with a
  # `_stew_node_id` attribute (its index in `all_metadata`).
  #
  #  * The `dom` parameter is either a single DOM node or an array of DOM nodes.
  #
  #  * The `callbacks` parameter is a map that contains (at minimum) an
  #    attribute named `visit` containing a function with the signature:
  #
  #        visit(node,node_metadata,all_metadata)
  #
  #    where:
  #
  #     - `node` is the DOM node currently being visited,
  #     - `node_metadata` is a map containing `parent`, `path`, `siblings`
  #       and `sib_index` keys, and
  #     - `all_metadata` is an array of `node_metadata` values
  #       for each previously visited node, indexed by the value
  #       stored at `node._stew_node_id`.
  #
  #    It may also contain an `after_visit` function (same signature) that
  #    is invoked once a node's children have been processed.
  #
  #  * The `callbacks.visit` function should return a map containing
  #    `continue` and `visit_children` attributes.
  #
  #     - When `visit_children` is `true`, the children of
  #       `node` (if any) will be visited next. When `false`,
  #       the `node`'s children will be skipped, but processing
  #       will continue with `node`'s siblings (or `node`'s
  #       parent's siblings, etc.)
  #
  #     - When `continue` is `false`, all subsequent processing
  #       will be aborted and the `walk_dom` method will exit
  #       as soon as possible.
  #
  #     - If the value returned by `visit` is a boolean, that
  #       value will be used for both `continue` and `visit_children`.
  #
  #     - A missing (`null`/`undefined`) return value or attribute is
  #       treated as `true`.
  #
  #  * If `callbacks` is a function (rather than a map) it will be
  #    used as the `visit` function.
  #
  walk_dom:(dom,callbacks)->
    # Fiddle with the input parameters if needed.
    if typeof callbacks is 'function'
      callbacks = { visit:callbacks }
    callbacks ?= {}
    nodes = @as_nodeset(dom)
    # Create a container for all the node metadata.
    dom_metadata = []
    for node, sib_index in nodes
      # Create the metadata for this node...
      node_metadata = { parent:null, path:[], siblings:nodes, sib_index: sib_index }
      node._stew_node_id = dom_metadata.length
      # ...add it to the container...
      dom_metadata.push node_metadata
      # ...visit the node...
      should_continue = @_unguarded_walk_dom(node,node_metadata,dom_metadata,callbacks)
      # ...and exit if needed.
      if not should_continue
        break

  # **_response_flag** interprets the value returned by a `visit` or
  # `after_visit` callback.
  #
  #  * A boolean `response` is used as-is (for every `key`).
  #  * Otherwise the flag is `true` when `response[key]` is exactly `true`
  #    or is missing (`null`/`undefined`), including when `response`
  #    itself is `null`/`undefined`; any other value yields `false`.
  #
  _response_flag:(response,key)->
    return response if typeof response is 'boolean'
    value = response?[key]
    return value is true or not value?

  # **_unguarded_walk_dom** is the "inner" implementation of `walk_dom`.
  # See `walk_dom` for more information.
  #
  #  * `node` is the current DOM node to visit.
  #  * `node_metadata` is a map containing:
  #     - `parent` - the parent of this node, if any
  #     - `path` - an array of this node's ancestors (from "root" to parent)
  #     - `siblings` - an array of this node's parent's children
  #     - `sib_index` - the index of this node in the `siblings` array
  #  * `dom_metadata` is an array of `node_metadata` objects, indexed by
  #    `node._stew_node_id`. Only the already visited nodes are contained
  #    in this array.
  #  * `callbacks` is the map of callbacks passed to `walk_dom`, which see.
  #
  # `_unguarded_walk_dom` will return `true` if processing should continue
  # (typically with `node`'s next sibling), or `false` if processing is
  # complete and no more nodes should be visited.
  #
  _unguarded_walk_dom:(node,node_metadata,dom_metadata,callbacks)->
    # Visit the current node.
    response = {'continue':true,'visit_children':true}
    if callbacks.visit?
      response = callbacks.visit(node,node_metadata,dom_metadata)
    # If processing should continue...
    if @_response_flag(response,'continue')
      # ...and this node's children should be processed...
      if node.children? and @_response_flag(response,'visit_children')
        # ...create the `path` to this `node`'s children...
        new_path = [].concat(node_metadata.path)
        new_path.push(node)
        # ...and recursively visit each child in turn...
        for child,index in node.children
          new_node_metadata = { parent:node, path:new_path, siblings:node.children, sib_index: index }
          child._stew_node_id = dom_metadata.length
          dom_metadata.push new_node_metadata
          should_continue = @_unguarded_walk_dom(child,new_node_metadata,dom_metadata,callbacks)
          # ...aborting further processing if needed.
          if not should_continue
            return false
      # ...invoke the post-visit callback, if any...
      if callbacks['after_visit']?
        response = callbacks.after_visit(node,node_metadata,dom_metadata)
        # ...aborting further processing if needed.
        return @_response_flag(response,'continue')
      else # no `after_visit` callback
        return true
    else # processing should not continue
      return false
# The DOMUtil class is exported under the name `DOMUtil`.
# The export target is the existing `exports` object when one is defined
# (non-null), and `this` otherwise.
exports = exports ? this
exports.DOMUtil = DOMUtil