UNPKG

html-parser

Version:

HTML/XML parser with less explosions

github.com/tmont/html-parser

tmont/html-parser

544 lines (480 loc) • 14.4 kB

JavaScript

var parseContext = require('./context'); var fs = require('fs'); function readAttribute(context) { var name = context.readRegex(context.regex.attribute); var value = null, quote = ''; if (context.current === '=' || context.peekIgnoreWhitespace() === '=') { context.readRegex(/\s*=\s*/); var attributeValueRegex; switch (context.current) { case "'": attributeValueRegex = /('(\\'|<%.*?%>|[^'])*?')/; quote = "'"; break; case '"': attributeValueRegex = /("(\\"|<%.*?%>|[^"])*?")/; quote = '"'; break; case '<': attributeValueRegex = (context.peek() === '%') ? /(<%.*?%>)/ : /(.*?)(?=[\s><])/; break; default: attributeValueRegex = /(.*?)(?=[\s><])/; break; } var match = attributeValueRegex.exec(context.substring) || [0, '']; value = match[1]; context.read(match[0].length); if (value[0] === '"' || value[0] === "'") { value = value.substring(1); } if (value[value.length-1] === '"' || value[value.length-1] === "'") { value = value.substring(0, value.length-1); } } context.callbacks.attribute(name, value, quote); } function readAttributes(context, isXml) { function isClosingToken() { if (isXml) { return context.current === '?' && context.peek() === '>'; } return context.current === '>' || (context.current === '/' && context.peekIgnoreWhitespace() === '>'); } var next = context.current, handled; while (!context.isEof() && !isClosingToken()) { handled = false; if (context.current === '<') { for (var callbackName in context.regex.dataElements) { if (!context.regex.dataElements.hasOwnProperty(callbackName)) { continue; } var dataElement = context.regex.dataElements[callbackName], start = dataElement.start, isValid = false; switch (typeof start) { case 'string': isValid = context.substring.slice(0, start.length) === start; break; case 'object': isValid = start.test(context.substring); break; case 'function': isValid = start(context.substring) > -1; break; } if (isValid) { callbackText(context); context.callbacks[callbackName](parseDataElement(context, dataElement)); next = context.current; handled = true; break; } next = context.current; } } if (!handled) { if (context.regex.attribute.test(next)) { readAttribute(context); next = context.current; } else { next = context.read(); } } } } function readCloserForOpenedElement(context, name) { var emptyElements = { 'area': true, 'base': true, 'basefont': true, 'br': true, 'col': true, 'frame': true, 'hr': true, 'img': true, 'input': true, 'isindex': true, 'link': true, 'meta': true, 'param': true, 'embed': true }; var isUnary = name in emptyElements; if (context.current === '/') { //self closing tag "/>" context.readUntilNonWhitespace(); context.read(); context.callbacks.closeOpenedElement(name, '/>', isUnary); } else if (context.current === '?') { //xml closing "?>" context.read(2); context.callbacks.closeOpenedElement(name, '?>', isUnary); } else { //normal closing ">" context.read(); context.callbacks.closeOpenedElement(name, '>', isUnary); } } function parseOpenElement(context) { var name = context.readRegex(context.regex.name); context.callbacks.openElement(name); readAttributes(context, false); readCloserForOpenedElement(context, name); if (!/^(script|xmp|style)$/i.test(name)) { return; } //just read until the closing tags for elements that allow cdata var regex = new RegExp('^([\\s\\S]*?)(?:$|</(' + name + ')\\s*>)', 'i'); var match = regex.exec(context.substring); context.read(match[0].length); if (match[1]) { context.callbacks.cdata(match[1]); } if (match[2]) { context.callbacks.closeElement(match[2]); } } function parseEndElement(context) { var name = context.readRegex(context.regex.name); context.callbacks.closeElement(name); context.readRegex(/.*?(?:>|$)/); } function parseDataElement(context, dataElement) { var start = dataElement.start, data = dataElement.data, end = dataElement.end; switch (typeof start) { case 'string': start = start.length; break; case 'object': start = start.exec(context.substring); start = start[start.length - 1].length; break; case 'function': start = start(context.substring); break; } context.read(start); switch (typeof data) { case 'object': data = data.exec(context.substring); data = data[data.length - 1]; break; case 'function': data = data(context.substring); break; case 'undefined': var index = -1; switch (typeof end) { case 'string': index = context.substring.indexOf(end); break; case 'object': var match = context.substring.match(end); if (match) { match = match[match.length - 1]; index = context.substring.indexOf(match); } break; } data = index > -1 ? context.substring.slice(0, index) : context.substring; break; } context.read(data.length); switch (typeof end) { case 'string': end = end.length; break; case 'object': end = end.exec(context.substring); end = end[end.length - 1].length; break; case 'function': end = end(context.substring); break; } context.read(end); return data; } function parseXmlProlog(context) { //read "?xml" context.read(4); context.callbacks.xmlProlog(); readAttributes(context, true); readCloserForOpenedElement(context, '?xml'); } function appendText(value, context) { context.text += value; } function callbackText(context) { if (context.text) { context.callbacks.text(context.text); context.text = ''; } } function parseNext(context) { if (context.current === '<') { var next = context.substring.charAt(1); if (next === '/' && context.regex.name.test(context.substring.charAt(2))) { context.read(2); callbackText(context); parseEndElement(context); return; } else if (next === '?' && /^<\?xml/.test(context.substring)) { context.read(1); callbackText(context); parseXmlProlog(context); return; } else if (context.regex.name.test(next)) { context.read(1); callbackText(context); parseOpenElement(context); return; } } for (var callbackName in context.regex.dataElements) { if (!context.regex.dataElements.hasOwnProperty(callbackName)) { continue; } var dataElement = context.regex.dataElements[callbackName], start = dataElement.start, isValid = false; switch (typeof start) { case 'string': isValid = context.substring.slice(0, start.length) === start; break; case 'object': isValid = start.test(context.substring); break; case 'function': isValid = start(context.substring) > -1; break; } if (isValid) { callbackText(context); context.callbacks[callbackName](parseDataElement(context, dataElement)); return; } } appendText(context.current, context); context.read(); } /** * Parses the given string o' HTML, executing each callback when it * encounters a token. * * @param {String} htmlString A string o' HTML * @param {Object} [callbacks] Callbacks for each token * @param {Function} [callbacks.attribute] Takes the name of the attribute and its value * @param {Function} [callbacks.openElement] Takes the tag name of the element * @param {Function} [callbacks.closeOpenedElement] Takes the tag name of the element, the token used to * close it (">", "/>", "?>") and a boolean telling if it is unary or not (i.e., if it doesn't requires * another tag closing it later) * @param {Function} [callbacks.closeElement] Takes the name of the element * @param {Function} [callbacks.comment] Takes the content of the comment * @param {Function} [callbacks.docType] Takes the content of the document type declaration * @param {Function} [callbacks.cdata] Takes the content of the CDATA * @param {Function} [callbacks.xmlProlog] Takes no arguments * @param {Function} [callbacks.text] Takes the value of the text node * @param {Object} [regex] * @param {RegExp} [regex.name] Regex for element name. Default is [a-zA-Z_][\w:\-\.]* * @param {RegExp} [regex.attribute] Regex for attribute name. Default is [a-zA-Z_][\w:\-\.]* * @param {Object.<string,DataElementConfig>} [regex.dataElements] Config of data elements like docType, comment and your own custom data elements */ exports.parse = function(htmlString, callbacks, regex) { htmlString = htmlString.replace(/\r\n/g, '\n').replace(/\r/g, '\n'); var context = parseContext.create(htmlString, callbacks, regex); do { parseNext(context); } while (!context.isEof()); callbackText(context); }; /** * @typedef {Object} DataElementConfig * @property {String|RegExp|Function} start - start of data element, for example '<%' or /^<\?=/ or function(string){return string.slice(0, 2) === '<%' ? 2 : -1;} * @property {RegExp|Function} data - content of data element, for example /^[^\s]+/ or function(string){return string.match(/^[^\s]+/)[0];} * @property {String|RegExp|Function} end - end of data element, for example '%>' or /^\?>/ or function(string){return 2;} */ /** * Parses the HTML contained in the given file asynchronously. * * Note that this is merely a convenience function, it will still read the entire * contents of the file into memory. * * @param {String} fileName Name of the file to parse * @param {String} [encoding] Optional encoding to read the file in, defaults to utf8 * @param {Object} [callbacks] Callbacks to pass to parse() * @param {Function} [callback] */ exports.parseFile = function(fileName, encoding, callbacks, callback) { fs.readFile(fileName, encoding || 'utf8', function(err, contents) { if (err) { callback && callback(err); return; } exports.parse(contents, callbacks); callback && callback(); }); }; /** * Sanitizes an HTML string. * * If removalCallbacks is not given, it will simply reformat the HTML * (i.e. converting all tags to lowercase, etc.). Note that this function * assumes that the HTML is decently formatted and kind of valid. It * may exhibit undefined or unexpected behavior if your HTML is trash. * * @param {String} htmlString A string o' HTML * @param {Object} [removalCallbacks] Callbacks for each token type * @param {Function|Array} [removalCallbacks.attributes] Callback or array of specific attributes to strip * @param {Function|Array} [removalCallbacks.elements] Callback or array of specific elements to strip * @param {Function|Boolean} [removalCallbacks.comments] Callback or boolean indicating to strip comments * @param {Function|Boolean} [removalCallbacks.docTypes] Callback or boolean indicating to strip doc type declarations * @return {String} The sanitized HTML */ exports.sanitize = function(htmlString, removalCallbacks) { removalCallbacks = removalCallbacks || {}; function createArrayCallback(index) { var callbackOrArray = removalCallbacks[index] || []; if (typeof(callbackOrArray) === 'function') { return function() { return callbackOrArray.apply(null, arguments); } } else { return function(value) { return callbackOrArray.indexOf(value) !== -1; } } } function createBoolCallback(index) { var callbackOrBool = removalCallbacks[index] || false; if (typeof(callbackOrBool) === 'function') { return function() { return callbackOrBool.apply(null, arguments); } } else { return function() { return callbackOrBool; } } } function last(arr) { return arr[arr.length - 1]; } var toRemove = { attributes: createArrayCallback('attributes'), elements: createArrayCallback('elements'), comments: createBoolCallback('comments'), docTypes: createBoolCallback('docTypes') }; var sanitized = '', tagStack = []; var ignoreStack = []; var selfClosingTags = { meta: 1, br: 1, link: 1, area: 1, base: 1, col: 1, command: 1, embed: 1, hr: 1, img: 1, input: 1, param: 1, source: 1 }; var callbacks = { docType: function(value) { if (toRemove.docTypes(value)) { return; } sanitized += '<!doctype ' + value + '>'; }, openElement: function(name) { name = name.toLowerCase(); //if there is an unclosed self-closing tag in the stack, then //pop it off (assumed to be malformed html). if (tagStack.length) { var scope = last(tagStack); if (selfClosingTags[scope]) { tagStack.pop(); if (scope === last(ignoreStack)) { ignoreStack.pop(); } } } if (ignoreStack.length) { return; } tagStack.push(name); if (toRemove.elements(name)) { ignoreStack.push(name); return; } sanitized += '<' + name; }, closeOpenedElement: function(name, token) { name = name.toLowerCase(); if (token.length === 2) { //self closing var scope = tagStack.pop(); if (scope === last(ignoreStack)) { ignoreStack.pop(); } } if (ignoreStack.length || toRemove.elements(name)) { return; } sanitized += token; }, closeElement: function(name) { name = name.toLowerCase(); if (tagStack.length && last(tagStack) === name) { if (tagStack.pop() === last(ignoreStack)) { ignoreStack.pop(); } } if (ignoreStack.length || toRemove.elements(name)) { return; } sanitized += '</' + name + '>'; }, attribute: function(name, value, quote) { if (ignoreStack.length) { return; } name = name.toLowerCase(); if (toRemove.attributes(name, value)) { return; } sanitized += ' ' + name; if (value) { // reuse the existing quote style if possible sanitized += '=' + quote + ((quote === '"') ? value.replace(/"/g, '"') : value.replace(/'/g, ''')) + quote; } }, text: function(value) { if (ignoreStack.length) { return; } sanitized += value; }, comment: function(value) { if (ignoreStack.length || toRemove.comments(value)) { return; } sanitized += ''; }, cdata: function(value) { if (ignoreStack.length) { return; } for (var i = tagStack.length - 1; i >= 0; i--) { if (tagStack[i] === 'script' || tagStack[i] === 'xmp' || tagStack[i] === 'style') { sanitized += value; return; } } sanitized += '<![CDATA[' + value + ']]>'; } }; exports.parse(htmlString, callbacks); return sanitized; };