UNPKG

remarkup

Version:

HTML semantics and program logic separator

438 lines (367 loc) 12.9 kB
;(function() {
'use strict';

const $ = require('cheerio');
const munkres = require('munkres-js');
const levenshtein = require('levenshtein-sse');
const assert = require('assert');

/**
 * Internally used class for specifying sets of attributes.
 * Wraps around a list whose elements can be strings (e.g. "href"),
 * regexes (e.g. /^data-/) or functions (name, originalElement, element) => ...
 */
class AttributeSet {
  /**
   * @param {Array|string|RegExp|function} [initialElements]
   *   Matchers to start with; see {@link AttributeSet#add}.
   */
  constructor(initialElements) {
    this.functions = [];
    this.regexps = [];
    this.strings = [];

    this.add(initialElements || []);
  }

  /**
   * Adds one matcher or a list of matchers to the set.
   *
   * @param {Array|string|RegExp|function} otherElements
   *   A single matcher or an array of matchers. Strings are compared
   *   for equality, functions are called as predicates, and anything
   *   else must expose a <code>test(string)</code> method (e.g. a RegExp).
   *
   * @throws {TypeError} If a matcher is of none of the supported kinds.
   */
  add(otherElements) {
    const elements = Array.isArray(otherElements) ? otherElements : [otherElements];

    elements.forEach(e => {
      if (typeof e === 'string') {
        this.strings.push(e);
      } else if (typeof e === 'function') {
        this.functions.push(e);
      } else if (e && typeof e.test === 'function') {
        this.regexps.push(e);
      } else {
        // Fail here rather than with an obscure error inside test() later on.
        throw new TypeError('AttributeSet entries must be strings, functions or regular expressions');
      }
    });
  }

  /**
   * Checks whether an attribute name is matched by any entry of the set.
   *
   * @param {string} string  The attribute name.
   * @param {object} [element]  Passed to predicate functions as
   *   <code>this</code> and as their third argument.
   * @param {object} [originalElement]  Passed to predicate functions
   *   as their second argument.
   *
   * @return {boolean} Whether any string, regex or function matched.
   */
  test(string, element, originalElement) {
    if (this.strings.indexOf(string) !== -1) {
      return true;
    }

    for (const r of this.regexps) {
      // Global/sticky regexes remember lastIndex between calls, which
      // would make repeated tests of the same name alternate.
      if (r.global || r.sticky) {
        r.lastIndex = 0;
      }

      if (r.test(string)) {
        return true;
      }
    }

    for (const fn of this.functions) {
      if (fn.call(element, string, originalElement, element)) {
        return true;
      }
    }

    return false;
  }
}

/**
 * Provides methods for removing attributes from HTML fragments
 * and re-adding them later, possibly on a modified
 * (e.g. translated) HTML string.
 *
 * @param {object} [opt]
 *     Options for matching and modifying the HTML elements
 * @param {function[]} [opt.elementFilters]
 *     An array of callbacks for modifying the
 *     HTML elements (passed as cheerio-wrapped elements).
 *     The default is a {@link ReMarkup.defaultElementFilter}
 *     which preserves <code>id</code>, <code>remarkup-*</code> and
 *     <code>translate-*</code> attributes, as well as all
 *     semantically relevant attributes.
 *     (See {@link ReMarkup#semanticAttributes}).
 *     (For {@link ReMarkup#unMarkup}).
 * @param {function[]} [opt.additionalElementFilters]
 *     Like <code>elementFilters</code>, but appended in addition
 *     to the standard filters. Only used when
 *     <code>elementFilters</code> is not given.
 * @param {number} [opt.nonexistentChildDistance]
 *     The distance that will be used when a child
 *     element is present in the original tree
 *     but not the modified one or vice versa.
 *     The default value is 10.
 *     (For {@link ReMarkup#reMarkup}).
 * @param {function} [opt.rawElementMetric]
 *     A distance function for DOM HTML elements.
 *     The default is {@link ReMarkup.defaultRawElementMetric}.
 *
 * @constructor ReMarkup
 * @public
 */
class ReMarkup {
  constructor(opt) {
    opt = opt || {};

    this.keepAttributes = new AttributeSet(['id', /^(remarkup|translate)-.+$/]);
    this.keepAttributes.add(this.semanticAttributes());

    this.elementFilters = opt.elementFilters || [
      ReMarkup.defaultElementFilter(this.keepAttributes)
    ].concat(opt.additionalElementFilters || []);

    // An explicit 0 is honoured; any other falsy value falls back to 10.
    const childDistance = opt.nonexistentChildDistance;
    this.nonexistentChildDistance = childDistance === 0 ? 0 : (childDistance || 10);

    this.rawElementMetric = opt.rawElementMetric || ReMarkup.defaultRawElementMetric;
  }

  /**
   * Add a filter to the elementFilters list.
   *
   * @param {function} filter  The element filter.
   *
   * @public
   * @method ReMarkup#addElementFilter
   */
  addElementFilter(filter) {
    this.elementFilters.push(filter);
  }

  /**
   * List of semantically relevant HTML attributes.
   * These will be preserved by the default {@link ReMarkup#unMarkup}
   * element filters and ignored by the default {@link ReMarkup#reMarkup}
   * metric and its copy mechanism.
   *
   * @return {Array} Attribute names plus a predicate that matches
   *     <code>value</code> on elements whose <code>type</code> is
   *     <code>button</code> or <code>submit</code>.
   *
   * @private
   * @method ReMarkup#semanticAttributes
   */
  semanticAttributes() {
    return [
      'alt', 'label', 'placeholder', 'title', 'tooltip', 'data-info', 'popover',
      (name, element) => {
        return name === 'value' && element &&
          ['button', 'submit'].indexOf(element.attr('type')) !== -1;
      }
    ];
  }

  /**
   * Applies the list of element filters to a single element.
   *
   * @param {DOMElement} element  The target element.
   *
   * @private
   * @method ReMarkup#applyElementFilters
   */
  applyElementFilters(element) {
    for (const filter of this.elementFilters) {
      filter(element);
    }
  }

  /**
   * Recursively apply the element filters to an element and all its children.
   *
   * @param {DOMElement} element  The target element.
   *
   * @return {DOMElement} The original target element.
   *
   * @private
   * @method ReMarkup#unMarkupRecurse
   */
  unMarkupRecurse(element) {
    this.applyElementFilters(element);

    element.children().each((i, child) => {
      this.unMarkupRecurse($(child));
    });

    return element;
  }

  /**
   * Apply the element filters to an HTML fragment.
   *
   * @param {string} original  The target HTML fragment.
   *
   * @return {string} A modified HTML fragment.
   *
   * @public
   * @method ReMarkup#unMarkup
   */
  unMarkup(original) {
    const doc = $.load(original);
    const root = doc.root();

    this.unMarkupRecurse(root);

    return root.html();
  }

  /**
   * Re-adds attributes from an original HTML fragment
   * to a, possibly modified, one.
   *
   * Every original element is compared with every modified element,
   * the pairing with minimal total distance is computed and, for each
   * pair, the attributes not contained in <code>keepAttributes</code>
   * are copied from the original onto the modified element.
   *
   * @param {string} original  The original HTML fragment, including all attributes.
   * @param {string} modified  The target HTML fragment.
   *
   * @return {string} An HTML fragment, with the attributes from the original string
   *                  added to the modified one. If either fragment contains no
   *                  elements, <code>modified</code> is returned unchanged.
   *
   * @public
   * @method ReMarkup#reMarkup
   */
  reMarkup(original, modified) {
    const origDoc = $.load(original).root();
    const modDoc = $.load(modified).root();

    // convert lists of all elements to arrays so that indices work
    const origElements = Array.prototype.slice.call(origDoc.find('*'));
    const modElements = Array.prototype.slice.call(modDoc.find('*'));

    if (origElements.length === 0 || modElements.length === 0) {
      return modified;
    }

    // node -> index lookups, built once instead of scanning the arrays
    // for every pair that is compared
    const origIndex = new Map(origElements.map((node, i) => [node, i]));
    const modIndex = new Map(modElements.map((node, i) => [node, i]));

    const distanceMatrix = origElements.map(() => []);

    // compute the distance of an original and a modified element
    // and enter it into the distance matrix
    const computeElementDistance = (e1unwrapped, e2unwrapped) => {
      const e1i = origIndex.get(e1unwrapped);
      const e2i = modIndex.get(e2unwrapped);

      assert.notStrictEqual(e1i, undefined);
      assert.notStrictEqual(e2i, undefined);

      // do we already know the distance?
      if (typeof distanceMatrix[e1i][e2i] !== 'undefined') {
        return distanceMatrix[e1i][e2i];
      }

      const e1 = $(e1unwrapped);
      const e2 = $(e2unwrapped);

      let totalChildDistance = 0;
      const e1children = e1.children();
      const e2children = e2.children();

      if (e1children.length > 0 && e2children.length > 0) {
        // compute all distances between the children of the elements...
        const childMatrix = [];
        for (let i = 0; i < e1children.length; ++i) {
          childMatrix[i] = [];

          for (let j = 0; j < e2children.length; ++j) {
            childMatrix[i][j] = computeElementDistance(e1children[i], e2children[j]);
          }
        }

        // ... and find the minimal assignment between these
        const childAssignment = new munkres.Munkres().compute(childMatrix);

        for (const pair of childAssignment) {
          totalChildDistance += childMatrix[pair[0]][pair[1]];
        }
      }

      // add penalty for differing number of child elements
      totalChildDistance += Math.abs(e1children.length - e2children.length) *
        this.nonexistentChildDistance;

      // compare to the element that unMarkup produces from e1
      const e1_ = this.unMarkupRecurse(e1.clone());

      const rawElementDistance = this.rawElementMetric(
        e1_, e2,
        e1i, e2i,
        e1.parent().children().length,
        e2.parent().children().length);

      distanceMatrix[e1i][e2i] = totalChildDistance + rawElementDistance;
      return distanceMatrix[e1i][e2i];
    };

    for (let i = 0; i < origElements.length; ++i) {
      for (let j = 0; j < modElements.length; ++j) {
        computeElementDistance(origElements[i], modElements[j]);
      }
    }

    const assignment = new munkres.Munkres().compute(distanceMatrix);

    for (const pair of assignment) {
      copyAttributes(origElements[pair[0]], modElements[pair[1]], this.keepAttributes);
    }

    return modDoc.html();
  }
}

/**
 * An element filter for stripping whitespace after/before
 * tags and newlines and collapse multiple spaces into a single one.
 *
 * @param {DOMElement} cElement  The target element (cheerio-wrapped).
 *
 * @return {DOMElement} The first underlying node of the target element,
 *                      or <code>undefined</code> for an empty selection.
 *
 * @public
 * @function ReMarkup.stripSpaces
 */
ReMarkup.stripSpaces = function (cElement) {
  const element = cElement[0];

  if (!element || !element.children) {
    return element;
  }

  const children = element.children;
  const lastIndex = children.length - 1;

  for (let i = 0; i < children.length; ++i) {
    const node = children[i];

    if (node.type !== 'text') {
      continue;
    }

    // collapse multiple spaces
    /* only \t, \n, \r, space since other spaces (e.g. nbsp)
     * may carry some semantic meaning */
    let text = node.data.replace(/[\t\n\r ]+/g, ' ');

    // remove starting/ending whitespace
    if (i === 0) {
      text = text.replace(/^[\t\n\r ]+/, '');
    }

    if (i === lastIndex) {
      text = text.replace(/[\t\n\r ]+$/, '');
    }

    node.data = text;
  }

  return element;
};

/**
 * Creates a default element filter that removes most attributes.
 *
 * @param {AttributeSet} keepAttributes
 *     The set of attributes (anything exposing
 *     <code>test(name, element, originalElement)</code>) that
 *     element attributes are validated against.
 *
 * @return {function} An element filter that removes all attributes
 *                    but those specified as preserved.
 *
 * @public
 * @function ReMarkup.defaultElementFilter
 */
ReMarkup.defaultElementFilter = function (keepAttributes) {
  return element => {
    // Predicates get an untouched copy, so that their answer does not
    // depend on which attributes have already been removed.
    const originalElement = element.clone();

    Object.keys(element.attr() || {})
      .filter(attrName => !keepAttributes.test(attrName, element, originalElement))
      .forEach(attrName => element.removeAttr(attrName));
  };
};

/**
 * The default element metric.
 * This compares elements and their position in the DOM tree
 * and returns a number that indicates how un-similar the
 * given elements are.
 *
 * This returns a distance of 0 for elements which share the
 * same value for either of the <code>id</code>,
 * <code>translate-id</code> or <code>remarkup-id</code> attributes.
 *
 * Attributes in <code>this.keepAttributes</code> are not counted
 * as differences. When called without a {@link ReMarkup} instance as
 * <code>this</code>, no attribute is exempt.
 *
 * @param {DOMElement} e1  The (filtered) original element, cheerio-wrapped.
 * @param {DOMElement} e2  The modified element, cheerio-wrapped.
 * @param {number} e1i  Index of e1 in the list of all original elements.
 * @param {number} e2i  Index of e2 in the list of all modified elements.
 * @param {number} e1pl  Number of element children of e1's parent (currently unused).
 * @param {number} e2pl  Number of element children of e2's parent (currently unused).
 *
 * @return {number} The distance between the two elements.
 *
 * @public
 * @function ReMarkup.defaultRawElementMetric
 */
ReMarkup.defaultRawElementMetric = function (e1, e2, e1i, e2i, e1pl, e2pl) {
  const keepAttributes = (this && this.keepAttributes) || new AttributeSet();

  // attributes that lead to definite matching of elements
  const identAttr = ['id', 'translate-id', 'remarkup-id'];

  for (const attrName of identAttr) {
    const value1 = e1.attr(attrName);
    const value2 = e2.attr(attrName);

    if (typeof value1 !== 'undefined' &&
        typeof value2 !== 'undefined' &&
        value1 === value2) {
      return 0;
    }
  }

  let distance = 5; // minimum distance for elements with different IDs

  assert.ok(e1[0].tagName);
  assert.ok(e2[0].tagName);

  if (e1[0].tagName !== e2[0].tagName) {
    distance += 3;
  }

  const e1attribs = Object.keys(e1.attr() || {});
  const e2attribs = Object.keys(e2.attr() || {});

  // attributes that only e1 has
  for (const attrName of e1attribs) {
    if (e2attribs.indexOf(attrName) === -1 && !keepAttributes.test(attrName)) {
      distance++;
    }
  }

  // attributes that only e2 has, or that both have with different values
  for (const attrName of e2attribs) {
    if (keepAttributes.test(attrName)) {
      continue;
    }

    if (e1attribs.indexOf(attrName) === -1) {
      distance++;
    } else {
      const attrValue1 = e1.attr(attrName);
      const attrValue2 = e2.attr(attrName);

      if (attrValue1 !== attrValue2) {
        distance += 2 * Math.log(levenshtein(attrValue1, attrValue2));
      }
    }
  }

  const positionDistance = Math.abs(e1i - e2i);

  if (positionDistance > 0) {
    distance += 2 * Math.log(positionDistance) + 1;
  }

  return distance;
};

module.exports = ReMarkup;

/**
 * Copies all DOM attributes from src to dst, except the ignored ones.
 *
 * @param {object} src  The raw source node (read via <code>attribs</code>).
 * @param {object} dst  The raw destination node (written via <code>attribs</code>).
 * @param {AttributeSet|Array} [ignored]
 *     Attributes that are not copied. Anything that is not an
 *     AttributeSet is wrapped in one; the default is the empty set.
 */
function copyAttributes (src, dst, ignored) {
  const ignoredSet = ignored instanceof AttributeSet
    ? ignored
    : new AttributeSet(ignored || []);

  const srcAttribs = src.attribs || {};

  if (!dst.attribs) {
    dst.attribs = {};
  }

  for (const attrName of Object.keys(srcAttribs)) {
    if (!ignoredSet.test(attrName, $(dst), $(src))) {
      dst.attribs[attrName] = srcAttribs[attrName];
    }
  }
}

})();