UNPKG

node-webodf-ilkkah

Version:

WebODF - JavaScript Document Engine http://webodf.org/

github.com/vandernorth/WebODF

vandernorth/WebODF

200 lines (185 loc) • 8.72 kB

JavaScript

/**
 * Copyright (C) 2014 KO GmbH <copyright@kogmbh.com>
 *
 * @licstart
 * This file is part of WebODF.
 *
 * WebODF is free software: you can redistribute it and/or modify it
 * under the terms of the GNU Affero General Public License (GNU AGPL)
 * as published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * WebODF is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with WebODF.  If not, see <http://www.gnu.org/licenses/>.
 * @licend
 *
 * @source: http://www.webodf.org/
 * @source: https://github.com/kogmbh/WebODF/
 */

/*global Node, NodeFilter, core, runtime, odf*/

/**
 * A filter that allows a position if it is in front of a word, picture etc.
 *
 * Word boundaries are detected by the presence of punctuation as defined in the unicode standard.
 * The included UTF categories are:
 * - Pc (connector punctuation)
 * - Pd (dash punctuation)
 * - Pe (close punctuation)
 * - Pf (final punctuation)
 * - Pi (initial punctuation)
 * - Po (other punctuation)
 * - Ps (Open punctuation)
 *
 * In addition, the following ranges are also included as boundaries:
 * - 2000-206F (general punctuation)
 * - 2E00-2E7F (supplemental punctuation)
 * - 3000-303F (CJK symbols and punctuation)
 * - 12400-1247F (cuneiform numbers and punctuation)
 *
 * Some libraries and sites exist for assisting in creation of the regex. The one
 * used for this particular expression was http://apps.timwhitlock.info/js/regex
 * which is based on the cset javascript library
 * (http://inimino.org/~inimino/blog/javascript_cset).
 *
 *
 * @constructor
 * @implements {core.PositionFilter}
 * @param {!ops.OdtDocument} odtDocument
 * @param {!odf.WordBoundaryFilter.IncludeWhitespace} includeWhitespace Specify the type of whitespace to include within
 *  the word boundary. TRAILING causes the accepted position to be after the whitespace trailing a word, while LEADING
 *  causes the accepted position to be just after the word boundary (but before the trailing whitespace).
 */
odf.WordBoundaryFilter = function WordBoundaryFilter(odtDocument, includeWhitespace) {
    "use strict";
    var TEXT_NODE = Node.TEXT_NODE,
        ELEMENT_NODE = Node.ELEMENT_NODE,
        odfUtils = odf.OdfUtils,
        // Sourced from http://apps.timwhitlock.info/js/regex, including all punctuation components
        punctuation = /[!-#%-*,-\/:-;?-@\[-\]_{}¡«·»¿;·՚-՟։-֊־׀׃׆׳-״؉-؊،-؍؛؞-؟٪-٭۔܀-܍߷-߹।-॥॰෴๏๚-๛༄-༒༺-༽྅࿐-࿔၊-၏჻፡-፨᙭-᙮᚛-᚜᛫-᛭᜵-᜶។-៖៘-៚᠀-᠊᥄-᥅᧞-᧟᨞-᨟᭚-᭠᰻-᰿᱾-᱿\u2000-\u206e⁽-⁾₍-₎〈-〉❨-❵⟅-⟆⟦-⟯⦃-⦘⧘-⧛⧼-⧽⳹-⳼⳾-⳿⸀-\u2e7e\u3000-\u303f゠・꘍-꘏꙳꙾꡴-꡷꣎-꣏꤮-꤯꥟꩜-꩟﴾-﴿︐-︙︰-﹒﹔-﹡﹣﹨﹪-﹫！-＃％-＊，-／：-；？-＠［-］＿｛｝｟-･]|\ud800[\udd00-\udd01\udf9f\udfd0]|\ud802[\udd1f\udd3f\ude50-\ude58]|\ud809[\udc00-\udc7e]/,
        spacing = /\s/,
        /**@const*/
        FILTER_ACCEPT = core.PositionFilter.FilterResult.FILTER_ACCEPT,
        /**@const*/
        FILTER_REJECT = core.PositionFilter.FilterResult.FILTER_REJECT,
        /**@const*/
        TRAILING = odf.WordBoundaryFilter.IncludeWhitespace.TRAILING,
        /**@const*/
        LEADING = odf.WordBoundaryFilter.IncludeWhitespace.LEADING,
        /**
         * Classification of whatever sits directly to the left or right of a position.
         * @enum {number}
         */
        NeighborType = {
            NO_NEIGHBOUR: 0,
            SPACE_CHAR: 1,
            PUNCTUATION_CHAR: 2,
            WORD_CHAR: 3,
            OTHER: 4
        };

    /**
     * Returns the first filtered sibling encountered while travelling up the dom from node until
     * before the documentRoot - or null if none is found.
     *
     * At each level only the immediate sibling in the requested direction is offered to nodeFilter;
     * if it is not accepted the search continues from the parent node.
     *
     * @param {?Node} node
     * @param {!number} direction look for a left sibling when negative - for a right sibling otherwise
     * @param {!function(?Node):!number} nodeFilter
     * @return {?Node}
     */
    function findHigherNeighborNode(node, direction, nodeFilter) {
        var neighboringNode = null,
            rootNode = odtDocument.getRootNode(),
            unfilteredCandidate;

        while (node !== rootNode && node !== null && neighboringNode === null) {
            unfilteredCandidate = (direction < 0) ? node.previousSibling : node.nextSibling;
            if (nodeFilter(unfilteredCandidate) === NodeFilter.FILTER_ACCEPT) {
                neighboringNode = unfilteredCandidate;
            }
            node = node.parentNode;
        }

        return neighboringNode;
    }

    /**
     * Returns the complete character found at the given UTF-16 offset. When the offset lands on
     * one half of a surrogate pair, both halves are returned so that the supplementary-plane
     * alternatives of the punctuation expression are able to match. Lone surrogates, BMP
     * characters and out-of-range offsets (empty string) are returned unchanged.
     *
     * @param {!string} text
     * @param {!number} offset
     * @return {!string}
     */
    function characterAt(text, offset) {
        var unit = text.charAt(offset),
            code = unit.charCodeAt(0), // NaN for an out-of-range offset; all comparisons below are then false
            otherHalf;

        if (code >= 0xD800 && code <= 0xDBFF) {
            // high surrogate: complete it with the following low surrogate
            otherHalf = text.charCodeAt(offset + 1);
            if (otherHalf >= 0xDC00 && otherHalf <= 0xDFFF) {
                return unit + text.charAt(offset + 1);
            }
        } else if (code >= 0xDC00 && code <= 0xDFFF) {
            // low surrogate: complete it with the preceding high surrogate
            otherHalf = text.charCodeAt(offset - 1);
            if (otherHalf >= 0xD800 && otherHalf <= 0xDBFF) {
                return text.charAt(offset - 1) + unit;
            }
        }
        return unit;
    }

    /**
     * Classifies the neighbor of a position. Character elements count as spacing; text nodes,
     * text spans and hyperlinks are classified by the character at the supplied offset; any
     * other node is reported as OTHER.
     *
     * @param {?Node} node
     * @param {!function():!number} getOffset returns the offset inside the node
     * @return {!NeighborType}
     */
    function typeOfNeighbor(node, getOffset) {
        var neighboringChar;

        if (node === null) {
            return NeighborType.NO_NEIGHBOUR;
        }
        if (odfUtils.isCharacterElement(node)) {
            return NeighborType.SPACE_CHAR;
        }
        if (node.nodeType === TEXT_NODE || odfUtils.isTextSpan(node) || odfUtils.isHyperlink(node)) {
            neighboringChar = characterAt(node.textContent, getOffset());
            if (spacing.test(neighboringChar)) {
                return NeighborType.SPACE_CHAR;
            }
            if (punctuation.test(neighboringChar)) {
                return NeighborType.PUNCTUATION_CHAR;
            }
            return NeighborType.WORD_CHAR;
        }
        return NeighborType.OTHER;
    }

    /**
     * Accepts the iterator's current position unless it lies inside a word, between two
     * punctuation marks, or on the side of a whitespace that the configured includeWhitespace
     * mode excludes.
     *
     * @param {!core.PositionIterator} iterator
     * @return {!core.PositionFilter.FilterResult}
     */
    this.acceptPosition = function (iterator) {
        var container = iterator.container(),
            /**@type{Node}*/
            leftNode = iterator.leftNode(),
            rightNode = iterator.rightNode(),
            // For performance reasons, do not calculate the offset inside the dom until it is necessary.
            // Both getters call through the iterator so the method keeps its receiver.
            getRightCharOffset = function () { return iterator.unfilteredDomOffset(); },
            getLeftCharOffset = function () { return iterator.unfilteredDomOffset() - 1; },
            leftNeighborType,
            rightNeighborType;

        // If this could be the end of an element node, look for the neighboring node higher in the dom
        if (container.nodeType === ELEMENT_NODE) {
            if (rightNode === null) {
                rightNode = findHigherNeighborNode(container, 1, iterator.getNodeFilter());
            }
            if (leftNode === null) {
                leftNode = findHigherNeighborNode(container, -1, iterator.getNodeFilter());
            }
        }

        // If we don't stay inside the container node, the getOffset function needs to be modified so as to
        // return the offset of the characters just at the beginning/end of the respective neighboring node.
        if (container !== rightNode) {
            getRightCharOffset = function () { return 0; };
        }
        if (container !== leftNode && leftNode !== null) {
            getLeftCharOffset = function () { return leftNode.textContent.length - 1; };
        }

        leftNeighborType = typeOfNeighbor(leftNode, getLeftCharOffset);
        rightNeighborType = typeOfNeighbor(rightNode, getRightCharOffset);

        // Reject if: is between two usual characters (inside word) OR
        //            is between two punctuation marks OR
        //            (if including trailing space) is before a spacing and not behind the edge (word ending)
        //            (if excluding trailing space) is before an edge (word start) and not behind the spacing
        if ((leftNeighborType === NeighborType.WORD_CHAR && rightNeighborType === NeighborType.WORD_CHAR)
                || (leftNeighborType === NeighborType.PUNCTUATION_CHAR && rightNeighborType === NeighborType.PUNCTUATION_CHAR)
                || (includeWhitespace === TRAILING
                    && leftNeighborType !== NeighborType.NO_NEIGHBOUR && rightNeighborType === NeighborType.SPACE_CHAR)
                || (includeWhitespace === LEADING
                    && leftNeighborType === NeighborType.SPACE_CHAR && rightNeighborType !== NeighborType.NO_NEIGHBOUR)) {
            return FILTER_REJECT;
        }
        return FILTER_ACCEPT;
    };
};

/**
 * Type of whitespace to include within the word boundary
 * @enum {!number}
 */
odf.WordBoundaryFilter.IncludeWhitespace = {
    /**@const*/None: 0,
    /**@const*/TRAILING: 1,
    /**@const*/LEADING: 2
};