/*
 * text-clipper
 * Fast and correct clip functions for HTML and plain text.
 * Listing metadata: JavaScript, 601 lines (600 loc), 24.2 kB; version not stated.
 */
"use strict";
// Define an `__esModule` property with the value `true` on the CommonJS
// `exports` object (presumably the marker transpilers emit so that importers
// read `exports.default` as the default export -- confirm against the build setup).
Object.defineProperty(exports, "__esModule", { value: true });
// Tag names of void elements. A void element has no inner content and is
// complete as soon as its tag ends, with or without a trailing slash
// (<br> and <br /> are equivalent).
var VOID_ELEMENTS = [
    "area", "base", "br", "col",
    "command", "embed", "hr", "img",
    "input", "keygen", "link", "meta",
    "param", "source", "track", "wbr",
];
// Tag names of block-level elements. A block element starts a new line where
// it is inserted, which also makes it a safe place to truncate.
var BLOCK_ELEMENTS = [
    // sectioning and grouping
    "address", "article", "aside", "blockquote", "canvas",
    // definition lists, generic containers, forms and figures
    "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form",
    // headings
    "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup",
    // rules, list items and page structure
    "hr", "li", "main", "nav", "noscript", "ol", "output",
    // text blocks
    "p", "pre", "section",
    // tables, lists and media
    "table", "tbody", "tfoot", "thead", "tr", "ul", "video",
];
// Tag names of unbreakable elements: such an element is never clipped in the
// middle; it is either kept verbatim or left out entirely.
var UNBREAKABLE_ELEMENTS = [
    "audio",
    "math",
    "svg",
    "video",
];
// UTF-16 code units of the characters the clipping code compares against.
// Each value is derived from the character itself; the number is noted alongside.
var NEWLINE_CHAR_CODE = "\n".charCodeAt(0); // 10
var EXCLAMATION_CHAR_CODE = "!".charCodeAt(0); // 33
var DOUBLE_QUOTE_CHAR_CODE = '"'.charCodeAt(0); // 34
var AMPERSAND_CHAR_CODE = "&".charCodeAt(0); // 38
var SINGLE_QUOTE_CHAR_CODE = "'".charCodeAt(0); // 39
var FORWARD_SLASH_CHAR_CODE = "/".charCodeAt(0); // 47
var SEMICOLON_CHAR_CODE = ";".charCodeAt(0); // 59
var TAG_OPEN_CHAR_CODE = "<".charCodeAt(0); // 60
var EQUAL_SIGN_CHAR_CODE = "=".charCodeAt(0); // 61
var TAG_CLOSE_CHAR_CODE = ">".charCodeAt(0); // 62
// Finds the next character that needs special handling: '<', '&', a newline,
// or a high surrogate (the first half of a surrogate pair).
var CHAR_OF_INTEREST_REGEX = /[<&\n\ud800-\udbff]/;
// Same as above, but newlines are treated as ordinary characters.
var CHAR_OF_INTEREST_NO_NEWLINE_REGEX = /[<&\ud800-\udbff]/;
// Matches every run of whitespace so it can be collapsed.
var SIMPLIFY_WHITESPACE_REGEX = /\s+/g;
/**
 * Clips a string to a maximum length. If the string exceeds the length, it is truncated and an
 * indicator (an ellipsis, by default) is appended.
 *
 * In detail, the clipping rules are as follows:
 * - The resulting clipped string may never contain more than maxLength characters. Examples:
 *   - clip("foo", 3) => "foo"
 *   - clip("foo", 2) => "f…"
 * - The indicator is inserted if and only if the string is clipped at any place other than a
 *   newline. Examples:
 *   - clip("foo bar", 5) => "foo …"
 *   - clip("foo\nbar", 5) => "foo"
 * - If the html option is true and valid HTML is passed in, the clipped output *must* also be
 *   valid HTML. If the input is not valid HTML, the result is unspecified (some errors might be
 *   detected and result in an exception, but this is not guaranteed).
 *
 * Options read by this module:
 * - html: when truthy, the input is clipped as HTML; otherwise as plain text.
 * - indicator: string appended when the text is clipped (default "\u2026").
 * - maxLines: maximum number of lines to keep (default Infinity).
 * - breakWords: when truthy, clipping may happen in the middle of a word.
 * - imageWeight (HTML only): number of characters an image or unbreakable element counts for
 *   (default 2).
 * - stripTags (HTML only): true to strip all tags, or a list of tag names to strip
 *   (default false).
 *
 * @param string The string to clip. Falsy input yields ""; other values are converted with
 *               toString().
 * @param maxLength The maximum length of the clipped string in number of characters.
 * @param options Optional options object. Both undefined and null mean "no options".
 *
 * @return The clipped string.
 */
function clip(string, maxLength, options) {
    // `== null` covers both an omitted argument and an explicit null, so callers
    // forwarding an absent options value do not crash on the property reads below.
    if (options == null) {
        options = {};
    }
    if (!string) {
        return "";
    }
    string = string.toString();
    return options.html
        ? clipHtml(string, maxLength, options)
        : clipPlainText(string, maxLength, options);
}
// Expose clip as the `default` property of the CommonJS exports object.
exports.default = clip;
/**
 * Clips an HTML string so that its text content stays within maxLength
 * characters and maxLines lines. Elements still open at the clip point are
 * closed again in the result.
 *
 * Counting rules implemented below:
 * - The indicator's length is reserved up front, so text plus indicator never
 *   exceeds maxLength.
 * - A character reference (e.g. &amp;) and a surrogate pair each count as one
 *   character.
 * - An <img> and a complete unbreakable element (audio, math, svg, video) each
 *   count as options.imageWeight characters; unbreakable elements are kept
 *   whole or dropped whole.
 * - Closing a block element, a <br>, and a newline each start a new line.
 * - Comments and CDATA sections are skipped without being counted.
 * - Whitespace directly inside ol/ul/table (outside li/td) is collapsed before
 *   it is counted.
 *
 * @param string The HTML to clip.
 * @param maxLength Maximum number of characters, including the indicator.
 * @param options Options object (must not be null). Reads imageWeight
 *                (default 2), indicator (default "\u2026"), maxLines (default
 *                Infinity), stripTags (default false; either a boolean or a
 *                collection queried with .includes(tagName)) and breakWords.
 *
 * @return The clipped HTML, or the (possibly tag-stripped) input if it fits.
 *
 * @throws Error with a message starting "Invalid HTML: " when a tag, comment
 *         or CDATA section is never terminated, or when an end tag does not
 *         match the element that is currently open.
 */
function clipHtml(string, maxLength, options) {
    var imageWeight = options.imageWeight === undefined ? 2 : options.imageWeight;
    var indicator = options.indicator === undefined ? "\u2026" : options.indicator;
    var maxLines = options.maxLines === undefined ? Infinity : options.maxLines;
    var stripTags = options.stripTags === undefined ? false : options.stripTags;
    // Reserve room for the indicator from the start.
    var numChars = indicator.length;
    var numLines = 1;
    var shouldStrip = typeof stripTags === "boolean"
        ? function () { return stripTags; }
        : function (tagName) { return stripTags.includes(tagName); };
    var tagStack = []; // Stack of currently open HTML tags.
    // Appends end tags for everything still open (innermost first), skipping
    // stripped tags. Empties the stack.
    var popTagStack = function (result) {
        var tagName;
        while ((tagName = tagStack.pop()) !== undefined) {
            if (!shouldStrip(tagName)) {
                result += "</" + tagName + ">";
            }
        }
        return result;
    };
    // True while some unbreakable element is among the currently open tags.
    var isInsideUnbreakableElement = function () {
        return UNBREAKABLE_ELEMENTS.some(function (name) {
            return tagStack.includes(name);
        });
    };
    var i = 0;
    // Index where the outermost open unbreakable element starts, or -1.
    var unbreakableElementIndex = -1;
    // NOTE: `string` is rewritten in place while stripping tags and simplifying
    // whitespace, and it can grow as well as shrink. Its length must therefore
    // be read fresh on every check; a length cached before the loop goes stale.
    for (; i < string.length; i++) {
        var rest = i ? string.slice(i) : string;
        var willSimplifyWhiteSpace = shouldSimplifyWhiteSpace(tagStack);
        var regex = unbreakableElementIndex > -1 || willSimplifyWhiteSpace
            ? CHAR_OF_INTEREST_NO_NEWLINE_REGEX
            : CHAR_OF_INTEREST_REGEX;
        // Plain text up to the next character of interest is handled as one block.
        var nextIndex = rest.search(regex);
        var nextBlockSize = nextIndex > -1 ? nextIndex : rest.length;
        if (unbreakableElementIndex === -1) {
            if (willSimplifyWhiteSpace) {
                var simplifiedBlock = simplifyWhiteSpace(nextBlockSize === rest.length ? rest : rest.slice(0, nextIndex));
                if (shouldStrip(tagStack[tagStack.length - 1])) {
                    // We want to strip whitespace, but we need to insert spaces if stripping the
                    // tags and whitespace together would otherwise inadvertently concatenate words:
                    var insertSpaceBefore = i > 0 && !isWhiteSpace(string.charCodeAt(i - 1));
                    var insertSpaceAfter = !isWhiteSpace(string.charCodeAt(i + nextBlockSize));
                    if (simplifiedBlock.length > 0) {
                        simplifiedBlock =
                            (insertSpaceBefore ? " " : "") +
                                simplifiedBlock +
                                (insertSpaceAfter ? " " : "");
                    }
                    else if (insertSpaceBefore && insertSpaceAfter) {
                        simplifiedBlock = " ";
                    }
                    string = string.slice(0, i) + simplifiedBlock + string.slice(i + nextBlockSize);
                    nextBlockSize = simplifiedBlock.length;
                }
                numChars += simplifiedBlock.length;
                if (numChars > maxLength) {
                    break;
                }
            }
            else {
                numChars += nextBlockSize;
                if (numChars > maxLength) {
                    // Step back into the block to the last character that still fits.
                    i = Math.max(i + nextBlockSize - numChars + maxLength, 0);
                    break;
                }
            }
        }
        i += nextBlockSize;
        if (nextIndex === -1) {
            break;
        }
        var charCode = string.charCodeAt(i);
        if (charCode === TAG_OPEN_CHAR_CODE) {
            var nextCharCode = string.charCodeAt(i + 1);
            var isSpecialTag = nextCharCode === EXCLAMATION_CHAR_CODE;
            if (isSpecialTag && string.substr(i + 2, 2) === "--") {
                var commentCloseIndex = string.indexOf("-->", i + 4);
                if (commentCloseIndex === -1) {
                    // Without this check `i` would jump backwards and the loop
                    // could rescan the same comment forever.
                    throw new Error("Invalid HTML: " + string);
                }
                // Land on the final '>' of the comment; the outer for loop will increment past it.
                i = commentCloseIndex + 2;
            }
            else if (isSpecialTag && string.substr(i + 2, 7) === "[CDATA[") {
                var cdataCloseIndex = string.indexOf("]]>", i + 9);
                if (cdataCloseIndex === -1) {
                    throw new Error("Invalid HTML: " + string);
                }
                // Land on the final '>' of the section; the outer for loop will increment past it.
                i = cdataCloseIndex + 2;
                // note we don't count CDATA text for our character limit because it is only
                // allowed within SVG and MathML content, both of which we don't clip
            }
            else {
                // don't open new tags if we are currently at the limit
                var isEndTag = nextCharCode === FORWARD_SLASH_CHAR_CODE;
                if (numChars === maxLength && !isEndTag) {
                    numChars++;
                    break;
                }
                // Scan for the '>' that ends this tag, skipping over attribute
                // values (which may themselves contain '>').
                var attributeQuoteCharCode = 0;
                var endIndex = i;
                var isAttributeValue = false;
                while (true /* eslint-disable-line */) {
                    endIndex++;
                    if (endIndex >= string.length) {
                        throw new Error("Invalid HTML: " + string);
                    }
                    var tagCharCode = string.charCodeAt(endIndex);
                    if (isAttributeValue) {
                        if (attributeQuoteCharCode) {
                            if (tagCharCode === attributeQuoteCharCode) {
                                isAttributeValue = false;
                            }
                        }
                        else {
                            if (isWhiteSpace(tagCharCode)) {
                                isAttributeValue = false;
                            }
                            else if (tagCharCode === TAG_CLOSE_CHAR_CODE) {
                                isAttributeValue = false;
                                endIndex--; // re-evaluate this character
                            }
                        }
                    }
                    else if (tagCharCode === EQUAL_SIGN_CHAR_CODE) {
                        while (isWhiteSpace(string.charCodeAt(endIndex + 1))) {
                            endIndex++; // skip whitespace
                        }
                        isAttributeValue = true;
                        var firstAttributeCharCode = string.charCodeAt(endIndex + 1);
                        if (firstAttributeCharCode === DOUBLE_QUOTE_CHAR_CODE ||
                            firstAttributeCharCode === SINGLE_QUOTE_CHAR_CODE) {
                            attributeQuoteCharCode = firstAttributeCharCode;
                            endIndex++;
                        }
                        else {
                            attributeQuoteCharCode = 0;
                        }
                    }
                    else if (tagCharCode === TAG_CLOSE_CHAR_CODE) {
                        var tagNameStartIndex = i + (isEndTag ? 2 : 1);
                        var tagNameEndIndex = Math.min(indexOfWhiteSpace(string, tagNameStartIndex), endIndex);
                        var tagName = string
                            .slice(tagNameStartIndex, tagNameEndIndex)
                            .toLowerCase();
                        if (tagName.charCodeAt(tagName.length - 1) === FORWARD_SLASH_CHAR_CODE) {
                            // Remove trailing slash for self-closing tag names like <br/>
                            tagName = tagName.slice(0, tagName.length - 1);
                        }
                        var strip = shouldStrip(tagName);
                        if (isEndTag) {
                            var currentTagName = tagStack.pop();
                            if (currentTagName !== tagName) {
                                throw new Error("Invalid HTML: " + string);
                            }
                            if (UNBREAKABLE_ELEMENTS.includes(tagName)) {
                                if (isInsideUnbreakableElement()) {
                                    // It's a nested unbreakable element.
                                }
                                else if (strip) {
                                    // Rewind to the element's start so the whole
                                    // element is cut out of the string below.
                                    i = unbreakableElementIndex;
                                    unbreakableElementIndex = -1;
                                }
                                else {
                                    unbreakableElementIndex = -1;
                                    numChars += imageWeight;
                                    if (numChars > maxLength) {
                                        break;
                                    }
                                }
                            }
                            // Block level elements should trigger a new line, unless stripped or
                            // part of unbreakable content.
                            var isBlockElement = BLOCK_ELEMENTS.includes(tagName);
                            if (isBlockElement && unbreakableElementIndex === -1 && !strip) {
                                numLines++;
                                if (numLines > maxLines) {
                                    // If we exceed the max lines, push the tag back onto the
                                    // stack so that it will be added back correctly after
                                    // truncation.
                                    tagStack.push(tagName);
                                    break;
                                }
                            }
                        }
                        else if (VOID_ELEMENTS.includes(tagName) ||
                            string.charCodeAt(endIndex - 1) === FORWARD_SLASH_CHAR_CODE) {
                            if (strip) {
                                // Stripped elements aren't counted towards anything.
                            }
                            else if (tagName === "br") {
                                numLines++;
                                if (numLines > maxLines) {
                                    break;
                                }
                            }
                            else if (tagName === "img") {
                                numChars += imageWeight;
                                if (numChars > maxLength) {
                                    break;
                                }
                            }
                        }
                        else {
                            if (isInsideUnbreakableElement()) {
                                // It's a nested unbreakable element.
                            }
                            else if (UNBREAKABLE_ELEMENTS.includes(tagName)) {
                                unbreakableElementIndex = i;
                            }
                            tagStack.push(tagName);
                        }
                        if (strip && unbreakableElementIndex === -1) {
                            string = string.slice(0, i) + string.slice(endIndex + 1);
                            i--; // Re-evaluate this index, because its contents changed.
                        }
                        else {
                            i = endIndex;
                        }
                        break;
                    }
                }
                // A limit hit while handling the tag left the scan loop early;
                // leave the main loop as well, with `i` still at the tag start.
                if (numChars > maxLength || numLines > maxLines) {
                    break;
                }
            }
        }
        else if (charCode === AMPERSAND_CHAR_CODE) {
            // Find out whether this '&' starts a character reference ending in ';'.
            var referenceEndIndex = i + 1;
            var isCharacterReference = true;
            while (true /* eslint-disable-line */) {
                var referenceCharCode = string.charCodeAt(referenceEndIndex);
                if (isCharacterReferenceCharacter(referenceCharCode)) {
                    referenceEndIndex++;
                }
                else if (referenceCharCode === SEMICOLON_CHAR_CODE) {
                    break;
                }
                else {
                    isCharacterReference = false;
                    break;
                }
            }
            if (unbreakableElementIndex === -1) {
                numChars++;
                if (numChars > maxLength) {
                    break;
                }
            }
            if (isCharacterReference) {
                i = referenceEndIndex;
            }
        }
        else if (charCode === NEWLINE_CHAR_CODE) {
            numChars++;
            if (numChars > maxLength) {
                break;
            }
            numLines++;
            if (numLines > maxLines) {
                break;
            }
        }
        else {
            if (unbreakableElementIndex === -1) {
                numChars++;
                if (numChars > maxLength) {
                    break;
                }
            }
            if ((charCode & 0xfc00) === 0xd800) {
                // high Unicode surrogate should never be separated from its matching low surrogate
                var lowSurrogateCharCode = string.charCodeAt(i + 1);
                if ((lowSurrogateCharCode & 0xfc00) === 0xdc00) {
                    i++;
                }
            }
        }
    }
    if (numChars > maxLength) {
        var nextChar = takeHtmlCharAt(string, i);
        if (indicator) {
            // Look past the next character and any end tags that follow it.
            var peekIndex = i + nextChar.length;
            while (string.charCodeAt(peekIndex) === TAG_OPEN_CHAR_CODE &&
                string.charCodeAt(peekIndex + 1) === FORWARD_SLASH_CHAR_CODE) {
                var nextPeekIndex = string.indexOf(">", peekIndex + 2) + 1;
                if (nextPeekIndex) {
                    peekIndex = nextPeekIndex;
                }
                else {
                    break;
                }
            }
            if (peekIndex && (peekIndex === string.length || isLineBreak(string, peekIndex))) {
                // if there's only a single character remaining in the input string, or the next
                // character is followed by a line-break, we can include it instead of the clipping
                // indicator (provided it's not a special HTML character)
                i += nextChar.length;
                nextChar = string.charAt(i);
            }
        }
        // include closing tags before adding the clipping indicator if that's where they
        // are in the input string
        while (nextChar === "<" && string.charCodeAt(i + 1) === FORWARD_SLASH_CHAR_CODE) {
            var closingTagName = tagStack.pop();
            if (!closingTagName) {
                break;
            }
            var tagEndIndex = string.indexOf(">", i + 2);
            // Names on the stack were lower-cased when pushed, so the text of
            // the end tag has to be lower-cased too before comparing.
            if (tagEndIndex === -1 ||
                string.slice(i + 2, tagEndIndex).trim().toLowerCase() !== closingTagName) {
                throw new Error("Invalid HTML: " + string);
            }
            if (shouldStrip(closingTagName)) {
                string = string.slice(0, i) + string.slice(tagEndIndex + 1);
            }
            else {
                i = tagEndIndex + 1;
            }
            nextChar = string.charAt(i);
        }
        if (i < string.length) {
            if (!options.breakWords) {
                // try to clip at word boundaries, if desired
                for (var j = i - indicator.length; j >= 0; j--) {
                    var boundaryCharCode = string.charCodeAt(j);
                    if (boundaryCharCode === TAG_CLOSE_CHAR_CODE || boundaryCharCode === SEMICOLON_CHAR_CODE) {
                        // these characters could be just regular characters, so if they occur in
                        // the middle of a word, they would "break" our attempt to prevent breaking
                        // of words, but given this seems highly unlikely and the alternative is
                        // doing another full parsing of the preceding text, this seems acceptable.
                        break;
                    }
                    else if (boundaryCharCode === NEWLINE_CHAR_CODE || boundaryCharCode === TAG_OPEN_CHAR_CODE) {
                        i = j;
                        break;
                    }
                    else if (isWhiteSpace(boundaryCharCode)) {
                        i = j + (indicator ? 1 : 0);
                        break;
                    }
                }
            }
            var result = string.slice(0, i);
            if (!isLineBreak(string, i)) {
                result += indicator;
            }
            return popTagStack(result);
        }
    }
    else if (numLines > maxLines) {
        return popTagStack(string.slice(0, i));
    }
    return string;
}
/**
 * Clips plain text to at most maxLength characters and options.maxLines lines.
 *
 * A surrogate pair counts as one character and is never split. When the text
 * has to be cut, the indicator is appended unless the cut falls on a newline;
 * unless options.breakWords is truthy, the cut is moved back to the preceding
 * whitespace when there is one within reach.
 *
 * @param string The text to clip.
 * @param maxLength Maximum number of characters, including the indicator.
 * @param options Options object (must not be null). Reads indicator (default
 *                "\u2026"), maxLines (default Infinity) and breakWords.
 *
 * @return The clipped text, or the input itself when it already fits.
 */
function clipPlainText(string, maxLength, options) {
    var indicator = options.indicator === undefined ? "\u2026" : options.indicator;
    var maxLines = options.maxLines === undefined ? Infinity : options.maxLines;
    // Text that already satisfies both limits is returned untouched. The scan
    // below reserves room for the indicator and would otherwise clip such text
    // whenever the indicator is longer than one character.
    if (plainTextFits(string, maxLength, maxLines)) {
        return string;
    }
    // Reserve room for the indicator from the start.
    var numChars = indicator.length;
    var numLines = 1;
    var i = 0;
    var length = string.length;
    for (; i < length; i++) {
        numChars++;
        if (numChars > maxLength) {
            break;
        }
        var charCode = string.charCodeAt(i);
        if (charCode === NEWLINE_CHAR_CODE) {
            numLines++;
            if (numLines > maxLines) {
                break;
            }
        }
        else if ((charCode & 0xfc00) === 0xd800) {
            // high Unicode surrogate should never be separated from its matching low surrogate
            var nextCharCode = string.charCodeAt(i + 1);
            if ((nextCharCode & 0xfc00) === 0xdc00) {
                i++;
            }
        }
    }
    if (numChars > maxLength) {
        var nextChar = takeCharAt(string, i);
        if (indicator) {
            // The next character may take the indicator's place when it is the
            // last one, or when a newline follows it directly.
            var peekIndex = i + nextChar.length;
            if (peekIndex === string.length) {
                return string;
            }
            else if (string.charCodeAt(peekIndex) === NEWLINE_CHAR_CODE) {
                return string.slice(0, i + nextChar.length);
            }
        }
        if (!options.breakWords) {
            // try to clip at word boundaries, if desired
            for (var j = i - indicator.length; j >= 0; j--) {
                var boundaryCharCode = string.charCodeAt(j);
                if (boundaryCharCode === NEWLINE_CHAR_CODE) {
                    i = j;
                    nextChar = "\n";
                    break;
                }
                else if (isWhiteSpace(boundaryCharCode)) {
                    i = j + (indicator ? 1 : 0);
                    break;
                }
            }
        }
        return string.slice(0, i) + (nextChar === "\n" ? "" : indicator);
    }
    else if (numLines > maxLines) {
        return string.slice(0, i);
    }
    return string;
}
// Reports whether `string` stays within maxLength characters (a surrogate pair
// counting as one) and maxLines lines without any clipping.
function plainTextFits(string, maxLength, maxLines) {
    var numChars = 0;
    var numLines = 1;
    for (var i = 0; i < string.length; i++) {
        numChars++;
        if (numChars > maxLength) {
            return false;
        }
        var charCode = string.charCodeAt(i);
        if (charCode === NEWLINE_CHAR_CODE) {
            numLines++;
            if (numLines > maxLines) {
                return false;
            }
        }
        else if ((charCode & 0xfc00) === 0xd800 &&
            (string.charCodeAt(i + 1) & 0xfc00) === 0xdc00) {
            i++; // skip the low surrogate of the pair
        }
    }
    return true;
}
/**
 * Finds the first whitespace character at or after fromIndex.
 *
 * @return The index of that character, or string.length when there is none.
 *         Returning the length instead of -1 lets callers combine the result
 *         with Math.min() directly.
 */
function indexOfWhiteSpace(string, fromIndex) {
    var size = string.length;
    var cursor = fromIndex;
    while (cursor < size) {
        if (isWhiteSpace(string.charCodeAt(cursor))) {
            return cursor;
        }
        cursor += 1;
    }
    return size;
}
/**
 * Tells whether a character may appear between the '&' and the ';' of an HTML
 * character reference.
 *
 * Accepts ASCII digits and letters (named references such as &amp;) and '#',
 * which introduces numeric references such as &#39; and &#x27;. Without '#',
 * numeric references are not recognised as a unit and could be cut in half.
 *
 * @param charCode UTF-16 code unit to test (NaN, as returned by charCodeAt()
 *                 past the end of a string, yields false).
 * @return true if the character can be part of a character reference.
 */
function isCharacterReferenceCharacter(charCode) {
    return (charCode === 35 || // '#'
        (charCode >= 48 && charCode <= 57) || // '0'-'9'
        (charCode >= 65 && charCode <= 90) || // 'A'-'Z'
        (charCode >= 97 && charCode <= 122)); // 'a'-'z'
}
/**
 * Tells whether a line break starts at the given index: either a newline
 * character, or the start tag of a block element or of <br>.
 *
 * The tag is recognised by its name (case-insensitively) followed by
 * whitespace, '/' or '>', so start tags carrying attributes, such as
 * <p class="intro">, count as well. End tags never count.
 *
 * @param string The text to inspect.
 * @param index Index of the character to test.
 * @return true if a line break starts at index.
 */
function isLineBreak(string, index) {
    var firstCharCode = string.charCodeAt(index);
    if (firstCharCode === NEWLINE_CHAR_CODE) {
        return true;
    }
    if (firstCharCode !== TAG_OPEN_CHAR_CODE) {
        return false;
    }
    var newlineElements = "(?:" + BLOCK_ELEMENTS.join("|") + "|br)";
    // The lookahead pins the end of the tag name, so "<p" does not match
    // "<progress>" while "<p>", "<p/>", "<p >" and "<p class=...>" all do.
    var newlineRegExp = new RegExp("^<" + newlineElements + "(?=[\\t\\n\\f\\r />])", "i");
    return newlineRegExp.test(string.slice(index));
}
/**
 * Tells whether a character code is one of the whitespace characters this
 * module recognises: tab, line feed, form feed, carriage return or space.
 */
function isWhiteSpace(charCode) {
    switch (charCode) {
        case 9: // tab
        case 10: // line feed
        case 12: // form feed
        case 13: // carriage return
        case 32: // space
            return true;
        default:
            return false;
    }
}
/**
 * Certain tags don't display their whitespace-only content. In such cases, we
 * should simplify the whitespace before counting it.
 *
 * Walks the open tags from the innermost outwards. Text inside a list item or
 * a table cell/caption (li, td, th, caption) is real content, so the answer is
 * false. Text sitting directly inside a list or table (ol, ul, table, or their
 * structural children such as tr) is not displayed, so the answer is true.
 *
 * @param tagStack Names of the currently open tags, outermost first.
 * @return true if whitespace at this position should be simplified.
 */
function shouldSimplifyWhiteSpace(tagStack) {
    for (var i = tagStack.length - 1; i >= 0; i--) {
        var tagName = tagStack[i];
        if (tagName === "li" || tagName === "td" || tagName === "th" || tagName === "caption") {
            return false;
        }
        if (tagName === "ol" || tagName === "table" || tagName === "ul") {
            return true;
        }
    }
    return false;
}
/**
 * Removes leading and trailing whitespace and collapses every inner run of
 * whitespace into a single space.
 */
function simplifyWhiteSpace(string) {
    var trimmed = string.trim();
    // Cutting at each whitespace run and re-joining with one space collapses the runs.
    var pieces = trimmed.split(SIMPLIFY_WHITESPACE_REGEX);
    return pieces.join(" ");
}
/**
 * Returns the character starting at index. A high surrogate directly followed
 * by a low surrogate is returned as one two-unit string, so a surrogate pair
 * is never split; anything else is returned as a single code unit.
 */
function takeCharAt(string, index) {
    var leadCode = string.charCodeAt(index);
    var isHighSurrogate = (leadCode & 0xfc00) === 0xd800;
    if (!isHighSurrogate) {
        return String.fromCharCode(leadCode);
    }
    var trailCode = string.charCodeAt(index + 1);
    var isLowSurrogate = (trailCode & 0xfc00) === 0xdc00;
    return isLowSurrogate
        ? String.fromCharCode(leadCode, trailCode)
        : String.fromCharCode(leadCode);
}
/**
 * Returns the character starting at index, treating a character reference as
 * one character: for an '&', the reference characters that follow it are
 * included, together with the terminating ';' when there is one.
 */
function takeHtmlCharAt(string, index) {
    var taken = takeCharAt(string, index);
    if (taken !== "&") {
        return taken;
    }
    // Collect the body of the reference.
    var cursor = index + 1;
    var code = string.charCodeAt(cursor);
    while (isCharacterReferenceCharacter(code)) {
        taken += String.fromCharCode(code);
        cursor += 1;
        code = string.charCodeAt(cursor);
    }
    // Include the terminator if the body is followed by one.
    if (code === SEMICOLON_CHAR_CODE) {
        taken += String.fromCharCode(code);
    }
    return taken;
}