// budoux
// Version:
// A small chunk segmenter.
// 580 lines • 20.5 kB
// JavaScript
"use strict";
/**
* @license
* Copyright 2021 Google LLC
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.HTMLProcessingParser = exports.HTMLProcessor = exports.ParagraphForTesting = exports.NodeOrTextForTesting = void 0;
const dom_js_1 = require("./dom.js");
const parser_js_1 = require("./parser.js");
const win_js_1 = require("./win.js");
// Assertions in this file go through `console.assert`, which reports a failed
// condition on the console rather than throwing.
const assert = console.assert;
// UTF-16 code unit of the zero width space; compared against `charCodeAt`
// results when scanning text.
const ZWSP_CODEPOINT = 0x200b; // U+200B ZERO WIDTH SPACE
// The one-character string for the code unit above.
const ZWSP = String.fromCharCode(ZWSP_CODEPOINT);
// Numeric `nodeType` values for element and text nodes. A browser exposes the
// same numbers as `Node.ELEMENT_NODE` and `Node.TEXT_NODE`; they are spelled
// out here so that the module also works in Node.js environments.
const NodeType = { ELEMENT_NODE: 1, TEXT_NODE: 3 };
// The ways an element can take part in paragraph building.
const DomAction = {
  // Inline content; it becomes a part of the surrounding paragraph.
  Inline: 0,
  // A nested paragraph.
  Block: 1,
  // The content is skipped; what comes before and after stays connected.
  Skip: 2,
  // A forced break; what comes before and after become separate paragraphs.
  Break: 3,
  // The content provides context for the parser, but it is not breakable.
  NoBreak: 4,
  // Forces a break opportunity.
  BreakOpportunity: 5,
};
/**
 * Maps an upper-case element name to a fixed {@link DomAction}, following
 * {@link https://html.spec.whatwg.org/multipage/rendering.html HTML Rendering}.
 * Names that are not listed here are resolved by {@link actionForElement}.
 */
const domActions = {
  // Hidden elements.
  // https://html.spec.whatwg.org/multipage/rendering.html#hidden-elements
  AREA: DomAction.Skip,
  BASE: DomAction.Skip,
  BASEFONT: DomAction.Skip,
  DATALIST: DomAction.Skip,
  HEAD: DomAction.Skip,
  LINK: DomAction.Skip,
  META: DomAction.Skip,
  NOEMBED: DomAction.Skip,
  NOFRAMES: DomAction.Skip,
  PARAM: DomAction.Skip,
  RP: DomAction.Skip,
  SCRIPT: DomAction.Skip,
  STYLE: DomAction.Skip,
  TEMPLATE: DomAction.Skip,
  TITLE: DomAction.Skip,
  NOSCRIPT: DomAction.Skip,

  // Flow content.
  // https://html.spec.whatwg.org/multipage/rendering.html#flow-content-3
  HR: DomAction.Break,
  // Elements rendered with `white-space: pre`: phrase breaking is disabled.
  LISTING: DomAction.Skip,
  PLAINTEXT: DomAction.Skip,
  PRE: DomAction.Skip,
  XMP: DomAction.Skip,

  // Phrasing content.
  // https://html.spec.whatwg.org/multipage/rendering.html#phrasing-content-3
  BR: DomAction.Break,
  RT: DomAction.Skip,
  WBR: DomAction.BreakOpportunity,

  // Form controls.
  // https://html.spec.whatwg.org/multipage/rendering.html#form-controls
  INPUT: DomAction.Skip,
  SELECT: DomAction.Skip,
  BUTTON: DomAction.Skip,
  TEXTAREA: DomAction.Skip,

  // Other elements where the phrase-based line breaking should be disabled.
  // https://github.com/google/budoux/blob/main/budoux/skip_nodes.json
  ABBR: DomAction.Skip,
  CODE: DomAction.Skip,
  IFRAME: DomAction.Skip,
  TIME: DomAction.Skip,
  VAR: DomAction.Skip,

  // Deprecated, but supported in all browsers.
  // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/nobr
  NOBR: DomAction.NoBreak,
};
// Upper-case names of the elements treated as blocks when no computed
// `display` value is available. The section numbers in the comments are the
// ones the original list was grouped by.
const defaultBlockElements = new Set([
  // 15.3.2 The page
  'HTML', 'BODY',
  // 15.3.3 Flow content
  'ADDRESS', 'BLOCKQUOTE', 'CENTER', 'DIALOG', 'DIV', 'FIGURE', 'FIGCAPTION',
  'FOOTER', 'FORM', 'HEADER', 'LEGEND', 'LISTING', 'MAIN', 'P',
  // 15.3.6 Sections and headings
  'ARTICLE', 'ASIDE', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'HGROUP', 'NAV',
  'SECTION',
  // 15.3.7 Lists
  'DIR', 'DD', 'DL', 'DT', 'MENU', 'OL', 'UL', 'LI',
  // 15.3.8 Tables
  'TABLE', 'CAPTION', 'COL', 'TR', 'TD', 'TH',
  // 15.3.12 The fieldset and legend elements
  'FIELDSET',
  // 15.5.4 The details and summary elements
  'DETAILS', 'SUMMARY',
  // 15.5.12 The marquee element
  'MARQUEE',
]);
// Numeric `nodeType` values for element and text nodes, under shorter keys.
// A browser exposes the same numbers as `Node.ELEMENT_NODE` and
// `Node.TEXT_NODE`; they are spelled out here for Node.js environments.
const NODETYPE = { ELEMENT: 1, TEXT: 3 };
/**
 * Determine the action for an element.
 *
 * The lookup order is: the {@link domActions} table keyed by `nodeName`, then
 * the computed style when `win.getComputedStyle` is a function, then the
 * built-in {@link defaultBlockElements} list.
 *
 * @param element An element to determine the action for.
 * @return One of the {@link DomAction} values.
 */
function actionForElement(element) {
  const nodeName = element.nodeName;
  // Consult only the table's own entries. A plain `domActions[nodeName]` also
  // finds members inherited from `Object.prototype` (e.g. when `nodeName` is
  // `constructor` or `toString`) and would return a function instead of a
  // `DomAction` value.
  if (Object.prototype.hasOwnProperty.call(domActions, nodeName)) {
    return domActions[nodeName];
  }
  if (typeof win_js_1.win.getComputedStyle === 'function') {
    const style = win_js_1.win.getComputedStyle(element);
    // Content that does not wrap only provides context; it is never split.
    switch (style.whiteSpace) {
      case 'nowrap':
      case 'pre':
        return DomAction.NoBreak;
    }
    const display = style.display;
    // Anything other than plain `inline` is treated as a block.
    if (display) {
      return display === 'inline' ? DomAction.Inline : DomAction.Block;
    }
    // `display` is an empty string if the element is not connected.
  }
  // Use the built-in rules if the `display` property is empty, or if
  // `getComputedStyle` is missing (e.g., jsdom.)
  return defaultBlockElements.has(nodeName)
    ? DomAction.Block
    : DomAction.Inline;
}
/**
 * One entry of a {@link Paragraph}: either a {@link Text} node or a plain
 * {@link string}.
 *
 * A {@link string} entry only gives the parser context; it can't be split.
 */
class NodeOrText {
  constructor(nodeOrText) {
    // Pieces of `text` computed by the caller; empty until boundaries are set.
    this.chunks = [];
    this.hasBreakOpportunityAfter = false;
    this.nodeOrText = nodeOrText;
  }

  /** Whether this entry wraps a plain string. */
  get isString() {
    return typeof this.nodeOrText === 'string';
  }

  /** Only entries that are not plain strings can be split. */
  get canSplit() {
    return !this.isString;
  }

  /** The string itself, or the `nodeValue` of the wrapped node. */
  get text() {
    if (this.isString) {
      return this.nodeOrText;
    }
    return this.nodeOrText.nodeValue;
  }

  /** Length of {@link text}; 0 when there is no text. */
  get length() {
    const value = this.text;
    if (value === null || value === undefined) {
      return 0;
    }
    const size = value.length;
    return size === null || size === undefined ? 0 : size;
  }

  /**
   * Split the {@link Text} in the same way as the {@link chunks}.
   * Joining all {@link chunks} must be equal to {@link text}.
   *
   * @param separator A string inserted at each boundary, or a node whose deep
   *     clone is inserted at each boundary.
   */
  split(separator) {
    const pieces = this.chunks;
    assert(pieces.length === 0 || pieces.join('') === this.text);
    // Zero or one chunk means there is no boundary inside this entry.
    if (pieces.length <= 1) {
      return;
    }
    assert(this.canSplit);
    const textNode = this.nodeOrText;

    // String separator: rewrite the text in place.
    if (typeof separator === 'string') {
      textNode.nodeValue = pieces.join(separator);
      return;
    }

    // Node separator: replace the node with one text node per non-empty
    // chunk, with a clone of the separator between consecutive chunks.
    const owner = textNode.ownerDocument;
    const replacements = [];
    pieces.forEach((piece, index) => {
      if (index > 0) {
        replacements.push(separator.cloneNode(true));
      }
      if (piece) {
        replacements.push(owner.createTextNode(piece));
      }
    });
    textNode.replaceWith(...replacements);
  }
}
// Subclass of `NodeOrText` with an empty body; it is the name under which the
// class is exported (for tests, going by the name).
class NodeOrTextForTesting extends NodeOrText {
}
exports.NodeOrTextForTesting = NodeOrTextForTesting;
/**
 * A "paragraph": a run of content delimited by block boundaries or forced
 * breaks.
 *
 * A CSS
 * {@link https://drafts.csswg.org/css2/#inline-formatting inline formatting context}
 * is usually one paragraph, but forced breaks such as `<br>` can divide it
 * into several.
 */
class Paragraph {
  constructor(element) {
    this.nodes = [];
    this.element = element;
  }

  /** Whether no node has been collected. */
  isEmpty() {
    return this.nodes.length === 0;
  }

  /** The text of all nodes, concatenated in order. */
  get text() {
    const parts = [];
    for (const node of this.nodes) {
      parts.push(node.text);
    }
    return parts.join('');
  }

  /** The most recently added node, or `undefined` when there is none. */
  get lastNode() {
    const count = this.nodes.length;
    return count ? this.nodes[count - 1] : undefined;
  }

  /** Marks the last node, if any, as followed by a break opportunity. */
  setHasBreakOpportunityAfter() {
    const tail = this.lastNode;
    if (!tail) {
      return;
    }
    tail.hasBreakOpportunityAfter = true;
  }

  /**
   * @return Indices of forced break opportunities in the source.
   * They come from nodes flagged with `hasBreakOpportunityAfter` (set for
   * `<wbr>`) and from U+200B ZERO WIDTH SPACE characters in splittable nodes.
   */
  getForcedOpportunities() {
    const found = [];
    let offset = 0; // start of the current node in the paragraph text
    for (const node of this.nodes) {
      // Only splittable nodes are scanned for zero width spaces.
      const content = node.canSplit ? node.text : null;
      if (content) {
        for (let at = 0; at < content.length; ++at) {
          if (content.charCodeAt(at) === ZWSP_CODEPOINT) {
            // The opportunity is right after the zero width space.
            found.push(offset + at + 1);
          }
        }
      }
      offset += node.length;
      if (node.hasBreakOpportunityAfter) {
        found.push(offset);
      }
    }
    return found;
  }

  /**
   * @return {@param boundaries} without the indices reported by
   * {@link getForcedOpportunities}. When there are no forced opportunities
   * the very same array is returned.
   */
  excludeForcedOpportunities(boundaries) {
    const forced = this.getForcedOpportunities();
    if (forced.length === 0) {
      return boundaries;
    }
    const forcedSet = new Set(forced);
    return boundaries.filter((index) => !forcedSet.has(index));
  }
}
// Subclass of `Paragraph` with an empty body; it is the name under which the
// class is exported (for tests, going by the name).
class ParagraphForTesting extends Paragraph {
}
exports.ParagraphForTesting = ParagraphForTesting;
/**
 * Adds HTML processing support to a BudouX {@link Parser}.
 */
class HTMLProcessor {
  /**
   * @param parser A BudouX {@link Parser} to compute semantic line breaks.
   * @param options Optional settings. `className` and `separator` are copied
   *     onto the instance when they are not `undefined`.
   */
  constructor(parser, options) {
    /** See {@link HTMLProcessorOptions.separator}. */
    this.separator = ZWSP;
    this.parser_ = parser;
    if (options !== undefined) {
      if (options.className !== undefined) {
        this.className = options.className;
      }
      if (options.separator !== undefined) {
        this.separator = options.separator;
      }
    }
  }

  /**
   * Checks if the given element has a text node in its children.
   *
   * @param ele An element to be checked.
   * @return Whether the element has a child text node.
   */
  static hasChildTextNode(ele) {
    for (const child of ele.childNodes) {
      if (child.nodeType === NODETYPE.TEXT) {
        return true;
      }
    }
    return false;
  }

  /**
   * Applies markups for semantic line breaks to the given HTML element.
   *
   * It breaks descendant nodes into paragraphs,
   * and applies the BudouX to each paragraph.
   * @param element The input element.
   */
  applyToElement(element) {
    // Each paragraph is processed as soon as it is yielded, because
    // `getBlocks` clears and reuses a paragraph after a forced break.
    for (const block of this.getBlocks(element)) {
      assert(!block.isEmpty());
      this.applyToParagraph(block);
    }
  }

  /**
   * Find paragraphs from a given HTML element.
   *
   * This is a generator. A paragraph yielded at a forced break is emptied
   * when the generator resumes, so it must be consumed before the next one
   * is requested.
   *
   * @param element The root element to find paragraphs.
   * @param parent The parent {@link Paragraph} if any.
   * @return A list of {@link Paragraph}s.
   */
  *getBlocks(element, parent) {
    assert(element.nodeType === NodeType.ELEMENT_NODE);

    // Skip if it was once applied to this element.
    if (this.className && element.classList.contains(this.className)) {
      return;
    }

    const action = actionForElement(element);
    if (action === DomAction.Skip) {
      return;
    }

    if (action === DomAction.Break) {
      // Flush what has been collected so far and start over.
      if (parent && !parent.isEmpty()) {
        parent.setHasBreakOpportunityAfter();
        yield parent;
        parent.nodes = [];
      }
      assert(!element.firstChild);
      return;
    }

    if (action === DomAction.BreakOpportunity) {
      if (parent) {
        parent.setHasBreakOpportunityAfter();
      }
      return;
    }

    // Determine if this element creates a new inline formatting context, or if
    // this element belongs to the parent inline formatting context.
    assert(
      action === DomAction.Block ||
        action === DomAction.Inline ||
        action === DomAction.NoBreak
    );
    const isNewBlock = !parent || action === DomAction.Block;
    const block = isNewBlock ? new Paragraph(element) : parent;

    // Collect all text nodes in this inline formatting context, while searching
    // descendant elements recursively.
    //
    // Iterate over a snapshot of the children: the consumer may split text
    // nodes between two yields, and with a node separator that inserts new
    // siblings. Walking the live `childNodes` would then visit content that
    // was already processed a second time.
    for (const child of [...element.childNodes]) {
      switch (child.nodeType) {
        case NodeType.ELEMENT_NODE:
          for (const childBlock of this.getBlocks(child, block)) {
            yield childBlock;
          }
          break;
        case NodeType.TEXT_NODE:
          if (action === DomAction.NoBreak) {
            // Keep the text as context only: a plain string can't be split.
            const text = child.nodeValue;
            if (text) {
              block.nodes.push(new NodeOrText(text));
            }
            break;
          }
          block.nodes.push(new NodeOrText(child));
          break;
      }
    }

    // Apply if this is an inline formatting context.
    if (isNewBlock && !block.isEmpty()) {
      yield block;
    }
  }

  /**
   * Apply the BudouX to the given {@link Paragraph}.
   * @param paragraph The {@link Paragraph} to apply.
   */
  applyToParagraph(paragraph) {
    assert(paragraph.nodes.length > 0);
    // Nothing to do if no node can be split.
    if (!paragraph.nodes.some((node) => node.canSplit)) {
      return;
    }
    const text = paragraph.text;
    // No changes if whitespace-only.
    if (/^\s*$/.test(text)) {
      return;
    }

    // Compute the phrase boundaries.
    const boundaries = this.parser_.parseBoundaries(text);
    // No changes if single phrase.
    if (boundaries.length <= 0) {
      return;
    }
    // The boundaries should be between 1 and `text.length - 1` in the
    // ascending order.
    assert(boundaries[0] > 0);
    assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
    assert(boundaries[boundaries.length - 1] < text.length);

    // Add a sentinel to help iterating. It goes into a fresh array, because
    // `excludeForcedOpportunities` may hand back the parser's own array.
    const adjustedBoundaries = [
      ...paragraph.excludeForcedOpportunities(boundaries),
      text.length + 1,
    ];

    this.splitNodes(paragraph.nodes, adjustedBoundaries);
    this.applyBlockStyle(paragraph.element);
  }

  /**
   * Split {@link NodeOrText} at the specified boundaries.
   * @param nodes A list of {@link NodeOrText}.
   * @param boundaries A list of indices of the text to split at, in ascending
   *     order. The last one must be a sentinel greater than the total length
   *     of the text of `nodes`.
   */
  splitNodes(nodes, boundaries) {
    assert(boundaries.length > 0);
    assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
    const textLen = nodes.reduce((sum, node) => sum + node.length, 0);
    // The last boundary must be a sentinel.
    assert(boundaries[boundaries.length - 1] > textLen);

    // Distribute `boundaries` to `node.chunks`.
    let boundaryIndex = 0;
    let boundary = boundaries[0];
    assert(boundary > 0);
    let nodeStart = 0; // the start index of the `nodeText` in the whole text.
    let lastNode = null; // the preceding node, if it can be split.
    for (const node of nodes) {
      assert(boundary >= nodeStart);
      assert(node.chunks.length === 0);
      const nodeText = node.text;
      if (!nodeText) {
        continue;
      }
      const nodeLength = nodeText.length;
      const nodeEnd = nodeStart + nodeLength;
      assert(!lastNode || lastNode.canSplit);

      if (!node.canSplit) {
        // If there's a boundary between nodes and `lastNode.canSplit`, add a
        // boundary to the end of the `lastNode`.
        if (lastNode && boundary === nodeStart) {
          if (lastNode.chunks.length === 0) {
            const lastText = lastNode.text;
            lastNode.chunks.push(
              lastText !== null && lastText !== undefined ? lastText : ''
            );
          }
          lastNode.chunks.push('');
        }
        // Boundaries that fall inside this node are dropped.
        while (boundary < nodeEnd) {
          boundary = boundaries[++boundaryIndex];
        }
        lastNode = null;
        nodeStart = nodeEnd;
        continue;
      }

      // Check if the next boundary is in this `node`.
      lastNode = node;
      if (boundary >= nodeEnd) {
        nodeStart = nodeEnd;
        continue;
      }

      // Compute the boundary indices in the `node`.
      const chunks = node.chunks;
      let chunkStartInNode = 0;
      while (boundary < nodeEnd) {
        const boundaryInNode = boundary - nodeStart;
        assert(boundaryInNode >= chunkStartInNode);
        chunks.push(nodeText.slice(chunkStartInNode, boundaryInNode));
        chunkStartInNode = boundaryInNode;
        boundary = boundaries[++boundaryIndex];
      }
      // Add the rest of the `nodeText`.
      assert(chunkStartInNode < nodeLength);
      chunks.push(nodeText.slice(chunkStartInNode));

      nodeStart = nodeEnd;
    }

    // Check if all nodes and boundaries are consumed.
    assert(nodeStart === textLen);
    assert(boundaryIndex < boundaries.length);
    assert(boundaries[boundaryIndex] >= textLen);

    // `node.chunks` are finalized. Split them.
    for (const node of nodes) {
      node.split(this.separator);
    }
  }

  /**
   * Applies the block style to the given element.
   *
   * When a class name is configured it is added to the element; otherwise
   * `applyWrapStyle` from `./dom.js` is called on the element.
   * @param element The element to apply the block style.
   */
  applyBlockStyle(element) {
    if (this.className) {
      element.classList.add(this.className);
      return;
    }
    (0, dom_js_1.applyWrapStyle)(element);
  }
}
exports.HTMLProcessor = HTMLProcessor;
/**
 * BudouX {@link Parser} with HTML processing support.
 */
class HTMLProcessingParser extends parser_js_1.Parser {
  /**
   * @param model Passed through to the {@link Parser} constructor.
   * @param htmlProcessorOptions Options for the internal
   *     {@link HTMLProcessor}. Defaults to a zero width space separator.
   */
  constructor(model, htmlProcessorOptions = {
    separator: ZWSP,
  }) {
    super(model);
    this.htmlProcessor = new HTMLProcessor(this, htmlProcessorOptions);
  }

  /**
   * @deprecated Use `applyToElement` instead. `applyElement` will be removed
   * in v0.7.0 to align the function name with `HTMLProcessor`'s API.
   *
   * Applies markups for semantic line breaks to the given HTML element.
   * @param parentElement The input element.
   */
  applyElement(parentElement) {
    console.warn('`applyElement` is deprecated. Please use `applyToElement` instead. ' +
      '`applyElement` will be removed in v0.7.0.');
    this.applyToElement(parentElement);
  }

  /**
   * Applies markups for semantic line breaks to the given HTML element.
   * @param parentElement The input element.
   */
  applyToElement(parentElement) {
    this.htmlProcessor.applyToElement(parentElement);
  }

  /**
   * Translates the given HTML string to another HTML string with markups
   * for semantic line breaks.
   *
   * If the parsed body has a text node among its children, all children are
   * first moved into a new `<span>`. Only the first element child of the
   * body is processed; if there is none, the serialized body is returned
   * without processing.
   *
   * @param html An input html string.
   * @return The translated HTML string.
   */
  translateHTMLString(html) {
    if (html === '') {
      return html;
    }
    const doc = (0, dom_js_1.parseFromString)(html);
    if (HTMLProcessor.hasChildTextNode(doc.body)) {
      const wrapper = doc.createElement('span');
      wrapper.append(...doc.body.childNodes);
      doc.body.append(wrapper);
    }
    // `applyToElement` needs an element. The body may have no children at
    // all, or start with a non-element node such as a comment, so look for
    // the first element instead of blindly taking `childNodes[0]`.
    let target;
    for (const child of doc.body.childNodes) {
      if (child.nodeType === NodeType.ELEMENT_NODE) {
        target = child;
        break;
      }
    }
    if (target !== undefined) {
      this.applyToElement(target);
    }
    return doc.body.innerHTML;
  }
}
exports.HTMLProcessingParser = HTMLProcessingParser;
//# sourceMappingURL=html_processor.js.map