UNPKG

budoux

Version:

A small chunk segmenter.

github.com/google/budoux

373 lines • 15.6 kB

JavaScript

/**
 * @license
 * Copyright 2021 Google LLC
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import { loadDefaultJapaneseParser } from '../index.js';
import { HTMLProcessingParser, HTMLProcessor, NodeOrTextForTesting, } from '../html_processor.js';
import { win } from '../win.js';
import { parseFromString, setInnerHtml } from '../dom.js';

// Shared by every `MockHTMLProcessorBase` instance created in this file.
const parser = loadDefaultJapaneseParser();

/**
 * `HTMLProcessor` bound to the default Japanese parser, so tests only have to
 * supply the options.
 */
class MockHTMLProcessorBase extends HTMLProcessor {
  /**
   * @param options Forwarded unchanged to `HTMLProcessor` as its second
   *     constructor argument; may be omitted.
   */
  constructor(options) {
    super(parser, options);
  }
}

/**
 * Replaces the content of the test document's body with `html` and returns
 * whatever `HTMLProcessor.getBlocks` yields for that body.
 *
 * @param html The HTML string to load into `document.body`.
 * @returns The result of `processor.getBlocks(document.body)`.
 */
function getBlocks(html) {
  const document = win.document;
  setInnerHtml(document.body, html);
  const processor = new MockHTMLProcessorBase();
  return processor.getBlocks(document.body);
}

describe('HTMLProcessor.applyToElement', () => {
  const document = win.document;
  const wbr = document.createElement('wbr');

  /**
   * Loads `html` into the body, applies a processor configured with the given
   * separator and the class name 'applied', and returns the resulting
   * `innerHTML` of the body.
   */
  function apply(html, separator) {
    setInnerHtml(document.body, html);
    const processor = new MockHTMLProcessorBase({
      separator,
      className: 'applied',
    });
    processor.applyToElement(document.body);
    return document.body.innerHTML;
  }

  // In `out`, each `|` marks a position where a separator is expected; it is
  // replaced by the concrete separator markup in each spec below.
  const cases = [
    {
      in: '<div>晴れ</div>',
      out: '<div>晴れ</div>',
    },
    {
      in: '<div>今日は晴れです</div>',
      out: '<div class="applied">今日は|晴れです</div>',
    },
    {
      in: '<div><span>今日は</span>晴れです</div>',
      out: '<div class="applied"><span>今日は</span>|晴れです</div>',
    },
    {
      in: '<div><span>今日は晴れ</span>です</div>',
      out: '<div class="applied"><span>今日は|晴れ</span>です</div>',
    },
    {
      in: '<code>今日は晴れです</code>',
      out: '<code>今日は晴れです</code>',
    },
    {
      in: '<div>今日は<code>code</code>晴れです</div>',
      out: '<div class="applied">今日は<code>code</code>|晴れです</div>',
    },
    {
      in: '<div>今日は晴れ、今日は晴れ</div>',
      out: '<div class="applied">今日は|晴れ、|今日は|晴れ</div>',
    },
    {
      in: '<div>今日は<nobr>晴れ、今日は</nobr>晴れ</div>',
      out: '<div class="applied">今日は|<nobr>晴れ、今日は</nobr>|晴れ</div>',
    },
    {
      in: '<div>今日は<span style="white-space: nowrap">晴れ、今日は</span>晴れ</div>',
      out: '<div class="applied">今日は|<span style="white-space: nowrap">晴れ、今日は</span>|晴れ</div>',
    },
  ];

  for (const test of cases) {
    // Test when the separator is an `Element`.
    // The title carries the separator kind so the two specs generated for one
    // case do not share a name within this suite.
    it(`${test.in} (separator: Element)`, () => {
      const out = test.out.replace(/\|/g, '<wbr>');
      expect(apply(test.in, wbr)).toEqual(out);
    });

    // Test when the separator is a `string`.
    it(`${test.in} (separator: string)`, () => {
      const out = test.out.replace(/\|/g, '/');
      expect(apply(test.in, '/')).toEqual(out);
    });
  }
});

describe('HTMLProcessor.applyToElement.separator.node', () => {
  it('should clone separator element deeply', () => {
    const doc = win.document;
    setInnerHtml(doc.body, '<div>今日は良い天気です</div>');
    // The separator has both an inline style and a text child; both must
    // appear at every insertion point in the expected markup below.
    const separator = doc.createElement('span');
    separator.style.whiteSpace = 'normal';
    separator.textContent = '\u200B';
    const processor = new MockHTMLProcessorBase({
      separator,
      className: 'applied',
    });
    processor.applyToElement(doc.body);
    expect(doc.body.innerHTML).toEqual('<div class="applied">今日は' +
      '<span style="white-space: normal;">\u200B</span>良い' +
      '<span style="white-space: normal;">\u200B</span>天気です</div>');
  });
});

describe('HTMLProcessor.getBlocks', () => {
  /**
   * Returns the `text` of every block found in `html`, in iteration order.
   */
  function getText(html) {
    return Array.from(getBlocks(html), block => block.text);
  }

  it('should collect all text of a simple block', () => {
    expect(getText('<div>123</div>')).toEqual(['123']);
  });

  it('should collect two blocks separately', () => {
    expect(getText('<div>123</div><div>456</div>')).toEqual(['123', '456']);
  });

  it('should break at <br> elements', () => {
    expect(getText('<div>123<br>456</div>')).toEqual(['123', '456']);
  });

  it('should break at <br> elements inside a span', () => {
    expect(getText('<div>1<span>23<br>45</span>6</div>')).toEqual([
      '123',
      '456',
    ]);
  });

  it('should collect inline boxes as part of the block', () => {
    expect(getText('<div>123<span>456</span>789</div>')).toEqual(['123456789']);
  });

  it('should collect nested blocks separately from the parent block', () => {
    expect(getText('<div>123<div>456</div>789</div>')).toEqual([
      '456',
      '123789',
    ]);
  });

  it('should collect inline-blocks separately from the parent block', () => {
    expect(getText('<div>123<div style="display: inline-block">456</div>789</div>')).toEqual(['456', '123789']);
    expect(getText('<div>123<span style="display: inline-block">456</span>789</div>')).toEqual(['456', '123789']);
  });

  it('should skip textarea elements', () => {
    expect(getText('<textarea>123</textarea>')).toEqual([]);
  });

  it('should skip <rt> and <rp> elements for <ruby>', () => {
    expect(getText('before<ruby>b1<rp>(</rp><rt>r1</rt>b2<rt>r2</rt></ruby>after')).toEqual(['beforeb1b2after']);
  });

  it('should use the built-in rules if the `display` property is empty', () => {
    expect(getText('<div>123<span>456</span></div>')).toEqual(['123456']);
    expect(getText('<div>123<div>456</div></div>')).toEqual(['456', '123']);
    expect(getText('<div><h1>123</h1><li>456</li></div>')).toEqual([
      '123',
      '456',
    ]);
  });
});

describe('HTMLProcessor.forcedOpportunities', () => {
  /**
   * For every block found in `html`, returns an object with:
   *  - `indices`: the result of `block.getForcedOpportunities()`;
   *  - `after`: the `hasBreakOpportunityAfter` flag of each entry in
   *    `block.nodes`, in order.
   */
  function forcedOpportunities(html) {
    return Array.from(getBlocks(html), block => ({
      indices: block.getForcedOpportunities(),
      after: block.nodes.map(node => node.hasBreakOpportunityAfter),
    }));
  }

  it('<wbr> should set has_break_opportunity_after', () => {
    expect(forcedOpportunities('123<wbr>456')).toEqual([
      { indices: [3], after: [true, false] },
    ]);
  });

  it('Nested <wbr> should set has_break_opportunity_after', () => {
    expect(forcedOpportunities('123<span><wbr></span>456')).toEqual([
      { indices: [3], after: [true, false] },
    ]);
  });

  it('ZWSP should be in forcedOpportunities', () => {
    expect(forcedOpportunities('123<span>\u200B456</span>')).toEqual([
      { indices: [4], after: [false, false] },
    ]);
  });
});

describe('HTMLProcessor.splitNodes', () => {
  /**
   * Test double built from plain text. It always reports that it can be split
   * and turns `split()` into a no-op, so the specs inspect only `chunks`.
   * NOTE(review): presumably this keeps `splitNodes` from touching real DOM
   * nodes; confirm against `NodeOrTextForTesting`.
   */
  class MockNode extends NodeOrTextForTesting {
    /** Discards any chunks recorded by a previous `splitNodes` call. */
    clear() {
      this.chunks = [];
    }

    get canSplit() {
      return true;
    }

    split() { }
  }

  // These two nodes are reused by every spec; `split()` below clears their
  // chunks before each use.
  const node123 = new MockNode('123');
  const node456 = new MockNode('456');

  /**
   * Clears `nodes`, runs `splitNodes(nodes, boundaries)` on a fresh processor,
   * and returns the `chunks` of each node, in order.
   */
  function split(nodes, boundaries) {
    for (const node of nodes) {
      node.clear();
    }
    const processor = new MockHTMLProcessorBase();
    processor.splitNodes(nodes, boundaries);
    return nodes.map(node => node.chunks);
  }

  it('should not split nodes', () => {
    expect(split([node123], [4])).toEqual([[]]);
  });

  it('should not split single node at the end', () => {
    expect(split([node123], [3, 4])).toEqual([[]]);
  });

  it('should not split two nodes at the end', () => {
    expect(split([node123, node456], [6, 7])).toEqual([[], []]);
  });

  it('should split single node at the middle', () => {
    expect(split([node123], [2, 4])).toEqual([['12', '3']]);
  });

  // Named "single node" so it does not share a title with the two-node spec
  // 'should split the first node twice' further down.
  it('should split single node twice', () => {
    expect(split([node123], [1, 2, 4])).toEqual([['1', '2', '3']]);
  });

  it('should split the first node at the middle', () => {
    expect(split([node123, node456], [2, 7])).toEqual([['12', '3'], []]);
  });

  it('should split the first node twice', () => {
    expect(split([node123, node456], [1, 2, 7])).toEqual([['1', '2', '3'], []]);
  });

  it('should split the second node at the start', () => {
    expect(split([node123, node456], [3, 7])).toEqual([[], ['', '456']]);
  });

  it('should split the second node at the middle', () => {
    expect(split([node123, node456], [5, 7])).toEqual([[], ['45', '6']]);
  });

  it('should split the second node twice', () => {
    expect(split([node123, node456], [4, 5, 7])).toEqual([[], ['4', '5', '6']]);
  });

  it('should split both nodes at the middle', () => {
    expect(split([node123, node456], [2, 5, 7])).toEqual([
      ['12', '3'],
      ['45', '6'],
    ]);
  });

  it('should split both nodes twice', () => {
    expect(split([node123, node456], [1, 2, 4, 5, 7])).toEqual([
      ['1', '2', '3'],
      ['4', '5', '6'],
    ]);
  });

  it('should split at every character', () => {
    expect(split([node123, node456], [1, 2, 3, 4, 5, 7])).toEqual([
      ['1', '2', '3'],
      ['', '4', '5', '6'],
    ]);
  });
});

describe('HTMLProcessingParser.applyToElement', () => {
  /**
   * Parses `inputHTML`, applies an `HTMLProcessingParser` built from `model`
   * to its first `<p>` element, and expects the result to be node-equal to
   * the first `<p>` element parsed from `expectedHTML`.
   */
  const checkEqual = (model, inputHTML, expectedHTML) => {
    const inputDOM = parseFromString(inputHTML);
    const inputElement = inputDOM.querySelector('p');
    const htmlParser = new HTMLProcessingParser(model);
    htmlParser.applyToElement(inputElement);
    const expectedDocument = parseFromString(expectedHTML);
    const expectedElement = expectedDocument.querySelector('p');
    expect(inputElement.isEqualNode(expectedElement)).toBeTrue();
  };

  // Inline style expected on the processed element in every spec below.
  const style = 'word-break: keep-all; overflow-wrap: anywhere;';

  it('should insert ZWSPs where the sentence should break.', () => {
    const inputHTML = '<p>xyzabcabc</p>';
    const expectedHTML = `<p style="${style}">xyz\u200Babc\u200Babc</p>`;
    const model = {
      UW4: { a: 1001 }, // means "should separate right before 'a'".
    };
    checkEqual(model, inputHTML, expectedHTML);
  });

  it('should insert ZWSPs even it overlaps with other HTML tags.', () => {
    const inputHTML = '<p>xy<a href="#">zabca</a>bc</p>';
    const expectedHTML = `<p style="${style}">xy<a href="#">z\u200Babc\u200Ba</a>bc</p>`;
    const model = {
      UW4: { a: 1001 }, // means "should separate right before 'a'".
    };
    checkEqual(model, inputHTML, expectedHTML);
  });

  it('should not insert ZWSPs to where input has WBR tags already.', () => {
    const inputHTML = '<p>xyz<wbr>abcabc</p>';
    const expectedHTML = `<p style="${style}">xyz<wbr>abc\u200Babc</p>`;
    const model = {
      UW4: { a: 1001 }, // means "should separate right before 'a'".
    };
    checkEqual(model, inputHTML, expectedHTML);
  });

  it('should not insert ZWSPs to where input has ZWSPs.', () => {
    const inputHTML = '<p>xyz\u200Babcabc</p>';
    const expectedHTML = `<p style="${style}">xyz\u200babc\u200Babc</p>`;
    const model = {
      UW4: { a: 1001 }, // means "should separate right before 'a'".
    };
    checkEqual(model, inputHTML, expectedHTML);
  });
});

describe('HTMLProcessingParser.translateHTMLString', () => {
  const defaultModel = {
    UW4: { a: 1001 }, // means "should separate right before 'a'".
  };

  /**
   * Runs `translateHTMLString(inputHTML)` on an `HTMLProcessingParser` built
   * from `model`, then expects the parsed result to be node-equal to the
   * document parsed from `expectedHTML`.
   */
  const checkEqual = (model, inputHTML, expectedHTML) => {
    const htmlParser = new HTMLProcessingParser(model);
    const result = htmlParser.translateHTMLString(inputHTML);
    const resultDocument = parseFromString(result);
    const expectedDocument = parseFromString(expectedHTML);
    expect(resultDocument.isEqualNode(expectedDocument)).toBeTrue();
  };

  it('should output a html string with a SPAN parent with proper style attributes.', () => {
    const inputHTML = 'xyzabcd';
    const expectedHTML = ` <span style="word-break: keep-all; overflow-wrap: anywhere;">xyz\u200Babcd</span>`;
    checkEqual(defaultModel, inputHTML, expectedHTML);
  });

  it('should not add a SPAN parent if the input already has one single parent.', () => {
    const inputHTML = '<p class="foo" style="color: red">xyzabcd</p>';
    const expectedHTML = ` <p class="foo" style="color: red; word-break: keep-all; overflow-wrap: anywhere;" >xyz\u200Babcd</p>`;
    checkEqual(defaultModel, inputHTML, expectedHTML);
  });

  it('should return a blank string if the input is blank.', () => {
    const inputHTML = '';
    const expectedHTML = '';
    checkEqual({}, inputHTML, expectedHTML);
  });

  it('should pass script tags as-is.', () => {
    const inputHTML = 'xyz<script>alert(1);</script>xyzabc';
    const expectedHTML = `<span style="word-break: keep-all; overflow-wrap: anywhere;" >xyz<script>alert(1);</script>xyz\u200Babc</span>`;
    checkEqual(defaultModel, inputHTML, expectedHTML);
  });

  it('script tags on top should be discarded by the DOMParser.', () => {
    const inputHTML = '<script>alert(1);</script>xyzabc';
    const expectedHTML = `<span style="word-break: keep-all; overflow-wrap: anywhere;" >xyz\u200Babc</span>`;
    checkEqual(defaultModel, inputHTML, expectedHTML);
  });

  it('should skip some specific tags.', () => {
    const inputHTML = 'xyz<code>abc</code>abc';
    const expectedHTML = `<span style="word-break: keep-all; overflow-wrap: anywhere;" >xyz<code>abc</code>\u200Babc</span>`;
    checkEqual(defaultModel, inputHTML, expectedHTML);
  });

  it('should not ruin attributes of child elements.', () => {
    const inputHTML = 'xyza<a href="#" hidden>bc</a>abc';
    const expectedHTML = `<span style="word-break: keep-all; overflow-wrap: anywhere;" >xyz\u200Ba<a href="#" hidden>bc</a>\u200Babc</span>`;
    checkEqual(defaultModel, inputHTML, expectedHTML);
  });

  it('should work with emojis.', () => {
    const inputHTML = 'xyza🇯🇵🇵🇹abc';
    const expectedHTML = `<span style="word-break: keep-all; overflow-wrap: anywhere;" >xyz\u200Ba🇯🇵🇵🇹\u200Babc</span>`;
    checkEqual(defaultModel, inputHTML, expectedHTML);
  });
});
//# sourceMappingURL=test_html_processor.js.map