UNPKG

@ckeditor/ckeditor5-paste-from-office

Version:

Paste from Office feature for CKEditor 5.

97 lines (96 loc) 4.14 kB
/** * @license Copyright (c) 2003-2024, CKSource Holding sp. z o.o. All rights reserved. * For licensing, see LICENSE.md or https://ckeditor.com/legal/ckeditor-oss-license */ /** * @module paste-from-office/filters/parse */ /* globals DOMParser */ import { DomConverter, ViewDocument } from 'ckeditor5/src/engine.js'; import { normalizeSpacing, normalizeSpacerunSpans } from './space.js'; /** * Parses the provided HTML extracting contents of `<body>` and `<style>` tags. * * @param htmlString HTML string to be parsed. */ export function parseHtml(htmlString, stylesProcessor) { const domParser = new DOMParser(); // Remove Word specific "if comments" so content inside is not omitted by the parser. htmlString = htmlString.replace(/<!--\[if gte vml 1]>/g, ''); // Clean the <head> section of MS Windows specific tags. See https://github.com/ckeditor/ckeditor5/issues/15333. // The regular expression matches the <o:SmartTagType> tag with optional attributes (with or without values). htmlString = htmlString.replace(/<o:SmartTagType(?:\s+[^\s>=]+(?:="[^"]*")?)*\s*\/?>/gi, ''); const normalizedHtml = normalizeSpacing(cleanContentAfterBody(htmlString)); // Parse htmlString as native Document object. const htmlDocument = domParser.parseFromString(normalizedHtml, 'text/html'); normalizeSpacerunSpans(htmlDocument); // Get `innerHTML` first as transforming to View modifies the source document. const bodyString = htmlDocument.body.innerHTML; // Transform document.body to View. const bodyView = documentToView(htmlDocument, stylesProcessor); // Extract stylesheets. const stylesObject = extractStyles(htmlDocument); return { body: bodyView, bodyString, styles: stylesObject.styles, stylesString: stylesObject.stylesString }; } /** * Transforms native `Document` object into {@link module:engine/view/documentfragment~DocumentFragment}. Comments are skipped. * * @param htmlDocument Native `Document` object to be transformed. */ function documentToView(htmlDocument, stylesProcessor) { const viewDocument = new ViewDocument(stylesProcessor); const domConverter = new DomConverter(viewDocument, { renderingMode: 'data' }); const fragment = htmlDocument.createDocumentFragment(); const nodes = htmlDocument.body.childNodes; while (nodes.length > 0) { fragment.appendChild(nodes[0]); } return domConverter.domToView(fragment, { skipComments: true }); } /** * Extracts both `CSSStyleSheet` and string representation from all `style` elements available in a provided `htmlDocument`. * * @param htmlDocument Native `Document` object from which styles will be extracted. */ function extractStyles(htmlDocument) { const styles = []; const stylesString = []; const styleTags = Array.from(htmlDocument.getElementsByTagName('style')); for (const style of styleTags) { if (style.sheet && style.sheet.cssRules && style.sheet.cssRules.length) { styles.push(style.sheet); stylesString.push(style.innerHTML); } } return { styles, stylesString: stylesString.join(' ') }; } /** * Removes leftover content from between closing </body> and closing </html> tag: * * ```html * <html><body><p>Foo Bar</p></body><span>Fo</span></html> -> <html><body><p>Foo Bar</p></body></html> * ``` * * This function is used as specific browsers (Edge) add some random content after `body` tag when pasting from Word. * @param htmlString The HTML string to be cleaned. * @returns The HTML string with leftover content removed. */ function cleanContentAfterBody(htmlString) { const bodyCloseTag = '</body>'; const htmlCloseTag = '</html>'; const bodyCloseIndex = htmlString.indexOf(bodyCloseTag); if (bodyCloseIndex < 0) { return htmlString; } const htmlCloseIndex = htmlString.indexOf(htmlCloseTag, bodyCloseIndex + bodyCloseTag.length); return htmlString.substring(0, bodyCloseIndex + bodyCloseTag.length) + (htmlCloseIndex >= 0 ? htmlString.substring(htmlCloseIndex) : ''); }