UNPKG

datocms-html-to-structured-text

Version:

Convert HTML (or a `hast` syntax tree) to a valid DatoCMS Structured Text `dast` document

632 lines (550 loc) 16.4 kB
import { convertElement } from 'hast-util-is-element';
import { toText } from 'hast-util-to-text';
import { hasProperty } from 'hast-util-has-property';
import {
  allowedChildren,
  inlineNodeTypes,
  Mark,
} from 'datocms-structured-text-utils';
import type { Handler, Context, Node, CreateNodeFunction } from './types.js';
import type { Nodes as HastNodes, Element as HastElement } from 'hast';
import { Heading as DastHeading } from 'datocms-structured-text-utils';
import visitChildren from './visit-children.js';
import { wrap } from './wrap.js';

/** Matches exactly the HTML heading tag names (`h1` … `h6`). */
const HEADING_TAG_NAME = /^h[1-6]$/;

/**
 * Tells whether a tag name is an HTML heading (`h1` … `h6`).
 *
 * A plain `startsWith('h')` check would also match `hr`, `header`, `hgroup`,
 * `html` and `head`, none of which are headings.
 */
function isHeadingTag(tagName: string): boolean {
  return HEADING_TAG_NAME.test(tagName);
}

/**
 * Regroups the children of a link that directly contains headings so that
 * every heading wraps a copy of the link instead of the other way round.
 *
 * - each heading child becomes `heading > link > (heading's children)`;
 * - each run of consecutive non-heading children is gathered into its own
 *   copy of the link.
 *
 * The input element is not mutated and the returned array has no holes.
 *
 * @param linkElement - The `<a>` element whose children must be regrouped.
 * @returns The new list of children, in document order.
 */
function hoistHeadingsOutOfLink(linkElement: HastElement): HastElement[] {
  const regrouped: HastElement[] = [];
  // Copy of the link currently collecting consecutive non-heading children.
  let openLinkGroup: HastElement | undefined;

  for (const child of linkElement.children || []) {
    if (child.type === 'element' && isHeadingTag(child.tagName)) {
      regrouped.push({
        ...child,
        children: [{ ...linkElement, children: child.children }],
      });
      // A heading closes the current run of non-heading content.
      openLinkGroup = undefined;
    } else if (openLinkGroup) {
      openLinkGroup.children.push(child);
    } else {
      openLinkGroup = { ...linkElement, children: [child] };
      regrouped.push(openLinkGroup);
    }
  }

  return regrouped;
}

/**
 * Handler for the hast `root` node.
 *
 * Visits the children with `parentNodeType: 'root'`. If any resulting child
 * has a type that `allowedChildren.root` does not list, the whole list is
 * passed through `wrap` (defined in `./wrap.js`).
 *
 * @returns A `root` node, or `null` when there are no children.
 */
export const root: Handler = async function root(
  createNode: CreateNodeFunction,
  node: HastNodes,
  context: Context,
) {
  let children = await visitChildren(createNode, node, {
    ...context,
    parentNodeType: 'root',
  });

  if (
    Array.isArray(children) &&
    children.some(
      (child) => child && !allowedChildren.root.includes(child.type),
    )
  ) {
    children = wrap(children);
  }

  if (!Array.isArray(children) || children.length === 0) {
    return null;
  }

  return createNode('root', {
    children: Array.isArray(children) ? children : [],
  });
};

/**
 * Handler producing `paragraph` nodes (registered for `p` and `summary`).
 *
 * When the parent node type does not allow a paragraph, the children are
 * visited in the parent's context and returned unwrapped.
 *
 * @returns A `paragraph` node, the bare children, or `undefined` when the
 * element produced no children.
 */
export const paragraph: Handler = async function paragraph(
  createNode: CreateNodeFunction,
  node: HastNodes,
  context: Context,
) {
  const isAllowedChild =
    allowedChildren[context.parentNodeType].includes('paragraph');

  const children = await visitChildren(createNode, node, {
    ...context,
    parentNodeType: isAllowedChild ? 'paragraph' : context.parentNodeType,
  });

  if (Array.isArray(children) && children.length) {
    return isAllowedChild ? createNode('paragraph', { children }) : children;
  }
  return undefined;
};

/**
 * Handler for `hr`.
 *
 * @returns A `thematicBreak` node when the parent node type allows one,
 * `undefined` otherwise.
 */
export const thematicBreak: Handler = async function thematicBreak(
  createNode: CreateNodeFunction,
  _node: HastNodes,
  context: Context,
) {
  const isAllowedChild =
    allowedChildren[context.parentNodeType].includes('thematicBreak');

  return isAllowedChild ? createNode('thematicBreak', {}) : undefined;
};

/**
 * Handler for `h1` … `h6`.
 *
 * The level is read from the second character of the tag name and falls
 * back to 1 when it is not a non-zero number. A heading is emitted only if
 * the parent allows it, `'heading'` is in `context.allowedBlocks` and the
 * level is in `context.allowedHeadingLevels`; otherwise the children are
 * visited in the parent's context and returned unwrapped.
 *
 * @returns A `heading` node, the bare children, or `undefined`.
 */
export const heading: Handler = async function heading(
  createNode: CreateNodeFunction,
  node: HastNodes,
  context: Context,
) {
  if (node.type !== 'element') return undefined;

  const level = (Number(node.tagName.charAt(1)) || 1) as DastHeading['level'];
  const isAllowedChild =
    allowedChildren[context.parentNodeType].includes('heading') &&
    context.allowedBlocks.includes('heading') &&
    context.allowedHeadingLevels.includes(level);

  const children = await visitChildren(createNode, node, {
    ...context,
    parentNodeType: isAllowedChild ? 'heading' : context.parentNodeType,
    wrapText: isAllowedChild ? false : context.wrapText,
  });

  if (Array.isArray(children) && children.length) {
    return isAllowedChild
      ? createNode('heading', {
          level,
          children,
        })
      : children;
  }
  return undefined;
};

/**
 * Handler for code-like elements (`pre`, `code`, `kbd`, `samp`, …).
 *
 * - If the parent does not allow a `code` block, the element is handled as
 *   an inline `code` mark.
 * - If `'code'` is not in `context.allowedBlocks`, only the children are
 *   visited.
 * - Otherwise a `code` block is created from the element's text content,
 *   with trailing newlines removed. The language is taken from the first
 *   class name starting with `context.codePrefix` (default `'language-'`),
 *   looked up on the first `<code>` child carrying a `className` for a
 *   `<pre>`, or on the element itself for a `<code>`.
 */
export const code: Handler = async function code(
  createNode: CreateNodeFunction,
  node: HastNodes,
  context: Context,
) {
  const isAllowedChild =
    allowedChildren[context.parentNodeType].includes('code');

  if (!isAllowedChild) {
    return inlineCode(createNode, node, context);
  }

  if (!context.allowedBlocks.includes('code')) {
    return visitChildren(createNode, node, context);
  }

  const prefix =
    typeof context.codePrefix === 'string' ? context.codePrefix : 'language-';
  const isPre = convertElement('pre');
  const isCode = convertElement('code');
  const children =
    node.type === 'element' || node.type === 'root' ? node.children || [] : [];
  let index = -1;
  let classList: Array<string | number> | null = null;
  let language: Record<string, string> = {};

  if (isPre(node)) {
    // Use the class list of the first <code> child that has a className.
    while (++index < children.length) {
      const child = children[index];
      if (
        typeof child === 'object' &&
        child.type === 'element' &&
        isCode(child) &&
        hasProperty(child, 'className')
      ) {
        const cn = child.properties?.className;
        classList = Array.isArray(cn) ? cn : null;
        break;
      }
    }
  } else if (
    node.type === 'element' &&
    isCode(node) &&
    hasProperty(node, 'className')
  ) {
    const cn = node.properties?.className;
    classList = Array.isArray(cn) ? cn : null;
  }

  if (Array.isArray(classList)) {
    index = -1;
    while (++index < classList.length) {
      const className = String(classList[index]);
      if (className.slice(0, prefix.length) === prefix) {
        language = { language: className.slice(prefix.length) };
        break;
      }
    }
  }

  return createNode('code', {
    ...language,
    code: String(wrapText(context, toText(node))).replace(/\n+$/, ''),
  });
};

/**
 * Handler for `blockquote`.
 *
 * @returns A `blockquote` node whose children went through `wrap`, the bare
 * children when a blockquote is not allowed here, or `undefined` when the
 * element produced no children.
 */
export const blockquote: Handler = async function blockquote(
  createNode: CreateNodeFunction,
  node: HastNodes,
  context: Context,
) {
  const isAllowedChild =
    allowedChildren[context.parentNodeType].includes('blockquote') &&
    context.allowedBlocks.includes('blockquote');

  const children = await visitChildren(createNode, node, {
    ...context,
    parentNodeType: isAllowedChild ? 'blockquote' : context.parentNodeType,
  });

  if (Array.isArray(children) && children.length) {
    return isAllowedChild
      ? createNode('blockquote', { children: wrap(children) })
      : children;
  }
  return undefined;
};

/**
 * Handler for `ul`, `ol` and `dir`.
 *
 * When a list is not allowed here, only the children are visited. Otherwise
 * the children are normalized with `wrapListItems` and a `list` node is
 * created, styled `'numbered'` for `ol` and `'bulleted'` for anything else.
 */
export const list: Handler = async function list(
  createNode: CreateNodeFunction,
  node: HastNodes,
  context: Context,
) {
  const isAllowedChild =
    allowedChildren[context.parentNodeType].includes('list') &&
    context.allowedBlocks.includes('list');

  if (!isAllowedChild) {
    return await visitChildren(createNode, node, context);
  }

  const children = await wrapListItems(createNode, node, {
    ...context,
    parentNodeType: 'list',
  });

  if (Array.isArray(children) && children.length) {
    return createNode('list', {
      children,
      style:
        node.type === 'element' && node.tagName === 'ol'
          ? 'numbered'
          : 'bulleted',
    });
  }
  return undefined;
};

/**
 * Handler for `li`, `dt` and `dd`.
 *
 * @returns A `listItem` node whose children went through `wrap`, the bare
 * children when a list item is not allowed here, or `undefined` when the
 * element produced no children.
 */
export const listItem: Handler = async function listItem(
  createNode: CreateNodeFunction,
  node: HastNodes,
  context: Context,
) {
  const isAllowedChild =
    allowedChildren[context.parentNodeType].includes('listItem') &&
    context.allowedBlocks.includes('list');

  const children = await visitChildren(createNode, node, {
    ...context,
    parentNodeType: isAllowedChild ? 'listItem' : context.parentNodeType,
  });

  if (Array.isArray(children) && children.length) {
    return isAllowedChild
      ? createNode('listItem', {
          children: wrap(children),
        })
      : children;
  }
  return undefined;
};

/**
 * Handler for `a`.
 *
 * - If `'link'` is not in `context.allowedBlocks`, only the children are
 *   visited.
 * - A link that directly wraps headings (`h1` … `h6`) is inverted so that
 *   each heading wraps a copy of the link; in that case no `link` node is
 *   created at this level and the converted children are returned as-is.
 * - The `underline` mark is removed from `context.marks` while visiting the
 *   children.
 * - Non-empty `target`, `rel` and `title` attributes are copied to `meta`.
 *
 * The input hast node is never mutated.
 *
 * @returns A `link` node, the bare children, or `undefined`.
 */
export const link: Handler = async function link(
  createNode: CreateNodeFunction,
  node: HastNodes,
  context: Context,
) {
  if (!context.allowedBlocks.includes('link')) {
    return visitChildren(createNode, node, context);
  }

  let isAllowedChild = false;

  if (allowedChildren[context.parentNodeType] === 'inlineNodes') {
    isAllowedChild = inlineNodeTypes.includes('link');
  } else if (Array.isArray(allowedChildren[context.parentNodeType])) {
    isAllowedChild = allowedChildren[context.parentNodeType].includes('link');
  }

  if (!isAllowedChild) {
    // Links that aren't inside an allowedChildren context
    // can still be valid `dast` nodes in the following contexts if wrapped.
    const allowedChildrenWrapped = ['root', 'list', 'listItem'];
    isAllowedChild = allowedChildrenWrapped.includes(context.parentNodeType);
  }

  if (node.type !== 'element') return undefined;
  const e = node;
  const nodeChildren = e.children || [];

  // When a link wraps headings we try to preserve the heading by inverting
  // the parent-child relationship, so that the heading wraps the link.
  //
  // @TODO this is only checking for headings that are direct descendants of
  // links. Decide if it is worth looking deeper.
  const wrapsHeadings = nodeChildren.some(
    (child) => child.type === 'element' && isHeadingTag(child.tagName),
  );

  // Visit a shallow copy rather than rewriting `e.children`: mutating the
  // caller's tree would make a second conversion of the same tree re-wrap
  // links that were already inverted.
  let visitedNode: HastElement = e;
  if (wrapsHeadings) {
    visitedNode = { ...e, children: hoistHeadingsOutOfLink(e) };
    // The copies of the link nested in `visitedNode` are converted by
    // recursive calls of this handler.
    isAllowedChild = false;
  }

  const children = await visitChildren(createNode, visitedNode, {
    ...context,
    parentNodeType: isAllowedChild ? 'link' : context.parentNodeType,
    marks: Array.isArray(context.marks)
      ? context.marks.filter((m) => m !== 'underline')
      : context.marks,
  });

  if (Array.isArray(children) && children.length) {
    if (!isAllowedChild) {
      return children;
    }

    const nodeProps = e.properties;
    const meta: Array<{ id: string; value: string }> = [];

    if (nodeProps) {
      ['target', 'rel', 'title'].forEach((attr) => {
        const raw = nodeProps[attr];
        // Array-valued properties are flattened to a space-separated string.
        const value = Array.isArray(raw) ? raw.join(' ') : raw;
        if (
          value !== undefined &&
          value !== null &&
          value !== false &&
          value !== ''
        ) {
          meta.push({ id: attr, value: String(value) });
        }
      });
    }

    return createNode('link', {
      url: resolveUrl(context, nodeProps?.href),
      children,
      ...(meta.length > 0 ? { meta } : {}),
    });
  }
  return undefined;
};

/**
 * Handler for hast `text` nodes.
 *
 * @returns A `span` node holding the text (processed by `wrapText`) and the
 * subset of `context.marks` that is present in `context.allowedMarks`, or
 * `undefined` when the node is not a text node.
 */
export const span: Handler = async function span(
  createNode: CreateNodeFunction,
  node: HastNodes,
  context: Context,
) {
  if (node.type !== 'text') return undefined;

  const marks: { marks?: Mark[] } = {};

  if (Array.isArray(context.marks)) {
    const allowedMarks = context.marks.filter((mark) =>
      context.allowedMarks.includes(mark),
    );
    if (allowedMarks.length > 0) {
      marks.marks = allowedMarks;
    }
  }

  return createNode('span', {
    value: wrapText(context, node.value),
    ...marks,
  });
};

/** Handler for `br`: produces a `span` whose value is a single newline. */
export const newLine: Handler = async function newLine(
  createNode: CreateNodeFunction,
) {
  return createNode('span', {
    value: '\n',
  });
};

/** Mark handlers, each adding one mark to the text found inside the element. */
export const inlineCode = withMark('code');
export const strong = withMark('strong');
export const italic = withMark('emphasis');
export const underline = withMark('underline');
export const strikethrough = withMark('strikethrough');
export const highlight = withMark('highlight');

/**
 * Handler for `head`: delegates the first direct `<base>` child, if any, to
 * `context.handlers.base`. Everything else inside `<head>` is ignored.
 */
export const head: Handler = async function head(
  createNode: CreateNodeFunction,
  node: HastNodes,
  context: Context,
) {
  if (node.type !== 'element') return undefined;

  const baseElement = (node.children || []).find(
    (child) => child.type === 'element' && child.tagName === 'base',
  );

  if (baseElement) {
    return context.handlers.base(createNode, baseElement, context);
  } else {
    return undefined;
  }
};

/**
 * Handler for `base`: stores the `href` of the first `<base>` encountered
 * (without a trailing slash) in `context.global.baseUrl` and sets
 * `context.global.baseUrlFound`. Never produces a node.
 */
export const base: Handler = async function base(
  _createNode: CreateNodeFunction,
  node: HastNodes,
  context: Context,
) {
  if (node.type !== 'element') return undefined;

  if (
    !context.global.baseUrlFound &&
    typeof node.properties === 'object' &&
    node.properties?.href
  ) {
    context.global.baseUrl = String(node.properties.href).replace(/\/$/, '');
    context.global.baseUrlFound = true;
  }
  return undefined;
};

/**
 * Handler for `span`: derives marks from the inline `style` attribute and
 * visits the children with those marks added to the ones already in
 * `context.marks`.
 *
 * Recognized declarations:
 * - `font-weight: bold` or a numeric weight above 400 → `strong`
 * - `font-style: italic` → `emphasis`
 * - `text-decoration: underline` → `underline`
 *
 * A mark is added only if it is in `context.allowedMarks`, is not already
 * present, and is not `underline` directly inside a link.
 */
export const extractInlineStyles: Handler = async function extractInlineStyles(
  createNode: CreateNodeFunction,
  node: HastNodes,
  context: Context,
) {
  const accumulated: Mark[] = Array.isArray(context.marks)
    ? [...context.marks]
    : [];
  const properties = node.type === 'element' ? node.properties : undefined;

  if (properties && typeof properties.style === 'string') {
    const newMarks: Mark[] = [];

    String(properties.style)
      .split(';')
      .forEach((declaration) => {
        const [firstChunk, ...otherChunks] = declaration.split(':');
        const prop = firstChunk.trim();
        // Re-join so that values that themselves contain ':' stay intact.
        const value = otherChunks.join(':').trim();

        switch (prop) {
          case 'font-weight':
            if (value === 'bold' || Number(value) > 400) {
              newMarks.push('strong');
            }
            break;
          case 'font-style':
            if (value === 'italic') {
              newMarks.push('emphasis');
            }
            break;
          case 'text-decoration':
            if (value === 'underline') {
              newMarks.push('underline');
            }
            break;
          default:
            break;
        }
      });

    newMarks.forEach((mark) => {
      if (
        !accumulated.includes(mark) &&
        context.allowedMarks.includes(mark) &&
        !(context.parentNodeType === 'link' && mark === 'underline')
      ) {
        accumulated.push(mark);
      }
    });
  }

  const marksContext: { marks?: Mark[] } =
    accumulated.length > 0 ? { marks: accumulated } : {};

  return visitChildren(createNode, node, {
    ...context,
    ...marksContext,
  });
};

/** Handler that ignores the element entirely. */
// eslint-disable-next-line @typescript-eslint/no-empty-function
export function noop(): void {}

/**
 * Builds a handler that visits an element's children with `type` added to
 * `context.marks`.
 *
 * The mark is not added when it is missing from `context.allowedMarks`, or
 * when it is `underline` directly inside a link; the children are then
 * visited with the unchanged context.
 *
 * @param type - The mark to add.
 * @returns The handler.
 */
export function withMark(type: Mark): Handler {
  return function markHandler(
    createNode: CreateNodeFunction,
    node: HastNodes,
    context: Context,
  ) {
    if (
      !context.allowedMarks.includes(type) ||
      (context.parentNodeType === 'link' && type === 'underline')
    ) {
      return visitChildren(createNode, node, context);
    }

    const marks: { marks: Mark[] } = Array.isArray(context.marks)
      ? {
          marks: context.marks.includes(type)
            ? context.marks
            : context.marks.concat([type]),
        }
      : { marks: [type] };

    return visitChildren(createNode, node, {
      ...context,
      ...marks,
    });
  };
}

/**
 * Default handlers, keyed by hast element tag name or hast node type
 * (`root`, `text`, `comment`).
 */
export const handlers = {
  root: root,

  p: paragraph,
  summary: paragraph,

  h1: heading,
  h2: heading,
  h3: heading,
  h4: heading,
  h5: heading,
  h6: heading,

  ul: list,
  ol: list,
  dir: list,

  dt: listItem,
  dd: listItem,
  li: listItem,

  listing: code,
  plaintext: code,
  pre: code,
  xmp: code,

  blockquote: blockquote,

  a: link,

  code: code,
  kbd: code,
  samp: code,
  tt: code,
  var: code,

  strong: strong,
  b: strong,

  em: italic,
  i: italic,

  u: underline,

  strike: strikethrough,
  s: strikethrough,

  mark: highlight,

  base: base,

  span: extractInlineStyles,
  text: span,
  br: newLine,
  hr: thematicBreak,

  head: head,
  comment: noop,
  script: noop,
  style: noop,
  title: noop,
  video: noop,
  audio: noop,
  embed: noop,
  iframe: noop,
};

/**
 * Visits the children of a list element and makes sure every defined result
 * is a `listItem`.
 *
 * A child that is not a `listItem` is wrapped in a new one; if its type is
 * not listed in `allowedChildren.listItem` it is first wrapped in a
 * `paragraph`.
 *
 * @returns The normalized children, or an empty array when visiting did not
 * yield an array.
 */
export async function wrapListItems(
  createNode: CreateNodeFunction,
  node: HastNodes,
  context: Context,
): Promise<Node[]> {
  const children = await visitChildren(createNode, node, context);

  if (!Array.isArray(children)) {
    return [];
  }

  let index = -1;
  while (++index < children.length) {
    const child = children[index];
    if (typeof child !== 'undefined' && child.type !== 'listItem') {
      const wrappedChild = allowedChildren.listItem.includes(child.type)
        ? child
        : createNode('paragraph', { children: [child] });
      children[index] = createNode('listItem', {
        children: [wrappedChild],
      });
    }
  }

  return children;
}

/**
 * Returns `value` untouched when `context.wrapText` is truthy; otherwise
 * replaces every line break (`\r\n`, `\n` or `\r`) with a single space.
 */
export function wrapText(context: Context, value: string): string {
  return context.wrapText ? value : value.replace(/\r?\n|\r/g, ' ');
}

/**
 * Resolves a link `href` against the document's base URL, if one was found.
 *
 * - `null` / `undefined` become the empty string.
 * - Without a base URL (or without a global `URL` class) the href is
 *   returned as a string, unchanged.
 * - Hrefs starting with `/` or `./` additionally get the base URL's path
 *   prepended when the resolved path does not already start with it.
 *   Protocol-relative hrefs (`//host/path`) point to another origin and are
 *   therefore not treated as path-relative.
 * - If the href or the base URL cannot be parsed (for instance a
 *   non-absolute `<base href="/app/">`), the href is returned unchanged
 *   instead of aborting the conversion.
 *
 * @param context - Conversion context; only `context.global.baseUrl` is read.
 * @param url - Raw `href` property value.
 * @returns The resolved URL string.
 */
export function resolveUrl(context: Context, url: unknown): string {
  if (url === null || url === undefined) {
    return '';
  }

  const urlString = String(url);

  if (context.global.baseUrl && typeof URL !== 'undefined') {
    // `/path` and `./path`, but not protocol-relative `//host/path`.
    const isRelative = /^\.?\/(?!\/)/.test(urlString);

    try {
      const parsed = new URL(urlString, context.global.baseUrl);

      if (isRelative) {
        const parsedBase = new URL(context.global.baseUrl);
        if (!parsed.pathname.startsWith(parsedBase.pathname)) {
          parsed.pathname = `${parsedBase.pathname}${parsed.pathname}`;
        }
      }

      return parsed.toString();
    } catch {
      // `new URL` throws a TypeError on unparsable input; keep the raw href.
      return urlString;
    }
  }

  return urlString;
}