/*
 * substance
 * Version: (not given)
 * Substance is a JavaScript library for web-based content editing. It provides building blocks for realizing custom text editors and web-based publishing systems.
 * Listing: 678 lines (613 loc), 21.7 kB, JavaScript
 */
import { last, createCountingIdGenerator, ArrayIterator } from '../util'
const WS_LEFT = /^\s+/g
// TODO: this is probably incorrect, /^\s*/ would always be a match
// const WS_LEFT_ALL = /^\s*/g
const WS_RIGHT = /\s+$/g
const WS_ALL = /\s+/g
// var ALL_WS_NOTSPACE_LEFT = /^[\t\n]+/g
// var ALL_WS_NOTSPACE_RIGHT = /[\t\n]+$/g
const SPACE = " "
const TABS_OR_NL = /[\t\n\r]+/g
const INVISIBLE_CHARACTER = "\u200B"
/**
A generic base implementation for XML/HTML importers.
@param {Object} config
@param {DocumentSchema} config.schema
@param {object[]} config.converters
*/
export default
class DOMImporter {
constructor(config, context) {
this.context = context || {}
if (!config.schema) {
throw new Error('"config.schema" is mandatory')
}
if (!config.converters) {
throw new Error('"config.converters" is mandatory')
}
this.config = Object.assign({ idAttribute: 'id' }, config)
this.schema = config.schema
this.converters = config.converters
this.state = null
this._defaultBlockConverter = null
this._allConverters = []
this._blockConverters = []
this._propertyAnnotationConverters = []
this.state = new DOMImporter.State()
this._initialize()
}
/*
Goes through all converters, checks their consistency
and registers them depending on the type in different sets.
*/
_initialize() {
const schema = this.schema
const defaultTextType = schema.getDefaultTextType()
const converters = this.converters
for (let i = 0; i < converters.length; i++) {
let converter
if (typeof converters[i] === 'function') {
const Converter = converters[i]
converter = new Converter()
} else {
converter = converters[i]
}
if (!converter.type) {
throw new Error('Converter must provide the type of the associated node.')
}
if (!converter.matchElement && !converter.tagName) {
throw new Error('Converter must provide a matchElement function or a tagName property.')
}
if (!converter.matchElement) {
converter.matchElement = this._defaultElementMatcher.bind(converter)
}
const NodeClass = schema.getNodeClass(converter.type)
if (!NodeClass) {
throw new Error('No node type defined for converter')
}
if (!this._defaultBlockConverter && defaultTextType === converter.type) {
this._defaultBlockConverter = converter
}
this._allConverters.push(converter)
// Defaults to _blockConverters
if (NodeClass.prototype._isPropertyAnnotation) {
this._propertyAnnotationConverters.push(converter)
} else {
this._blockConverters.push(converter)
}
}
if (!this._defaultBlockConverter) {
throw new Error(`No converter for defaultTextType ${defaultTextType}`)
}
}
dispose() {
if (this.state.doc) {
this.state.doc.dispose()
}
}
/**
Resets this importer.
Make sure to either create a new importer instance or call this method
when you want to generate nodes belonging to different documents.
*/
reset() {
if (this.state.doc) {
this.state.doc.dispose()
}
this.state.reset()
this.state.doc = this._createDocument()
}
getDocument() {
return this.state.doc
}
/**
Converts all children of a given element and creates a Container node.
@param {DOMElement[]} elements All elements that should be converted into the container.
@param {String} containerId The id of the target container node.
@returns {Container} the container node
*/
convertContainer(elements, containerId) {
if (!this.state.doc) this.reset()
const state = this.state
const iterator = new ArrayIterator(elements)
const nodeIds = []
while(iterator.hasNext()) {
const el = iterator.next()
let node
const blockTypeConverter = this._getConverterForElement(el, 'block')
if (blockTypeConverter) {
state.pushContext(el.tagName, blockTypeConverter)
let nodeData = this._createNodeData(el, blockTypeConverter.type)
nodeData = blockTypeConverter.import(el, nodeData, this) || nodeData
node = this._createNode(nodeData)
let context = state.popContext()
context.annos.forEach((a) => {
this._createNode(a)
})
} else if (el.isCommentNode()) {
continue
} else {
// skip empty text nodes
if (el.isTextNode() && /^\s*$/.exec(el.textContent)) continue
// If we find text nodes on the block level we wrap
// it into a paragraph element (or what is configured as default block level element)
iterator.back()
node = this._wrapInlineElementsIntoBlockElement(iterator)
}
if (node) {
nodeIds.push(node.id)
}
}
return this._createNode({
type: 'container',
id: containerId,
nodes: nodeIds
})
}
/**
Converts a single HTML element and creates a node in the current document.
@param {ui/DOMElement} el the HTML element
@returns {object} the created node as JSON
*/
convertElement(el) {
if (!this.state.doc) this.reset()
let isTopLevel = !this.state.isConverting
if (isTopLevel) {
this.state.isConverting = true
}
let nodeData, annos
const converter = this._getConverterForElement(el)
if (converter) {
const NodeClass = this.schema.getNodeClass(converter.type)
nodeData = this._createNodeData(el, converter.type)
this.state.pushContext(el.tagName, converter)
// Note: special treatment for property annotations and inline nodes
// i.e. if someone calls `importer.convertElement(annoEl)`
// usually, annotations are imported via `importer.annotatedText(..)`
// The peculiarity here is that in such a case, it is not
// not clear, which property the annotation should be attached to.
if (NodeClass.isInline) {
nodeData = this._convertInlineNode(el, nodeData, converter)
}
else if (NodeClass.prototype._isPropertyAnnotation) {
nodeData = this._convertPropertyAnnotation(el, nodeData)
} else {
nodeData = converter.import(el, nodeData, this) || nodeData
}
let context = this.state.popContext()
annos = context.annos
} else {
throw new Error('No converter found for '+el.tagName)
}
// create the node
const node = this._createNode(nodeData)
// and all annos which have been created during this call
annos.forEach((a) => {
this._createNode(a)
})
// HACK: to allow using an importer stand-alone
// i.e. creating detached elements
if (this.config["stand-alone"] && isTopLevel) {
this.state.isConverting = false
this.reset()
}
return node
}
/**
Convert annotated text. You should call this method only for elements
containing rich-text.
@param {DOMElement} el
@param {String[]} path The target property where the extracted text (plus annotations) should be stored.
@param {Object} options
@param {Boolean} options.preserveWhitespace when true will preserve whitespace. Default: false.
@returns {String} The converted text as plain-text
@example
```
p.content = converter.annotatedText(pEl, [p.id, 'content'])
```
*/
annotatedText(el, path, options={}) {
if (!path) {
throw new Error('path is mandatory')
}
const state = this.state
const context = last(state.contexts)
// NOTE: this API is meant for node converters, which have been triggered
// via convertElement().
if (!context) {
throw new Error('This should be called from within an element converter.')
}
// TODO: are there more options?
const oldPreserveWhitespace = state.preserveWhitespace
if (options.preserveWhitespace) {
state.preserveWhitespace = true
}
state.stack.push({ path: path, offset: 0, text: "", annos: []})
// IMO we should reset the last char, as it is only relevant within one
// annotated text property. This feature is mainly used to eat up
// whitespace in XML/HTML at tag boundaries, produced by pretty-printed XML/HTML.
this.state.lastChar = ''
const iterator = this.getChildNodeIterator(el)
const text = this._annotatedText(iterator)
// now we can create all annotations which have been created during this
// call of annotatedText
const top = state.stack.pop()
context.annos = context.annos.concat(top.annos)
// reset state
state.preserveWhitespace = oldPreserveWhitespace
return text
}
/**
Converts the given element as plain-text.
@param {ui/DOMElement} el
@returns {String} The plain text
*/
plainText(el) {
var state = this.state
var text = el.textContent
if (state.stack.length > 0) {
var context = last(state.stack)
context.offset += text.length
context.text += context.text.concat(text)
}
return text
}
/*
Tells the converter to insert custom text.
During conversion of annotatedText this is used to insert different
text than taken from the DOM. E.g., for inline nodes we insert an invisible
character instead of the inner content.
@private
@param {String}
*/
_customText(text) {
var state = this.state
if (state.stack.length > 0) {
var context = last(state.stack)
context.offset += text.length
context.text += context.text.concat(text)
}
return text
}
/**
Generates an id. The generated id is unique with respect to all ids generated so far.
@param {String} a prefix
@return {String} the generated id
*/
nextId(prefix) {
// TODO: we could create more beautiful ids?
// however we would need to be careful as there might be another
// element in the HTML coming with that id
// For now we use shas
return this.state.uuid(prefix)
}
_getNextId(dom, type) {
let id = this.nextId(type)
while (this.state.ids[id] || dom.find('#'+id)) {
id = this.nextId(type)
}
return id
}
_getIdForElement(el, type) {
let id = el.getAttribute(this.config.idAttribute)
if (id && !this.state.ids[id]) return id
return this._getNextId(el.getOwnerDocument(), type)
}
// Note: this is e.g. shared by ClipboardImporter which has a different
// implementation of this.createDocument()
_createDocument() {
// create an empty document and initialize the container if not present
const schema = this.config.schema
const DocumentClass = schema.getDocumentClass()
return new DocumentClass(schema)
}
_convertPropertyAnnotation(el, nodeData) {
const path = [nodeData.id, '_content']
// if there is no context, this is called stand-alone
// i.e., user tries to convert an annotation element
// directly, not part of a block element, such as a paragraph
nodeData._content = this.annotatedText(el, path)
nodeData.start = { path, offset: 0 }
nodeData.end = { offset: nodeData._content.length }
return nodeData
}
_convertInlineNode(el, nodeData, converter) {
const path = [nodeData.id, '_content']
if (converter.import) {
nodeData = converter.import(el, nodeData, this) || nodeData
}
nodeData._content = '$'
nodeData.start = { path, offset: 0 }
nodeData.end = { offset: 1 }
return nodeData
}
_createNodeData(el, type) {
if (!type) {
throw new Error('type is mandatory.')
}
let nodeData = {
type,
id: this._getIdForElement(el, type)
}
this.state.ids[nodeData.id] = true
return nodeData
}
_createNode(nodeData) {
let doc = this.state.doc
// NOTE: if your Document implementation adds default nodes in the constructor
// and you have exported the node, we need to remove the default version first
// TODO: alternatively we could just update the existing one. For now we remove the old one.
let node = doc.get(nodeData.id)
if (node) {
// console.warn('Node with same it already exists.', node)
doc.delete(node.id)
}
return doc.create(nodeData)
}
getChildNodeIterator(el) {
return el.getChildNodeIterator()
}
_defaultElementMatcher(el) {
return el.is(this.tagName)
}
/*
Internal function for parsing annotated text
*/
_annotatedText(iterator) {
const state = this.state
const context = last(state.stack)
/* istanbul ignore next */
if (!context) {
throw new Error('Illegal state: context is null.')
}
while(iterator.hasNext()) {
var el = iterator.next()
var text = ""
/* istanbul ignore else */
// Plain text nodes...
if (el.isTextNode()) {
text = this._prepareText(el.textContent)
if (text.length) {
// Note: text is not merged into the reentrant state
// so that we are able to return for this reentrant call
context.text = context.text.concat(text)
context.offset += text.length
}
} else if (el.isCommentNode()) {
// skip comment nodes
continue
} else if (el.isElementNode()) {
const annoConverter = this._getConverterForElement(el, 'inline')
// if no inline converter is found we just traverse deeper
if (!annoConverter) {
/* istanbul ignore next */
if (!this.IGNORE_DEFAULT_WARNINGS) {
console.warn('Unsupported inline element. We will not create an annotation for it, but process its children to extract annotated text.', el.outerHTML)
}
// this descends into children elements without introducing a new stack frame
// and without creating an element.
const iterator = this.getChildNodeIterator(el)
this._annotatedText(iterator)
continue
}
// reentrant: we delegate the conversion to the inline node class
// it will either call us back (this.annotatedText) or give us a finished
// node instantly (self-managed)
var startOffset = context.offset
const annoType = annoConverter.type
const AnnoClass = this.schema.getNodeClass(annoType)
let annoData = this._createNodeData(el, annoType)
// push a new context so we can deal with reentrant calls
let stackFrame = {
path: context.path,
offset: startOffset,
text: "",
annos: []
}
state.stack.push(stackFrame)
// with custom import
if (annoConverter.import) {
state.pushContext(el.tagName, annoConverter)
annoData = annoConverter.import(el, annoData, this) || annoData
state.popContext()
}
// As opposed to earlier implementations we do not rely on
// let the content be converted by custom implementations
// as they do not own the content
// TODO: we should make sure to throw when the user tries to
if (AnnoClass.isInline) {
this._customText(INVISIBLE_CHARACTER)
// TODO: check if this is correct; after reading an inline,
// we need to reset the lastChar, so that the next whitespace
// does not get skipped
state.lastChar = ''
} else {
// We call this to descent into the element
// which could be 'forgotten' otherwise.
// TODO: what if the converter has processed the element already?
const iterator = this.getChildNodeIterator(el)
this._annotatedText(iterator)
}
// ... and transfer the result into the current context
state.stack.pop()
context.offset = stackFrame.offset
context.text = context.text.concat(stackFrame.text)
// in the mean time the offset will probably have changed to reentrant calls
const endOffset = context.offset
annoData.start = {
path: context.path.slice(0),
offset: startOffset
}
annoData.end = {
offset: endOffset
}
// merge annos into parent stack frame
let parentFrame = last(state.stack)
parentFrame.annos = parentFrame.annos.concat(stackFrame.annos, annoData)
} else {
console.warn('Unknown element type. Taking plain text.', el.outerHTML)
text = this._prepareText(el.textContent)
context.text = context.text.concat(text)
context.offset += text.length
}
}
// return the plain text collected during this reentrant call
return context.text
}
_getConverterForElement(el, mode) {
var converters
if (mode === "block") {
if (!el.tagName) return null
converters = this._blockConverters
} else if (mode === "inline") {
converters = this._propertyAnnotationConverters
} else {
converters = this._allConverters
}
var converter = null
for (var i = 0; i < converters.length; i++) {
if (this._converterCanBeApplied(converters[i], el)) {
converter = converters[i]
break
}
}
return converter
}
_converterCanBeApplied(converter, el) {
return converter.matchElement(el, this)
}
/*
Wraps the remaining (inline) elements of a node iterator into a default
block node.
@param {DOMImporter.ChildIterator} childIterator
@returns {object} node data
*/
_wrapInlineElementsIntoBlockElement(childIterator) {
if (!childIterator.hasNext()) return
let dom = childIterator.peek().getOwnerDocument()
let wrapper = dom.createElement('wrapper')
while(childIterator.hasNext()) {
const el = childIterator.next()
// if there is a block node we finish this wrapper
const blockTypeConverter = this._getConverterForElement(el, 'block')
if (blockTypeConverter) {
childIterator.back()
break
}
wrapper.append(el.clone())
}
const type = this.schema.getDefaultTextType()
const id = this._getNextId(dom, type)
const converter = this._defaultBlockConverter
let nodeData = { type, id }
this.state.pushContext('wrapper', converter)
nodeData = converter.import(wrapper, nodeData, this) || nodeData
let context = this.state.popContext()
let annos = context.annos
// create the node
const node = this._createNode(nodeData)
// and all annos which have been created during this call
annos.forEach((a) => {
this._createNode(a)
})
return node
}
// TODO: this needs to be tested and documented
// TODO: after recent work with XML we found that
// doing white-space handling here is not optimal
// instead it should be done as a preprocessing step
_prepareText(text) {
const state = this.state
if (state.preserveWhitespace) {
return text
}
var repl = SPACE
// replace multiple tabs and new-lines by one space
text = text.replace(TABS_OR_NL, '')
// TODO: the last char handling is only necessary for for nested calls
// i.e., when processing the content of an annotation, for instance
// we need to work out how we could control this with an inner state
// TODO: this is incorrect: replacing /\s*/ will insert a space
// even if there is not one present
if (state.lastChar === SPACE) {
// replace any double space, even if it is across element boundary
text = text.replace(WS_LEFT, '')
} else {
text = text.replace(WS_LEFT, repl)
}
text = text.replace(WS_RIGHT, repl)
// EXPERIMENTAL: also remove white-space within
// this happens if somebody treats the text more like it would be done in Markdown
// i.e. introducing line-breaks
if (this.config.REMOVE_INNER_WS || state.removeInnerWhitespace) {
text = text.replace(WS_ALL, SPACE)
}
state.lastChar = text[text.length-1] || state.lastChar
return text
}
/*
Removes any leading and trailing whitespaces from the content
within the given element.
Attention: this is not yet implemented fully. Atm, trimming is only done
on the first and last text node (if they exist).
*/
_trimTextContent(el) {
var nodes = el.getChildNodes()
var firstNode = nodes[0]
var lastNode = last(nodes)
var text, trimmed
// trim the first and last text
if (firstNode && firstNode.isTextNode()) {
text = firstNode.textContent
trimmed = this._trimLeft(text)
firstNode.textContent = trimmed
}
if (lastNode && lastNode.isTextNode()) {
text = lastNode.textContent
trimmed = this._trimRight(text)
lastNode.textContent = trimmed
}
return el
}
_trimLeft(text) {
return text.replace(WS_LEFT, "")
}
_trimRight(text) {
return text.replace(WS_RIGHT, "")
}
}
class DOMImporterState {
constructor() {
this.reset()
}
reset() {
this.preserveWhitespace = false
this.nodes = []
this.annotations = []
this.containerId = null
this.container = []
this.ids = {}
// stack for reentrant calls into convertElement()
this.contexts = []
// stack for reentrant calls into annotatedText()
this.stack = []
this.lastChar = ""
this.skipTypes = {}
this.ignoreAnnotations = false
this.isConverting = false
// experimental: trying to generate simpler ids during import
// this.uuid = uuid
this.uuid = createCountingIdGenerator()
}
pushContext(tagName, converter) {
this.contexts.push({ tagName: tagName, converter: converter, annos: []})
}
popContext() {
return this.contexts.pop()
}
getCurrentContext() {
return last(this.contexts)
}
}
DOMImporter.State = DOMImporterState
DOMImporter.INVISIBLE_CHARACTER = INVISIBLE_CHARACTER