opendocx-node: OpenDocx document assembly for Node.js
/* eslint-disable comma-dangle */
const docxTemplater = require('./docx-templater')
const XmlAssembler = require('./docx-evaluator')
const yatte = require('yatte')
const fs = require('fs')
// const { Transform } = require('stream')
const OD = yatte.FieldTypes
const Atomizer = require('./field-expr-namer')
const version = require('./version')
const loadTemplateModule = require('./load-template-module')
const { docxToMarkdown, markdownToDocx } = require('./pandoc')
// const asyncPool = require('tiny-async-pool')
// const { different } = require('simple-comparator')
/**
* Transform an OpenDocx template to produce the following reusable artifacts:
* * DocxGenTemplate (a DOCX template compatible with DocxGen/OpenXmlPowerTools)
 * * ExtractedLogicTree (a Yatte 'template AST' containing a minimal "logic tree"
 *   that encapsulates all data and transformations the template calls for)
* * ExtractedLogic (a CommonJS module that will dynamically transform
* any Yatte data context into an XML data file compatible with the above
* DocxGen/OpenXmlPowerTools template)
* * Preview (a markdown-format template representing a preview of the template's content)
* * HasErrors (a boolean value indicating whether errors were encountered in the transformation)
* * Errors (an array of strings representing error messages encountered in the transformation)
*
* Intended to be called once whenever a new template or version is put into service. The artifacts it
* creates on disk then remain in place and act as a cache to prevent unnecessary work, as re-creating
* these artifacts is relatively expensive.
*
* @param {string} templatePath the path to the OpenDocx template on the local disk
* @param {boolean} removeCustomProperties whether to remove any custom document properties
* that may be embedded in the OpenDocx template itself
 * @param {string[]} keepPropertyNames if removeCustomProperties is true, this is a list of
 *                                     names of custom properties that should be preserved
 *                                     through that process; in other words, remove all
 *                                     custom properties EXCEPT these.
 * @param {boolean} cleanUpArtifacts whether interim artifacts created during the transformation process
 *                                   should be cleaned up (the default) or left in place for diagnostic purposes
*/
async function compileDocx (
templatePath,
removeCustomProperties = true,
keepPropertyNames = [],
cleanUpArtifacts = true
) {
  // first pre-process the given template file, which
  // (1) leaves a unique "tag" on each field in the template, which we will use to refer to those fields later; and
  // (2) extracts the content of each field (in order) into a JSON file for further processing.
  // This initial step also removes custom document properties from the template (honoring keepPropertyNames) if requested.
const options = {
templateFile: templatePath,
removeCustomProperties,
keepPropertyNames
}
const result = await docxTemplater.extractFields(options)
options.originalTemplateFile = templatePath
options.templateFile = result.TempTemplate
let previewPromise
  const pandocEnabled = !process.env.DISABLE_PANDOC ||
    process.env.DISABLE_PANDOC === 'FALSE' || process.env.DISABLE_PANDOC === '0'
  if (pandocEnabled) {
previewPromise = docxTemplater.flattenFields(options)
}
const fieldList = JSON.parse(fs.readFileSync(result.ExtractedFields, 'utf8'))
const fieldLookup = indexFields(fieldList)
// use the yatte engine to parse all the fields, creating an AST for the template
const ast = yatte.Engine.parseContentArray(fieldList) // this will throw if there are mismatched paired fields!!
// create a map from field ID to nodes in the AST, and save it in a temp file
const fieldDict = {}
const atoms = new Atomizer()
buildFieldDictionary(ast, fieldDict, atoms) // this also atomizes expressions in fields
// note: as of 2.0.0-alpha, it ALSO mutates ast, adding atom annotations for expressions
const fieldDictPath = templatePath + 'obj.fields.json'
fs.writeFileSync(fieldDictPath, JSON.stringify(fieldDict)) // JSON Dictionary <string fieldNum, object atomizedExpr>
// now use the pre-processed template and the field map to create a DocxGen template
options.templateFile = result.TempTemplate
options.originalTemplateFile = templatePath
options.fieldInfoFile = fieldDictPath
const ttpl = await docxTemplater.compileTemplate(options)
ttpl.Template = templatePath
// sample: temporarily serialize the AST out to a temp file so we can inspect it!
// const astPath = templatePath + 'obj.ast.json'
// fs.writeFileSync(astPath, JSON.stringify(ast))
// sample: temporarily convert the field dict BACK to an AST, and compare with the original,
// to make sure we can faithfully reproduce it!
// const ast2 = fieldDictToContentTree(fieldDict, true)
// if (different(ast, ast2)) {
// throw new Error('Error in fieldDictToContentTree!')
// }
// simplify the logic of the AST and save it for potential future use
const simplifiedAstPath = templatePath + '.json'
const rast = yatte.Engine.buildLogicTree(ast) // prunes logically insignificant nodes from ast
fs.writeFileSync(simplifiedAstPath, JSON.stringify(rast))
ttpl.ExtractedLogicTree = simplifiedAstPath
// use the simplified AST to dynamically create CommonJS module capable of creating a DocxGen XML data file
// (matched to the DocxGen template) from any OpenDocx/Yatte data context
const outputJsPath = templatePath + '.js'
fs.writeFileSync(outputJsPath, createTemplateJsModule(rast))
ttpl.ExtractedLogic = outputJsPath
  // NOTE: we will be investigating other ways of processing the AST dynamically,
  // so maybe we should write out only the .json rather than a .js/CommonJS module at all? Might be more secure.
  // The hang-up is that the .js contains the necessary atomized expressions, and the .json does not.
let previewResult
  if (pandocEnabled) {
try {
previewResult = await previewPromise // make sure this is done before cleanup
// TODO: the following streaming code works, but the converted Markdown ends with a line break (\n) that we want
// to truncate, and I am not sure how to do that with the streaming code. So we are reading everything into memory
// instead, for now.
// const fieldReplaceTransform = new Transform({
// transform (chunk, encoding, callback) {
// const schunk = chunk.toString('utf-8')
// .replace(/\r\n/g, '\n')
// schunk.split(/=:(\d+):=/g)
// .forEach((item, index) => {
// if (index % 2 === 0) {
// this.push(item)
// } else {
// this.push(`{[${fieldLookup[item]}]}`)
// }
// })
// callback()
// }
// })
// const translatedPreviewPromise = new Promise((resolve, reject) => {
// const inputStream = fs.createReadStream(previewResult.DocxGenTemplate)
// const outputStream = fs.createWriteStream(templatePath + '.md')
// docxToMarkdown.stream(inputStream)
// .pipe(fieldReplaceTransform)
// .pipe(outputStream)
// .on('finish', resolve)
// .on('error', reject)
// })
// await translatedPreviewPromise
const markdownStream = docxToMarkdown.stream(fs.createReadStream(previewResult.DocxGenTemplate))
const chunks = []
for await (const chunk of markdownStream) {
chunks.push(chunk)
}
const buffer = Buffer.concat(chunks)
let previewStr = buffer.toString('utf-8')
.replace(/\r\n/g, '\n') // normalize line breaks
if (previewStr.endsWith('\n')) {
previewStr = previewStr.slice(0, -1) // truncate final line break
}
// reconstitute fields
previewStr = previewStr.split(/=:(\d+):=/g)
.map((item, index) => (index % 2) === 0 ? item : `{[${fieldLookup[item]}]}`)
.join('')
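      // For illustration (field id and content are hypothetical): if field 3's extracted
      // content were 'Client.Name', the flattened text 'Dear =:3:=,' would now read
      // 'Dear {[Client.Name]},' which is valid yatte text template syntax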
// ensure the converted preview string is a valid yatte text template! (otherwise error)
const compiledPreview = yatte.compileText(previewStr)
if (!compiledPreview.error) {
// persist in preview file
await fs.promises.writeFile((ttpl.Preview = templatePath + '.md'), previewStr, 'utf-8')
} else {
console.log(`Warning: unable to generate valid markdown preview for template ${templatePath}`)
}
} catch (err) {
console.error(err)
}
} else {
console.log(`Warning: Pandoc disabled; unable to generate markdown preview for template ${templatePath}`)
}
// clean up interim/temp/obj files
if (cleanUpArtifacts) {
fs.unlinkSync(result.ExtractedFields)
fs.unlinkSync(fieldDictPath)
fs.unlinkSync(result.TempTemplate)
if (previewResult && previewResult.DocxGenTemplate) {
fs.unlinkSync(previewResult.DocxGenTemplate)
}
} else {
ttpl.ExtractedFields = result.ExtractedFields
ttpl.FieldMap = fieldDictPath
ttpl.TempTemplate = result.TempTemplate
if (previewResult && previewResult.DocxGenTemplate) {
ttpl.TempPreview = previewResult.DocxGenTemplate
}
}
// result looks like:
// {
// Template: "c:\path\to\template.docx",
// ExtractedLogic: "c:\path\to\template.docx.js",
// ExtractedLogicTree: "c:\path\to\template.docx.json",
// DocxGenTemplate: "c:\path\to\template.docxgen.docx",
// Preview: "c:\path\to\template.docx.md",
// HasErrors: false,
// Errors: [], // if there are errors, this is an array of strings
// }
return ttpl
}
compileDocx.version = version
exports.compileDocx = compileDocx
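// Example usage of compileDocx (a minimal sketch; the path below is hypothetical):
//   const { compileDocx } = require('opendocx-node')
//   const artifacts = await compileDocx('/templates/letter.docx')
//   if (artifacts.HasErrors) console.error(artifacts.Errors)
//   else console.log(artifacts.DocxGenTemplate, artifacts.ExtractedLogic, artifacts.Preview)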
/**
* Does the minimal work to ensure that an OpenDocx template has been compiled/transformed
* for use with DocxGen/OpenXmlPowerTools. Performs transformations only if required artifacts
* do not already exist OR if they are outdated versions that no longer function correctly.
*
* @param {string} templatePath the path to the OpenDocx template on the local disk
*/
async function validateCompiledDocx (templatePath, logicOnly = false) {
// templatePath should have been compiled (previously) so the expected files will be on disk
// but if not we'll compile it now
const extractedLogic = templatePath + '.js'
const docxGenTemplate = templatePath + 'gen.docx'
const previewTemplate = templatePath + '.md'
let needRegen = false
if (!fs.existsSync(extractedLogic) || (!logicOnly && !fs.existsSync(docxGenTemplate))) {
console.log(
'Warning: compiled template not found; generating. Pre-compile to maximize performance\n ' + templatePath)
needRegen = true
} else {
try {
loadTemplateModule(extractedLogic)
} catch (e) {
console.log('Warning: ' + e.toString() +
'\nPre-compile templates when upgrading to avoid performance penalty on first use\n ' + templatePath)
needRegen = true
}
}
let compileResult
if (needRegen) {
compileResult = await compileDocx(templatePath)
} else {
compileResult = {
Template: templatePath,
HasErrors: false,
ExtractedLogic: extractedLogic,
ExtractedLogicTree: templatePath + '.json',
DocxGenTemplate: docxGenTemplate,
}
if (fs.existsSync(previewTemplate)) {
compileResult.Preview = previewTemplate
}
}
return compileResult
}
validateCompiledDocx.version = version
exports.validateCompiledDocx = validateCompiledDocx
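// Example usage of validateCompiledDocx (sketch; the path is hypothetical):
//   const { DocxGenTemplate, ExtractedLogic } = await validateCompiledDocx('/templates/letter.docx')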
/**
* Utility function to embed a task pane and web extension in a DOCX file, for the purpose of associating
* the DOCX file with an Office add-in so the appropriate task pane can open automatically when the document
* is opened.
*
* See this article for more information about these parameters:
* https://learn.microsoft.com/en-us/office/dev/add-ins/develop/automatically-open-a-task-pane-with-a-document#use-open-xml-to-tag-the-document
*
* @param {Buffer} docxBytes a NodeJS buffer containing the raw bytes of the DOCX file
* @param {string} guid the add-in's unique identifier (from the manifest)
* @param {string} addInId the AppSource asset ID of the add-in, e.g. "wa104380862"
* @param {string} version the full version string of the add-in, e.g. "1.1.0.0"
* @param {string} store Pointer to store or catalog, e.g. "en-US"
* @param {string} storeType Store or catalog type, e.g. "OMEX"
* @param {string} dockState The docking location for the task pane, e.g. "right"
* @param {boolean} visibility Force task pane to be visible, e.g. true
* @param {number} width Task pane initial width, e.g. 350
* @param {number} row Task pane row, e.g. 1
* @returns {Buffer} a NodeJS buffer containing the modified DOCX file (for saving or streaming)
*/
async function embedTaskPane (docxBytes, guid, addInId, version, store, storeType, dockState, visibility, width, row) {
const options = {
docxBytes, guid, addInId, version, store, storeType, dockState, visibility, width, row
}
const result = await docxTemplater.embedTaskPane(options)
return result
}
exports.embedTaskPane = embedTaskPane
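// Example usage of embedTaskPane (sketch; the GUID below is a placeholder, and the asset ID,
// version, store, and layout values are the examples given in the JSDoc above):
//   const docx = fs.readFileSync('letter.docx')
//   const tagged = await embedTaskPane(docx, '00000000-0000-0000-0000-000000000000',
//     'wa104380862', '1.1.0.0', 'en-US', 'OMEX', 'right', true, 350, 1)
//   fs.writeFileSync('letter-tagged.docx', tagged)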
/**
* Utility function to remove a task pane and web extension from a DOCX file.
*
* @param {Buffer} docxBytes a NodeJS buffer containing the raw bytes of the DOCX file
* @param {string} guid the add-in's unique identifier (from the manifest)
* @returns {Buffer} a NodeJS buffer containing the modified DOCX file (for saving or streaming)
*/
async function removeTaskPane (docxBytes, guid) {
const options = { docxBytes, guid }
const result = await docxTemplater.removeTaskPane(options)
return result
}
exports.removeTaskPane = removeTaskPane
/**
* Utility function to extract metadata about any task panes embedded in a DOCX file.
*
* @param {Buffer} docxBytes a NodeJS buffer containing the raw bytes of the DOCX file
* @returns {Array<object>} a JS array of metadata about all task panes found in the DOCX file
*/
async function getTaskPaneInfo (docxBytes) {
const options = { docxBytes }
const result = await docxTemplater.getTaskPaneInfo(options)
return result
}
exports.getTaskPaneInfo = getTaskPaneInfo
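// Example round trip (sketch; builds on the 'tagged' buffer from the embedTaskPane example
// above, and assumes each metadata object exposes the pane's guid):
//   const panes = await getTaskPaneInfo(tagged)
//   const untagged = await removeTaskPane(tagged, panes[0].guid)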
/**
* Assemble a DOCX file from an OpenDocx template and a Yatte data context. Produces a DOCX file as output.
*
* @param {string|object} template either the path to the OpenDocx template on the local disk, or an object
* that can be resolved TO such a path by getTemplatePath().
* Note that for optimal performance, a matching *gen.docx template should
* already exist in the same directory, as is normally ensured by calling
* validateCompiledDocx() whenever a template or version is put into service.
* @param {string} outputFile the path on disk to which the output document should be saved
* @param {object} data the Yatte data context on which the assembled document will be based
 * @param {function} getTemplatePath if the provided template is anything but a simple string, this must be an
 *                                   async function capable of taking that object as input, retrieving or
 *                                   locating the template as an actual file on disk, and returning the path to that file
* @param {string} optionalSaveXmlFile Normally the transformed XML data file (an interim artifact of assembly) is
* not output; however if this parameter is provided (for diagnostic purposes),
* the interim XML data file will be saved to the provided path
*/
async function assembleDocx (template, outputFile, data, getTemplatePath, optionalSaveXmlFile) {
// recursively create all XML and "tap out" all inserts
// (so by the time we get to the .NET code below, we've already gotten all the templates!)
const dataAssembler = await assembleData(template, data, getTemplatePath, false)
const { templateFile, xmlData, indirects, missing, errors } = dataAssembler
const dataSuccess = !errors || !errors.length
if (optionalSaveXmlFile) {
fs.writeFileSync(optionalSaveXmlFile, dataSuccess ? xmlData : errors.join('\n'))
}
if (!dataSuccess) {
return ({
Document: undefined,
Missing: Object.keys(missing),
Errors: errors,
HasErrors: true,
})
}
// recursively assemble inserted indirects and convert markdown to DOCX (if necessary)
await processIndirects(indirects, templateFile, optionalSaveXmlFile)
// finally assemble the main document and compose it with its inserts (if any)
return assembleDocxWithIndirects(templateFile, xmlData, indirects, missing, outputFile)
}
assembleDocx.version = version
exports.assembleDocx = assembleDocx
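// Example usage of assembleDocx (sketch; the paths and data context are hypothetical):
//   const result = await assembleDocx('/templates/letter.docx', '/out/letter.docx',
//     { ClientName: 'Jane Roe' })
//   if (result.HasErrors) console.error(result.Missing, result.Errors)

// assembleData resolves the template to a path (via getTemplatePath, if needed), ensures it
// has been compiled, and evaluates its extracted logic against the data context to produce
// the XML data file, recursing into any inserted ("indirect") DOCX templates.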
async function assembleData (template, data, getTemplatePath, ignoreTemplates = false) {
if (typeof template !== 'string' && typeof getTemplatePath === 'function') {
template = await getTemplatePath(template)
}
// template should have been compiled (previously) so the expected files will be on disk
// but if not we'll compile it now
const { ExtractedLogic, DocxGenTemplate } = await validateCompiledDocx(template, ignoreTemplates)
const dataAssembler = new XmlAssembler(data)
dataAssembler.templateFile = DocxGenTemplate
dataAssembler.xmlData = dataAssembler.assembleXml(ExtractedLogic)
if (!dataAssembler.errors || !dataAssembler.errors.length) {
// assemble data for inserted indirects if there are any
if (dataAssembler.indirects && dataAssembler.indirects.length > 0) {
for (const indir of dataAssembler.indirects) {
if (!indir.contentType || indir.contentType === 'docx') {
indir.assembledData = await assembleData(indir, indir.scope, getTemplatePath, ignoreTemplates)
}
}
}
}
return dataAssembler
}
exports.assembleData = assembleData
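// Recursively assembles each inserted ("indirect") document: markdown/text content is converted
// to DOCX via pandoc, while DOCX inserts (whose data was already assembled by assembleData) are
// assembled here, including any nested inserts of their own.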
async function processIndirects (indirects, parentTemplateFile, optionalSaveXmlFile) {
if (!indirects) return
for (const indir of indirects) {
if ((indir.contentType === 'markdown' || indir.contentType === 'text') && indir.toString) {
const mdContent = indir.toString() // todo: get Missing and Errors from this (if any) and pass on below!
const buffer = await markdownToDocx(mdContent, parentTemplateFile)
indir.result = {
Bytes: buffer,
Document: null,
Missing: [],
Errors: [],
HasErrors: false
}
} else if (!indir.contentType || indir.contentType === 'docx') {
// indir.assembledData was initialized by assembleData()'s recursive descent
const { templateFile, xmlData, indirects, missing } = indir.assembledData
if (optionalSaveXmlFile) {
fs.writeFileSync(templateFile + `_interim_${indir.id}_data.xml`, xmlData)
}
await processIndirects(indirects, templateFile, optionalSaveXmlFile)
indir.result = await assembleDocxWithIndirects(templateFile, xmlData, indirects, missing, null)
if (optionalSaveXmlFile) {
fs.writeFileSync(templateFile + `_interim_${indir.id}_assembled.docx`, indir.result.Bytes)
}
} else {
throw new Error(`Unexpected '${indir.contentType}' content type encountered during indirect processing`)
}
}
}
exports.processIndirects = processIndirects
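// Assembles the main document from its DocxGen template and XML data, compositing the previously
// assembled inserts as OXPT DocumentComposer sources, and aggregates the inserts' Missing and
// Errors into the result.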
async function assembleDocxWithIndirects (templateFile, xmlData, indirects, missingObj, outputFile = null) {
// transform indirects into OXPT DocumentComposer sources:
const sources = []
const errors = []
for (const sub of indirects) {
if (sub.result.Missing) {
sub.result.Missing.forEach(m => {
missingObj[m] = true
})
}
if (sub.result.Errors) {
sub.result.Errors.forEach(e => {
errors.push(e)
})
}
sources.push({ id: sub.id, buffer: sub.result.Bytes, keepSections: Boolean(sub.KeepSections) })
}
// assemble document (which now takes care of compositing inserts too)
const mainDoc = await docxTemplater.assembleDocument({
templateFile,
xmlData,
sources,
documentFile: outputFile,
})
if (mainDoc.HasErrors) {
errors.unshift('Assembly error')
}
const result = mainDoc
result.Missing = Object.keys(missingObj)
result.Errors = errors
return result
}
exports.assembleDocxWithIndirects = assembleDocxWithIndirects
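// Builds a lookup from field id to field content, recursing into nested arrays of fields.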
const indexFields = function (fieldList, lookup = []) {
for (const fldObj of fieldList) {
if (Array.isArray(fldObj)) {
indexFields(fldObj, lookup)
} else {
lookup[fldObj.id] = fldObj.content
}
}
return lookup
}
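// Walks the parsed AST, recording each field in fieldDict (keyed by field id) along with its
// type, expression, and atomized expression; fields without expressions record their parent's
// id instead. As a side effect, AST nodes with expressions are annotated with their atoms.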
const buildFieldDictionary = function (astBody, fieldDict, atoms, parent = null) {
for (const obj of astBody) {
if (Array.isArray(obj.contentArray)) {
buildFieldDictionary(obj.contentArray, fieldDict, atoms, obj)
}
if (typeof obj.id !== 'undefined') {
const fieldObj = {
fieldType: obj.type
}
if (obj.expr) {
fieldObj.expr = obj.expr
fieldObj.atomizedExpr = atoms.getFieldAtom(obj)
// also cross-pollinate atomizedExpr across to ast (for later use)
obj.atom = fieldObj.atomizedExpr
} else {
fieldObj.parent = parent.id
// EndList fields are also stored with the atomized expression of their matching List field,
// because this is (or at least, used to be?) needed to make list punctuation work
if (obj.type === OD.EndList) {
fieldObj.atomizedExpr = atoms.getFieldAtom(parent)
}
}
fieldDict[obj.id] = fieldObj
}
}
}
const createTemplateJsModule = function (ast) {
const sb = ["'use strict';"]
sb.push(`exports.version='${version}';`)
sb.push('exports.evaluate=function(cx,cl,h)')
sb.push(serializeContextInDataJs(ast, '_odx', 'cx', 'cl', null))
return sb.join('\n')
}
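// For illustration, an AST containing a single Content field with the expression ClientName
// would serialize to roughly the following module (atom names are generated by the Atomizer
// and shown here schematically):
//   'use strict';
//   exports.version='<version>';
//   exports.evaluate=function(cx,cl,h)
//   {
//     h.beginObject('_odx',cx,cl);
//     h.define('ClientName','ClientName');
//     h.endObject()
//   }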
const serializeContextInDataJs = function (contentArray, id, objIdent, locIdent, parentNode) {
return `{
h.beginObject('${id}',${objIdent}${locIdent ? (',' + locIdent) : ''});
${serializeContentArrayAsDataJs(contentArray, parentNode)}
h.endObject()
}`
}
const serializeAstNodeAsDataJs = function (astNode, parent) {
let atom
if (astNode.expr) {
if (astNode.expr === '_punc') {
// special case: list punctuation: use a customized "atom" derived from the list expression
atom = parent.atom + 'p'
} else if (astNode.type === OD.If || astNode.type === OD.ElseIf) {
// special case: evaluating an expression for purposes of determining its truthiness rather than its actual value
atom = astNode.atom + 'b'
} else { // regular case: atom based on expression
atom = astNode.atom
}
}
switch (astNode.type) {
case OD.Content:
return `h.define('${atom}','${escapeExpressionStr(astNode.expr)}');`
case OD.List: {
const a0 = atom + 'i' // special atom representing individual items in the list, rather than the entire list
return `for(const ${a0} of h.beginList('${atom}', '${escapeExpressionStr(astNode.expr)}'))
${serializeContextInDataJs(astNode.contentArray, a0, a0, '', astNode)}
h.endList();`
}
case OD.If:
return `if(h.beginCondition('${atom}','${escapeExpressionStr(astNode.expr)}'))
{
${serializeContentArrayAsDataJs(astNode.contentArray, astNode)}
}`
case OD.ElseIf:
return `} else {
if(h.beginCondition('${atom}','${escapeExpressionStr(astNode.expr)}'))
{
${serializeContentArrayAsDataJs(astNode.contentArray, astNode)}
}`
case OD.Else:
return `} else {
${serializeContentArrayAsDataJs(astNode.contentArray, astNode)}
`
default:
throw new Error('unexpected node type -- unable to serialize')
}
}
const serializeContentArrayAsDataJs = function (contentArray, parent) {
const sb = []
for (const obj of contentArray) {
sb.push(serializeAstNodeAsDataJs(obj, parent))
}
// in 2.0.0-alpha, we stopped including _punc nodes in the contentArray
// but the Js (insofar as we will actually use it?) still needs to capture the _punc, so synthesize it here
if (parent && parent.type === OD.List) {
    const lastItem = contentArray.length ? contentArray[contentArray.length - 1] : null
    if (!lastItem || lastItem.type !== OD.Content || lastItem.expr !== '_punc') {
sb.push(serializeAstNodeAsDataJs({ type: OD.Content, expr: '_punc' }, parent))
}
}
return sb.join('\n')
}
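// matches single quotes that still need escaping: a quote preceded by a double backslash
// (i.e. an escaped backslash) is an unescaped quote, as is a quote with no preceding
// backslash; quotes already preceded by a single backslash are left alone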
const singleQuotes = /(?<=\\\\)'|(?<!\\)'/g
const escapeExpressionStr = function (strExpr) {
return strExpr.replace(singleQuotes, "\\'")
}
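// Reconstructs a yatte content tree from a field dictionary (the inverse of buildFieldDictionary):
// fields are sorted by id and consumed from the end, with EndList/EndIf opening accumulation
// blocks that their matching List/If fields later consume and nest as their contentArray.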
const fieldDictToContentTree = function (fieldDict, addPunc = false) {
// Step 1: Map to output shape, ignore "parent"
const fields = Object.entries(fieldDict)
.map(([id, f]) => ({
type: f.fieldType,
id,
...(f.expr && { expr: f.expr }),
...(f.atomizedExpr && f.fieldType !== 'EndList' && { atom: f.atomizedExpr }),
}))
.sort((a, b) => Number(a.id) - Number(b.id))
// Step 2: nest field objects appropriately
const stack = [[]]
let current = stack.at(-1)
while (fields.length > 0) {
const field = fields.pop()
if (field.type === 'EndList' || field.type === 'EndIf') {
// Start a new accumulation block
current = [field]
if (addPunc && field.type === 'EndList') { // legacy ASTs have _punc nodes... add them here if requested
current.unshift({ type: 'Content', expr: '_punc' })
}
stack.push(current)
} else if (['List', 'If', 'ElseIf', 'Else'].includes(field.type)) {
// this field consumes the current accumulation block
field.contentArray = stack.pop()
if (field.type === 'List' || field.type === 'If') {
        // this field gets added to the parent accumulation block
current = stack.at(-1)
current.unshift(field)
} else { // ElseIf or Else
// this field gets added to a new accumulation block
current = [field]
stack.push(current)
}
} else {
// Content node
current.unshift(field)
}
}
// assert stack.length === 1
return stack.pop()
}