pomljs

Version:

Prompt Orchestration Markup Language

1 lines • 132 kB

Source Map (JSON)

{"version":3,"file":"writer.cjs","sources":["../.build/writer.js"],"sourcesContent":["import * as cheerio from 'cheerio';\nimport * as xmlbuilder from 'xmlbuilder2';\nimport { ErrorCollection, SystemError, ValidSpeakers, WriteError, richContentFromSourceMap } from './base';\nimport yaml from 'js-yaml';\nimport { encodingForModel } from 'js-tiktoken';\n// Use the special character to indicate a placeholder for multimedia.\nconst SPECIAL_CHARACTER = 'À';\nclass Writer {\n ir = '';\n options;\n tokenizerCache = {};\n constructor(ir, options) {\n if (ir) {\n this.reset(ir);\n }\n this.options = this.initializeOptions(options);\n }\n initializeOptions(options) {\n return options || {};\n }\n reset(ir) {\n this.ir = ir;\n }\n truncateText(text, charLimit, tokenLimit, options) {\n const { truncateMarker = ' (...truncated)', truncateDirection = 'end', tokenEncodingModel = 'gpt-4o' } = options || this.options;\n let truncated = text;\n let changed = false;\n if (charLimit !== undefined && truncated.length > charLimit) {\n changed = true;\n if (truncateDirection === 'start') {\n truncated = truncated.slice(truncated.length - charLimit);\n }\n else if (truncateDirection === 'middle') {\n const head = Math.ceil(charLimit / 2);\n const tail = charLimit - head;\n truncated = truncated.slice(0, head) + truncated.slice(truncated.length - tail);\n }\n else {\n truncated = truncated.slice(0, charLimit);\n }\n }\n if (tokenLimit !== undefined) {\n // Optimization: Check byte count first to potentially bypass tokenizer loading\n // Since tokens are typically at least 1 byte, if byte count < token limit, we're safe\n const byteCount = Buffer.byteLength(truncated, 'utf8');\n if (byteCount <= tokenLimit) {\n // Byte count is within limit, so token count must also be within limit\n // Skip expensive tokenizer loading and encoding\n }\n else {\n let enc = this.tokenizerCache[tokenEncodingModel];\n if (!enc) {\n enc = encodingForModel(tokenEncodingModel);\n this.tokenizerCache[tokenEncodingModel] = enc;\n }\n const tokens = enc.encode(truncated);\n if (tokens.length > tokenLimit) {\n changed = true;\n if (truncateDirection === 'start') {\n truncated = enc.decode(tokens.slice(tokens.length - tokenLimit));\n }\n else if (truncateDirection === 'middle') {\n const head = Math.ceil(tokenLimit / 2);\n const tail = tokenLimit - head;\n truncated = enc.decode(tokens.slice(0, head).concat(tokens.slice(tokens.length - tail)));\n }\n else {\n truncated = enc.decode(tokens.slice(0, tokenLimit));\n }\n }\n }\n }\n if (!changed) {\n return text;\n }\n if (truncateDirection === 'start') {\n return truncateMarker + truncated;\n }\n else if (truncateDirection === 'middle') {\n const mid = Math.ceil(truncated.length / 2);\n return truncated.slice(0, mid) + truncateMarker + truncated.slice(mid);\n }\n else {\n return truncated + truncateMarker;\n }\n }\n createMappingNode(element, outputLength) {\n const parseAttrAsInt = (attrName) => {\n const attrValue = element.attr(attrName);\n return attrValue !== undefined && !isNaN(parseInt(attrValue, 10))\n ? parseInt(attrValue, 10)\n : undefined;\n };\n return {\n originalStart: parseAttrAsInt('original-start-index'),\n originalEnd: parseAttrAsInt('original-end-index'),\n inputStart: element[0].startIndex,\n inputEnd: element[0].endIndex,\n outputStart: 0,\n outputEnd: outputLength - 1\n };\n }\n /**\n * Add an offset to mapping nodes.\n *\n * @param mappings - Original mappings.\n * @param indent - The offset amount.\n * @param ignoreBefore - Ignore the mappings before this index.\n * @returns - The new mappings.\n */\n indentMappings(mappings, indent, ignoreBefore) {\n return mappings.map(mapping => {\n return {\n ...mapping,\n outputStart: mapping.outputStart >= ignoreBefore ? mapping.outputStart + indent : mapping.outputStart,\n outputEnd: mapping.outputStart >= ignoreBefore ? mapping.outputEnd + indent : mapping.outputEnd\n };\n });\n }\n indentMultiMedia(multimedia, indent, ignoreBefore) {\n return multimedia.map(media => {\n return {\n ...media,\n index: media.index >= ignoreBefore ? media.index + indent : media.index\n };\n });\n }\n raiseError(message, element) {\n const parseAttrAsInt = (attrName) => {\n const attrValue = element.attr(attrName);\n return attrValue !== undefined && !isNaN(parseInt(attrValue, 10))\n ? parseInt(attrValue, 10)\n : undefined;\n };\n const emptyOutput = {\n output: '',\n multimedia: [],\n mappings: []\n };\n if (element.length === 0) {\n // Ignore the error if the element is not even ready\n return emptyOutput;\n }\n ErrorCollection.add(new WriteError(message, parseAttrAsInt('original-start-index'), parseAttrAsInt('original-end-index'), element[0].sourcePath, element[0].startIndex, element[0].endIndex, this.ir));\n return emptyOutput;\n }\n writeElementTree(element, $) {\n throw new SystemError('Method not implemented.');\n }\n /**\n * Convert an IR string into {@link RichContent} without exposing mapping information.\n *\n * The method delegates to {@link writeWithSourceMap} and then collapses the\n * returned segments back into a single rich content value.\n */\n write(ir) {\n const segments = this.writeWithSourceMap(ir);\n return richContentFromSourceMap(segments);\n }\n /**\n * Convert an IR string into an array of speaker messages.\n *\n * It internally uses {@link writeMessagesWithSourceMap} and removes the\n * mapping information from each message.\n */\n writeMessages(ir) {\n const messages = this.writeMessagesWithSourceMap(ir);\n return messages.map(m => ({\n speaker: m.speaker,\n content: richContentFromSourceMap(m.content)\n }));\n }\n assignSpeakers(result, $) {\n const speakers = [];\n let defaultSpeaker = 'system';\n let systemSpeakerSpecified = false;\n const segments = [];\n const querySegmentFromMapping = (startIndex, endIndex) => {\n return result.mappings.find(segment => segment.inputStart === startIndex && segment.inputEnd === endIndex);\n };\n const getSpecifiedSpeaker = (element) => {\n const speaker = element.attr('speaker');\n if (speaker && !ValidSpeakers.includes(speaker)) {\n this.raiseError(`\"${speaker}\" is not a valid speaker.`, element);\n return undefined;\n }\n return speaker;\n };\n const assignSpeakerForElement = (element, inheritedSpeaker) => {\n let specifiedSpeaker = getSpecifiedSpeaker(element);\n if (specifiedSpeaker === 'system') {\n systemSpeakerSpecified = true;\n }\n // When human has appeared, the default speaker becomes human.\n if (specifiedSpeaker == 'human' && defaultSpeaker == 'system') {\n defaultSpeaker = 'human';\n }\n if (element.length === 0) {\n return;\n }\n const segment = querySegmentFromMapping(element[0].startIndex, element[0].endIndex);\n if (specifiedSpeaker && !segment) {\n console.warn(`Speaker is specified but no exact corresponding output can be found in ${element.html()}`);\n }\n const speaker = specifiedSpeaker || inheritedSpeaker || defaultSpeaker;\n if (segment) {\n segments.push({ start: segment.outputStart, end: segment.outputEnd, speaker });\n }\n if (specifiedSpeaker) {\n inheritedSpeaker = specifiedSpeaker;\n }\n element.children().each((_, child) => {\n const speaker = getSpecifiedSpeaker($(child));\n if (speaker) {\n inheritedSpeaker = speaker;\n }\n assignSpeakerForElement($(child), inheritedSpeaker);\n });\n };\n assignSpeakerForElement(this.getRoot($), undefined);\n const allIndicesSet = new Set();\n segments.forEach(segment => {\n allIndicesSet.add(segment.start);\n allIndicesSet.add(segment.end);\n });\n const essentialIndices = Array.from(allIndicesSet).sort((a, b) => a - b);\n const colorSpeakers = new Array(essentialIndices.length).fill('system');\n segments.forEach(segment => {\n const startIndex = essentialIndices.findIndex(index => index == segment.start);\n const endIndex = essentialIndices.findIndex(index => index == segment.end);\n for (let i = startIndex; i <= endIndex; i++) {\n colorSpeakers[i] = segment.speaker;\n }\n });\n let currentStart = undefined;\n for (let i = 0; i < essentialIndices.length; i++) {\n const speaker = colorSpeakers[i];\n if (i === 0 || (i > 0 && speaker !== colorSpeakers[i - 1])) {\n currentStart = essentialIndices[i];\n }\n if (i === essentialIndices.length - 1 ||\n (i < essentialIndices.length - 1 && speaker !== colorSpeakers[i + 1])) {\n // time to end this segment\n if (currentStart === undefined) {\n throw new SystemError('currentStart is not expected to be undefined');\n }\n speakers.push({ start: currentStart, end: essentialIndices[i], speaker: speaker });\n }\n }\n // If there's only one speaker and it's system, change it to human.\n if (speakers.length == 1 && speakers[0].speaker == 'system' && !systemSpeakerSpecified) {\n speakers[0].speaker = 'human';\n }\n return speakers;\n }\n /**\n * Render the IR string and return detailed mapping for each produced content\n * segment.\n *\n * Each returned {@link SourceMapRichContent} describes the slice of the input\n * IR that generated the piece of output.\n */\n writeWithSourceMap(ir) {\n const result = this.generateWriterResult(ir);\n const segments = this.buildSourceMap(result);\n return segments.map(s => ({\n startIndex: s.inputStart,\n endIndex: s.inputEnd,\n irStartIndex: s.irStart,\n irEndIndex: s.irEnd,\n content: s.content\n }));\n }\n /**\n * Similar to {@link writeWithSourceMap} but groups the segments into speaker\n * messages.\n */\n writeMessagesWithSourceMap(ir) {\n const result = this.generateWriterResult(ir);\n const segments = this.buildSourceMap(result);\n return result.speakers\n .map(sp => {\n const msgSegs = segments.filter(seg => seg.outStart >= sp.start && seg.outEnd <= sp.end);\n const nonWs = msgSegs.filter(seg => !(typeof seg.content === 'string' && seg.content.trim() === ''));\n // Use only non-whitespace segments when computing the overall source range\n // for this message so that trailing or leading padding does not expand the\n // reported span. If the message contains nothing but whitespace we fall\n // back to considering all segments.\n const relevant = nonWs.length ? nonWs : msgSegs;\n if (!relevant.length) {\n // If there are no relevant segments, we cannot produce an empty message.\n return {\n startIndex: 0, // in this case, we cannot determine the start index\n endIndex: 0,\n irStartIndex: 0,\n irEndIndex: 0,\n speaker: sp.speaker,\n content: []\n };\n }\n return {\n startIndex: Math.min(...relevant.map(seg => seg.inputStart)),\n endIndex: Math.max(...relevant.map(seg => seg.inputEnd)),\n irStartIndex: Math.min(...relevant.map(seg => seg.irStart)),\n irEndIndex: Math.max(...relevant.map(seg => seg.irEnd)),\n speaker: sp.speaker,\n content: msgSegs.map(seg => ({\n startIndex: seg.inputStart,\n endIndex: seg.inputEnd,\n irStartIndex: seg.irStart,\n irEndIndex: seg.irEnd,\n content: seg.content\n }))\n };\n })\n .filter(msg => msg !== undefined);\n }\n /**\n * Transform a {@link WriterResult} into discrete source map segments.\n *\n * The segments are ordered so that rich content can be reconstructed in\n * the correct visual order while preserving multimedia positioning.\n */\n buildSourceMap(result) {\n // Collect every boundary within the output that could signify a change in\n // source location. These come from the input/output mappings as well as\n // multimedia positions. Splitting the output on these boundaries ensures\n // each segment corresponds to a single source range.\n const boundaries = new Set();\n result.mappings.forEach(m => {\n boundaries.add(m.outputStart);\n boundaries.add(m.outputEnd + 1);\n });\n result.multimedia.forEach(m => {\n boundaries.add(m.index);\n boundaries.add(m.index + 1);\n });\n boundaries.add(0);\n boundaries.add(result.output.length);\n const points = Array.from(boundaries).sort((a, b) => a - b);\n // `top` multimedia should appear before all textual content while `bottom`\n // multimedia should come last. We therefore keep three buckets and merge\n // them at the end.\n const topSegments = [];\n const middleSegments = [];\n const bottomSegments = [];\n const originalStartIndices = result.mappings\n .map(m => m.originalStart)\n .filter(m => m !== undefined);\n const sourceStartIndex = originalStartIndices.length > 0 ? Math.min(...originalStartIndices) : 0;\n const originalEndIndices = result.mappings.map(m => m.originalEnd).filter(m => m !== undefined);\n const sourceEndIndex = originalEndIndices.length > 0 ? Math.max(...originalEndIndices) : 0;\n for (let i = 0; i < points.length - 1; i++) {\n const start = points[i];\n const end = points[i + 1];\n if (start >= end) {\n continue;\n }\n const slice = result.output.slice(start, end);\n // Find the most specific mapping that covers this slice. This allows the\n // resulting segment to map back to the tightest IR range responsible for\n // the output.\n let chosen;\n // The chosen IR might not have a precise original start or end index, so we\n // choose a fallback based on the original mappings.\n let chosenOriginal;\n for (const m of result.mappings) {\n if (start >= m.outputStart && end - 1 <= m.outputEnd) {\n if (!chosen || m.outputEnd - m.outputStart < chosen.outputEnd - chosen.outputStart) {\n chosen = m;\n }\n if (m.originalStart !== undefined &&\n m.originalEnd !== undefined &&\n (!chosenOriginal ||\n m.originalEnd - m.originalStart <\n chosenOriginal.originalEnd - chosenOriginal.originalStart)) {\n chosenOriginal = m;\n }\n }\n }\n if (!chosen) {\n // Mappings must be non-empty here because the points are derived from the\n // mappings. If we cannot find a mapping, use the first one as a fallback.\n chosen = result.mappings[0];\n }\n // If a multimedia item starts at this boundary, emit it instead of text.\n const media = result.multimedia.find(m => m.index === start);\n if (media) {\n const { position, index, ...rest } = media;\n const segment = {\n outStart: start,\n outEnd: end - 1,\n irStart: chosen.inputStart,\n irEnd: chosen.inputEnd,\n inputStart: chosenOriginal?.originalStart ?? sourceStartIndex,\n inputEnd: chosenOriginal?.originalEnd ?? sourceEndIndex,\n content: [rest]\n };\n if (position === 'top') {\n topSegments.push(segment);\n }\n else if (position === 'bottom') {\n bottomSegments.push(segment);\n }\n else {\n middleSegments.push(segment);\n }\n }\n else if (slice !== SPECIAL_CHARACTER && slice.length > 0) {\n // Normal textual slice.\n middleSegments.push({\n outStart: start,\n outEnd: end - 1,\n irStart: chosen.inputStart,\n irEnd: chosen.inputEnd,\n inputStart: chosenOriginal?.originalStart ?? sourceStartIndex,\n inputEnd: chosenOriginal?.originalEnd ?? sourceEndIndex,\n content: slice\n });\n }\n }\n middleSegments.sort((a, b) => a.outStart - b.outStart);\n // Order the buckets so that `top` items are emitted before any textual\n // content and `bottom` items are emitted last. When filtering these\n // segments by speaker boundaries, each top or bottom item still appears\n // within the correct message.\n return [...topSegments, ...middleSegments, ...bottomSegments];\n }\n /**\n * Execute the main writing logic and gather mapping, multimedia and speaker\n * information before it is broken down into smaller segments.\n */\n generateWriterResult(ir) {\n this.reset(ir);\n const $ = cheerio.load(ir, {\n scriptingEnabled: false,\n xml: { xmlMode: true, withStartIndices: true, withEndIndices: true }\n }, false);\n const partialResult = this.writeElementTree(this.getRoot($), $);\n return {\n input: ir,\n output: partialResult.output,\n mappings: partialResult.mappings,\n multimedia: partialResult.multimedia,\n speakers: this.assignSpeakers(partialResult, $)\n };\n }\n getRoot($) {\n return $($.root().children()[0]);\n }\n}\nexport class EnvironmentDispatcher extends Writer {\n writeElementTree(element, $) {\n if (element.is('env')) {\n let options = undefined;\n try {\n const optionsString = element.attr('writer-options');\n if (optionsString) {\n options = JSON.parse(optionsString);\n }\n }\n catch (e) {\n this.raiseError(`Invalid JSON for writer-options: ${element.attr('writer-options')}`, element);\n }\n if (element.attr('presentation') === 'markup') {\n const markupLanguage = element.attr('markup-lang') || 'markdown';\n if (markupLanguage === 'markdown') {\n return new MarkdownWriter(this.ir, options).writeElementTree(element, $);\n }\n else if (markupLanguage === 'html') {\n return new HtmlWriter(this.ir, options).writeElementTree(element, $);\n }\n else if (markupLanguage === 'csv') {\n return new CsvWriter(this.ir, options).writeElementTree(element, $);\n }\n else if (markupLanguage === 'tsv') {\n return new TsvWriter(this.ir, options).writeElementTree(element, $);\n }\n else {\n return this.raiseError(`Invalid markup language: ${markupLanguage}`, element);\n }\n }\n else if (element.attr('presentation') === 'serialize') {\n const serializer = element.attr('serializer') || 'json';\n if (serializer === 'json') {\n return new JsonWriter(this.ir, options).writeElementTree(element, $);\n }\n else if (serializer === 'yaml') {\n return new YamlWriter(this.ir, options).writeElementTree(element, $);\n }\n else if (serializer === 'xml') {\n return new XmlWriter(this.ir, options).writeElementTree(element, $);\n }\n else {\n return this.raiseError(`Invalid serializer: ${serializer}`, element);\n }\n }\n else if (element.attr('presentation') === 'free') {\n return new FreeWriter(this.ir, options).writeElementTree(element, $);\n }\n else if (element.attr('presentation') === 'multimedia') {\n return new MultiMediaWriter(this.ir, options).writeElementTree(element, $);\n }\n else {\n return this.raiseError(`Invalid presentation: ${element}`, element);\n }\n }\n else {\n // Not even an environment, consider writing it as a markdown\n return new MarkdownWriter(this.ir).writeElementTree(element, $);\n }\n }\n}\nexport class MarkdownWriter extends Writer {\n initializeOptions(options) {\n options = options || {};\n return {\n markdownBaseHeaderLevel: options.markdownBaseHeaderLevel ?? 1,\n markdownTableCollapse: options.markdownTableCollapse ?? false,\n csvSeparator: options.csvSeparator ?? ',',\n csvHeader: options.csvHeader ?? true,\n truncateMarker: options.truncateMarker ?? ' (...truncated)',\n truncateDirection: options.truncateDirection ?? 'end',\n tokenEncodingModel: options.tokenEncodingModel ?? 'gpt-4o'\n };\n }\n raiseErrorAndReturnEmpty(message, element) {\n this.raiseError(message, element);\n return { text: '', before: '', after: '', mappings: [], multimedia: [] };\n }\n makeBox(text, layout, element) {\n const newBeforeAfter = layout === 'block' ? '\\n\\n' : layout === 'newline' ? '\\n' : '';\n const charLimitAttr = element.attr('char-limit');\n const tokenLimitAttr = element.attr('token-limit');\n const priorityAttr = element.attr('priority');\n const charLimit = charLimitAttr !== undefined ? parseInt(charLimitAttr, 10) : undefined;\n const tokenLimit = tokenLimitAttr !== undefined ? parseInt(tokenLimitAttr, 10) : undefined;\n const priority = priorityAttr !== undefined ? parseFloat(priorityAttr) : undefined;\n if (typeof text === 'string') {\n const truncated = this.truncateText(text, charLimit, tokenLimit, this.options);\n return {\n text: truncated,\n before: newBeforeAfter,\n after: newBeforeAfter,\n mappings: [this.createMappingNode(element, truncated.length)],\n multimedia: [],\n priority\n };\n }\n else {\n const combinedText = text.text;\n const truncated = this.truncateText(combinedText, charLimit, tokenLimit, this.options);\n return {\n text: truncated,\n before: this.consolidateSpace(newBeforeAfter, text.before),\n after: this.consolidateSpace(text.after, newBeforeAfter),\n mappings: [...text.mappings, this.createMappingNode(element, truncated.length)],\n multimedia: text.multimedia,\n priority\n };\n }\n }\n wrapBox(box, wrapBefore, wrapAfter, element) {\n const text = wrapBefore + box.text + wrapAfter;\n const mappings = this.indentMappings(box.mappings, wrapBefore.length, 0);\n if (element) {\n mappings.push(this.createMappingNode(element, text.length));\n }\n return {\n text: text,\n before: box.before,\n after: box.after,\n mappings: mappings,\n multimedia: this.indentMultiMedia(box.multimedia, wrapBefore.length, 0)\n };\n }\n wrapBoxEveryLine(box, wrapBefore, wrapAfter) {\n const lines = box.text.split('\\n');\n let accumulatedLength = 0;\n let mappings = box.mappings;\n let multimedia = box.multimedia;\n const text = lines\n .map(line => {\n const result = wrapBefore + line + wrapAfter;\n mappings = this.indentMappings(mappings, wrapBefore.length, accumulatedLength);\n multimedia = this.indentMultiMedia(multimedia, wrapBefore.length, accumulatedLength);\n accumulatedLength += result.length + 1; // length of '\\n'\n return result;\n })\n .join('\\n');\n return {\n text: text,\n before: box.before,\n after: box.after,\n mappings: mappings,\n multimedia: multimedia\n };\n }\n consolidateSpace(space1, space2) {\n let result = space1 + space2;\n for (let i = 1; i <= Math.min(space1.length, space2.length); i++) {\n if (space1.slice(-i) === space2.slice(0, i)) {\n result = space1 + space2.slice(i);\n }\n }\n return result;\n }\n reduceBoxesByLimit(boxes, charLimit, tokenLimit) {\n if (boxes.length === 0 || (charLimit === undefined && tokenLimit === undefined)) {\n return boxes;\n }\n const tokenModel = this.options.tokenEncodingModel || 'gpt-4o';\n const getTokenLength = (t) => {\n if (tokenLimit === undefined) {\n return 0;\n }\n // Optimization: Use byte count as conservative estimate before tokenizing\n const byteCount = Buffer.byteLength(t, 'utf8');\n const BYTES_PER_TOKEN_ESTIMATE = 4;\n // If byte count is small enough, we can estimate it's within token limits\n // This is a heuristic - for very short strings, byte count ≈ token count\n if (byteCount <= tokenLimit) {\n return Math.ceil(byteCount / BYTES_PER_TOKEN_ESTIMATE); // Conservative estimate\n }\n let enc = this.tokenizerCache[tokenModel];\n if (!enc) {\n enc = encodingForModel(tokenModel);\n this.tokenizerCache[tokenModel] = enc;\n }\n return enc.encode(t).length;\n };\n const totalChars = (arr) => arr.reduce((a, b) => a + b.text.length, 0);\n const totalTokens = (arr) => arr.reduce((a, b) => a + getTokenLength(b.text), 0);\n let current = [...boxes];\n while (current.length > 0) {\n const exceeds = (charLimit !== undefined && totalChars(current) > charLimit) ||\n (tokenLimit !== undefined && totalTokens(current) > tokenLimit);\n if (!exceeds) {\n break;\n }\n const priorities = current.map(b => b.priority ?? 0);\n const minP = Math.min(...priorities);\n if (current.every(b => (b.priority ?? 0) === minP)) {\n break;\n }\n current = current.filter(b => (b.priority ?? 0) !== minP);\n }\n return current;\n }\n concatMarkdownBoxes(boxes, element) {\n const charLimitAttr = element?.attr('char-limit');\n const tokenLimitAttr = element?.attr('token-limit');\n const charLimit = charLimitAttr !== undefined ? parseInt(charLimitAttr, 10) : undefined;\n const tokenLimit = tokenLimitAttr !== undefined ? parseInt(tokenLimitAttr, 10) : undefined;\n const multimedia = [];\n // Remove all spaces children before and after block elements\n // or between two multimedia-only nodes so images do not create\n // stray blank lines when placed consecutively.\n let removedSpace = boxes;\n while (true) {\n let afterRemoveSpace = removedSpace.filter((child, i) => {\n const afterBlock = i > 0 &&\n (removedSpace[i - 1].after.includes('\\n') || /^\\n+$/.test(removedSpace[i - 1].text));\n const beforeBlock = i < removedSpace.length - 1 &&\n (removedSpace[i + 1].before.includes('\\n') || /^\\n+$/.test(removedSpace[i + 1].text));\n // When a whitespace-only box is sandwiched between two multimedia\n // boxes (e.g., two consecutive images), we treat it like the spaces\n // around a block element so it doesn't generate a blank line.\n const afterMedia = i > 0 &&\n removedSpace[i - 1].multimedia.length > 0 &&\n removedSpace[i - 1].multimedia.length === removedSpace[i - 1].text.length;\n const beforeMedia = i < removedSpace.length - 1 &&\n removedSpace[i + 1].multimedia.length > 0 &&\n removedSpace[i + 1].multimedia.length === removedSpace[i + 1].text.length;\n return !((afterBlock || beforeBlock || afterMedia || beforeMedia) &&\n /^[ \\t]*$/.test(child.text));\n });\n if (afterRemoveSpace.length === removedSpace.length) {\n break;\n }\n // Repeat until no more space can be removed\n removedSpace = afterRemoveSpace;\n }\n removedSpace = this.reduceBoxesByLimit(removedSpace, charLimit, tokenLimit);\n // When concatenating, we handle 3 cases.\n // 1. If both ends are text, the same space characters will be overlapped and consolidated.\n // 2. If one end is text and the other end is multimedia (floated), the multimedia will be as if it doesn't exist.\n // This case is only handled when it only contains multimedia. If there's text in between, we assume it's already handled.\n // 3. If one end is text and the other end is multimedia (adhered), the multimedia will eat up the space characters.\n const enumerate = (boxes) => {\n return boxes.map((box, i) => {\n return { box, index: i };\n });\n };\n // See the comment above for the explanation.\n const asIfNotExist = (box) => {\n return (box.multimedia.length > 0 &&\n box.multimedia.length === box.text.length &&\n box.multimedia.every(media => media.position !== 'here'));\n };\n const textBoxQueue = enumerate(removedSpace).filter(({ box }) => !asIfNotExist(box));\n const multimediaQueue = enumerate(removedSpace).filter(({ box }) => asIfNotExist(box));\n const mappings = [];\n // When concatenating, make sure all multimedia boxes are skipped.\n // Multimedia boxes are instead directly adhered to the previous box.\n // Kinda like a merge sort.\n let text = '';\n let before = '';\n let after = '';\n let i = 0, j = 0;\n while (i < textBoxQueue.length || j < multimediaQueue.length) {\n if (i === textBoxQueue.length ||\n (j < multimediaQueue.length && multimediaQueue[j].index < textBoxQueue[i].index)) {\n const multimediaBox = multimediaQueue[j].box;\n mappings.push(...this.indentMappings(multimediaBox.mappings, text.length, 0));\n multimedia.push(...this.indentMultiMedia(multimediaBox.multimedia, text.length, 0));\n text += multimediaBox.text;\n j++;\n }\n else {\n const box = textBoxQueue[i].box;\n if (i === 0) {\n before = box.before;\n }\n mappings.push(...this.indentMappings(box.mappings, text.length, 0));\n // It still could contain inner multimedia\n multimedia.push(...this.indentMultiMedia(box.multimedia, text.length, 0));\n text += box.text;\n if (i === textBoxQueue.length - 1) {\n after = box.after;\n }\n else {\n let thisAfter;\n if (box.multimedia.filter(media => media.position === 'here' && media.index + 1 === box.text.length).length > 0) {\n // Has an adhered multimedia at the end\n thisAfter = '';\n }\n else if (textBoxQueue[i + 1].box.multimedia.filter(media => media.position === 'here' && media.index === 0).length > 0) {\n thisAfter = '';\n }\n else {\n thisAfter = this.consolidateSpace(box.after, textBoxQueue[i + 1].box.before);\n }\n text += thisAfter;\n }\n i++;\n }\n }\n let finalText = text;\n if (charLimit !== undefined || tokenLimit !== undefined) {\n finalText = this.truncateText(finalText, charLimit, tokenLimit, this.options);\n }\n return { text: finalText, before, after, mappings, multimedia };\n }\n indentText(text, indent, firstLineIndent) {\n const lines = text.split('\\n');\n return lines\n .map((line, i) => {\n if (!line) {\n return line;\n }\n else if (i === 0) {\n return ' '.repeat(firstLineIndent) + line;\n }\n else {\n return ' '.repeat(indent) + line;\n }\n })\n .join('\\n');\n }\n handleParagraph = (innerParagraphs, element, indent, firstLineIndent, blankLine) => {\n innerParagraphs.text = this.indentText(innerParagraphs.text, indent ?? 0, Math.max(0, (firstLineIndent ?? 0) + (indent ?? 0)));\n if (element.attr('blank-line') === 'true') {\n blankLine = true;\n }\n else if (element.attr('blank-line') === 'false') {\n blankLine = false;\n }\n if (blankLine || blankLine === undefined) {\n return this.makeBox(innerParagraphs, 'block', element);\n }\n else {\n return this.makeBox(innerParagraphs, 'newline', element);\n }\n };\n writeElementTrees(elements, $, element) {\n const children = elements\n .toArray()\n .filter(element => element.type !== 'comment')\n .map(element => {\n if (element.type === 'text') {\n return { text: element.data, before: '', after: '', mappings: [], multimedia: [] };\n }\n else {\n return this.writeElementTreeImpl($(element), $);\n }\n });\n return this.concatMarkdownBoxes(children, element);\n }\n handleList(listStyle, listSelf, $) {\n let indexIncrement = 0;\n const renderListItem = (item) => {\n const selectedItem = $(item);\n if (item.type === 'text') {\n return this.makeBox(item.data, 'inline', selectedItem);\n }\n if (!selectedItem.is('item')) {\n return this.writeElementTreeImpl(selectedItem, $);\n }\n let bullet;\n ++indexIncrement;\n switch (listStyle) {\n case 'star':\n bullet = '* ';\n break;\n case 'dash':\n bullet = '- ';\n break;\n case 'plus':\n bullet = '+ ';\n break;\n case 'decimal':\n bullet = `${indexIncrement}. `;\n break;\n case 'latin':\n bullet = String.fromCharCode(0x61 + indexIncrement - 1) + '. ';\n break;\n default:\n this.raiseError(`Invalid list style: ${listStyle}`, selectedItem);\n return this.makeBox('', 'block', selectedItem);\n }\n const paragraph = this.writeElementTrees(selectedItem.contents(), $);\n const paragraphWithBullet = this.wrapBox(paragraph, bullet, '', selectedItem);\n const doubleNewLine = paragraphWithBullet.text.includes('\\n\\n');\n return this.handleParagraph(paragraphWithBullet, selectedItem, bullet.length, -bullet.length, doubleNewLine);\n };\n const items = listSelf\n .contents()\n .toArray()\n .map(item => renderListItem(item));\n return this.handleParagraph(this.concatMarkdownBoxes(items, listSelf), listSelf);\n }\n processMultipleTableRows(elements, $) {\n const escapeInTable = (text) => {\n return text.replace(/\\|/g, '\\\\|');\n };\n return elements\n .contents()\n .toArray()\n .map(element => {\n if (!$(element).is('trow')) {\n this.raiseError(`Invalid table head, expect trow: ${element}`, $(element));\n return [];\n }\n return $(element)\n .contents()\n .toArray()\n .map(cell => {\n if (!$(cell).is('tcell')) {\n this.raiseError(`Invalid table cell, expect tcell: ${cell}`, $(element));\n return '';\n }\n return escapeInTable(this.writeElementTrees($(cell).contents(), $).text);\n });\n });\n }\n handleTable(tableHeadElements, tableBodyElements, tableElement, $) {\n const tableHead = this.processMultipleTableRows(tableHeadElements, $);\n const tableBody = this.processMultipleTableRows(tableBodyElements, $);\n const numberOfColumns = Math.max(...tableHead.map(row => row.length), ...tableBody.map(row => row.length));\n const columnWidths = [...Array(numberOfColumns).keys()].map(i => {\n return Math.max(...tableHead.map(row => (row[i] ? row[i].length : 0)), ...tableBody.map(row => (row[i] ? row[i].length : 0)));\n });\n // TODO: alignment and collapse config\n // Currently follows the format here: https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/organizing-information-with-tables\n const makeRow = (row, isHeader) => {\n if (isHeader && row.length !== numberOfColumns) {\n row = [...row, ...[...Array(numberOfColumns - row.length).keys()].map(() => '')];\n }\n return ('| ' +\n row\n .map((cell, i) => {\n if (this.options.markdownTableCollapse) {\n return cell + ' |';\n }\n else {\n return cell.padEnd(columnWidths[i]) + ' |';\n }\n })\n .join(' '));\n };\n const makeSeparator = () => {\n return ('| ' +\n columnWidths\n .map(width => '-'.repeat(this.options.markdownTableCollapse && width >= 3 ? 3 : width))\n .join(' | ') +\n ' |');\n };\n const renderedTable = [\n ...tableHead.map(row => makeRow(row, true)),\n makeSeparator(),\n ...tableBody.map(row => makeRow(row, false))\n ];\n return this.makeBox(renderedTable.join('\\n'), 'block', tableElement);\n }\n writeElementTreeImpl(element, $) {\n if (element.is('p')) {\n let paragraphs = this.writeElementTrees(element.contents(), $, element);\n return this.handleParagraph(paragraphs, element);\n }\n else if (element.is('span')) {\n return this.makeBox(this.writeElementTrees(element.contents(), $, element), 'inline', element);\n }\n else if (element.is('nl')) {\n const nlText = '\\n'.repeat(parseInt(element.attr('count') || '1'));\n return {\n text: nlText,\n before: '',\n after: '',\n mappings: [this.createMappingNode(element, nlText.length)],\n multimedia: []\n };\n }\n else if (element.is('h')) {\n let paragraphs = this.writeElementTrees(element.contents(), $, element);\n const level = parseInt(element.attr('level') || '1') + this.options.markdownBaseHeaderLevel - 1;\n return this.handleParagraph(this.wrapBoxEveryLine(paragraphs, '#'.repeat(level) + ' ', ''), element);\n }\n else if (element.is('b')) {\n return this.wrapBox(this.writeElementTrees(element.contents(), $, element), '**', '**', element);\n }\n else if (element.is('i')) {\n return this.wrapBox(this.writeElementTrees(element.contents(), $, element), '*', '*', element);\n }\n else if (element.is('s')) {\n return this.wrapBox(this.writeElementTrees(element.contents(), $, element), '~~', '~~', element);\n }\n else if (element.is('u')) {\n return this.wrapBox(this.writeElementTrees(element.contents(), $, element), '__', '__', element);\n }\n else if (element.is('code')) {\n let paragraphs;\n if (element.attr('inline') === 'false') {\n const lang = element.attr('lang') || '';\n paragraphs = this.wrapBox(this.writeElementTrees(element.contents(), $, element), '```' + lang + '\\n', '\\n```');\n return this.handleParagraph(paragraphs, element);\n }\n else {\n // inline = true or undefined\n return this.wrapBox(this.writeElementTrees(element.contents(), $, element), '`', '`', element);\n }\n }\n else if (element.is('table')) {\n const contents = element.contents();\n if (contents.length !== 2 ||\n (!contents.first().is('thead') && !contents.first().is('tbody'))) {\n return this.raiseErrorAndReturnEmpty(`Invalid table, expect two children thead and tbody: ${element}`, element);\n }\n const [tableHeadElements, tableBodyElements] = contents.toArray();\n return this.handleParagraph(this.handleTable($(tableHeadElements), $(tableBodyElements), $(element), $), element);\n }\n else if (element.is('thead') ||\n element.is('tbody') ||\n element.is('trow') ||\n element.is('tcell')) {\n return this.raiseErrorAndReturnEmpty('thead, tbody, trow, tcell do not appear alone without a table context', element);\n }\n else if (element.is('list')) {\n const listStyle = element.attr('list-style');\n return this.handleList(listStyle || 'dash', element, $);\n }\n else if (element.is('item')) {\n return this.raiseErrorAndReturnEmpty('item does not appear alone without a list context', element);\n }\n else if (element.is('env')) {\n if (element.attr('presentation') === 'markup' &&\n element.attr('markup-lang') === this.markupLanguage()) {\n return this.makeBox(this.writeElementTrees(element.contents(), $, element), 'inline', element);\n }\n else {\n const content = new EnvironmentDispatcher(this.ir).writeElementTree(element, $);\n const { output, mappings, multimedia } = content;\n return this.makeBox({ text: output, before: '', after: '', mappings, multimedia }, 'inline', $(element));\n }\n }\n else {\n return this.raiseErrorAndReturnEmpty(`Not implemented element type ${element}`, element);\n }\n }\n writeElementTree(element, $) {\n const markdownBox = this.writeElementTreeImpl(element, $);\n return {\n output: markdownBox.text,\n mappings: markdownBox.mappings,\n multimedia: markdownBox.multimedia\n };\n }\n markupLanguage() {\n return 'markdown';\n }\n}\nexport class HtmlWriter extends Writer {\n inTableHead = false;\n initializeOptions(options) {\n return {\n htmlPrettyPrint: options?.htmlPrettyPrint ?? true,\n htmlIndent: options?.htmlIndent ?? ' '\n };\n }\n handleTableHeadBody(document, element, $) {\n if (!(element.is('thead') || element.is('tbody') || element.is('tcell') || element.is('trow'))) {\n this.raiseError(`Only thead, tbody and tcell should be handled, not ${element}`, element);\n return;\n }\n const originalTableHead = this.inTableHead;\n if (element.is('thead')) {\n this.inTableHead = true;\n }\n if (element.is('tcell')) {\n if (this.inTableHead) {\n this.fillNodeContents(document.ele('th'), element, $);\n }\n else {\n this.fillNodeContents(document.ele('td'), element, $);\n }\n }\n else if (element.is('trow')) {\n this.fillNodeContents(document.ele('tr'), element, $);\n }\n else {\n const tagName = element.is('thead') ? 'thead' : 'tbody';\n this.fillNodeContents(document.ele(tagName), element, $);\n }\n this.inTableHead = originalTableHead;\n }\n fillNodeContents(document, element, $) {\n element\n .contents()\n .toArray()\n .forEach(child => {\n if (child.type === 'text') {\n document.txt(child.data);\n }\n else {\n this.addNode(document, $(child), $);\n }\n });\n }\n addNode(document, element, $) {\n if (element.is('h')) {\n const level = element.attr('level') || '1';\n const tagName = `h${level}`;\n this.fillNodeContents(document.ele(tagName), element, $);\n }\n else if (element.is('code')) {\n this.fillNodeContents(document.ele('pre').ele('code'), element, $);\n }\n else if (element.is('nl')) {\n const count = parseInt(element.attr('count') || '1');\n for (let i = 0; i < count; i++) {\n document.ele('br');\n }\n }\n else if (element.is('thead') ||\n element.is('tbody') ||\n element.is('trow') ||\n element.is('tcell')) {\n this.handleTableHeadBody(document, element, $);\n }\n else if (element.is('env')) {\n if (element.attr('presentation') === 'markup' && element.attr('markup-lang') === 'html') {\n this.fillNodeContents(document, element, $);\n }\n else {\n const inner = new EnvironmentDispatcher(this.ir).writeElementTree(element, $);\n if (inner.multimedia.length > 0) {\n this.raiseError('Multimedia cannot be nested in HTML.', element);\n }\n document.txt(inner.output);\n }\n }\n else {\n const tagName = element.prop('tagName')?.toLowerCase() || 'div';\n this.fillNodeContents(document.ele(tagName), element, $);\n }\n }\n writeElementTree(element, $) {\n const document = xmlbuilder.create();\n this.addNode(document, element, $);\n const html = document.end({\n prettyPrint: this.options.htmlPrettyPrint,\n indent: this.options.htmlIndent,\n headless: true\n });\n return {\n output: html,\n mappings: [this.createMappingNode(element, html.length)],\n multimedia: []\n };\n }\n}\nexport class CsvWriter extends MarkdownWriter {\n handleTable(tableHeadElemen