UNPKG

pomljs

Version:

Prompt Orchestration Markup Language

github.com/microsoft/poml

1,138 lines (1,137 loc) • 69.2 kB

JavaScript

import * as cheerio from 'cheerio'; import * as xmlbuilder from 'xmlbuilder2'; import { ErrorCollection, WriteError, SystemError, richContentFromSourceMap, ValidSpeakers } from './base.js'; import yaml from 'js-yaml'; import { encodingForModel } from 'js-tiktoken'; // Use the special character to indicate a placeholder for multimedia. const SPECIAL_CHARACTER = 'À'; class Writer { ir = ''; options; tokenizerCache = {}; constructor(ir, options) { if (ir) { this.reset(ir); } this.options = this.initializeOptions(options); } initializeOptions(options) { return options || {}; } reset(ir) { this.ir = ir; } truncateText(text, charLimit, tokenLimit, options) { const { truncateMarker = ' (...truncated)', truncateDirection = 'end', tokenEncodingModel = 'gpt-4o' } = options || this.options; let truncated = text; let changed = false; if (charLimit !== undefined && truncated.length > charLimit) { changed = true; if (truncateDirection === 'start') { truncated = truncated.slice(truncated.length - charLimit); } else if (truncateDirection === 'middle') { const head = Math.ceil(charLimit / 2); const tail = charLimit - head; truncated = truncated.slice(0, head) + truncated.slice(truncated.length - tail); } else { truncated = truncated.slice(0, charLimit); } } if (tokenLimit !== undefined) { // Optimization: Check byte count first to potentially bypass tokenizer loading // Since tokens are typically at least 1 byte, if byte count < token limit, we're safe const byteCount = Buffer.byteLength(truncated, 'utf8'); if (byteCount <= tokenLimit) ; else { let enc = this.tokenizerCache[tokenEncodingModel]; if (!enc) { enc = encodingForModel(tokenEncodingModel); this.tokenizerCache[tokenEncodingModel] = enc; } const tokens = enc.encode(truncated); if (tokens.length > tokenLimit) { changed = true; if (truncateDirection === 'start') { truncated = enc.decode(tokens.slice(tokens.length - tokenLimit)); } else if (truncateDirection === 'middle') { const head = Math.ceil(tokenLimit / 2); const tail = tokenLimit - head; truncated = enc.decode(tokens.slice(0, head).concat(tokens.slice(tokens.length - tail))); } else { truncated = enc.decode(tokens.slice(0, tokenLimit)); } } } } if (!changed) { return text; } if (truncateDirection === 'start') { return truncateMarker + truncated; } else if (truncateDirection === 'middle') { const mid = Math.ceil(truncated.length / 2); return truncated.slice(0, mid) + truncateMarker + truncated.slice(mid); } else { return truncated + truncateMarker; } } createMappingNode(element, outputLength) { const parseAttrAsInt = (attrName) => { const attrValue = element.attr(attrName); return attrValue !== undefined && !isNaN(parseInt(attrValue, 10)) ? parseInt(attrValue, 10) : undefined; }; return { originalStart: parseAttrAsInt('original-start-index'), originalEnd: parseAttrAsInt('original-end-index'), inputStart: element[0].startIndex, inputEnd: element[0].endIndex, outputStart: 0, outputEnd: outputLength - 1 }; } /** * Add an offset to mapping nodes. * * @param mappings - Original mappings. * @param indent - The offset amount. * @param ignoreBefore - Ignore the mappings before this index. * @returns - The new mappings. */ indentMappings(mappings, indent, ignoreBefore) { return mappings.map(mapping => { return { ...mapping, outputStart: mapping.outputStart >= ignoreBefore ? mapping.outputStart + indent : mapping.outputStart, outputEnd: mapping.outputStart >= ignoreBefore ? mapping.outputEnd + indent : mapping.outputEnd }; }); } indentMultiMedia(multimedia, indent, ignoreBefore) { return multimedia.map(media => { return { ...media, index: media.index >= ignoreBefore ? media.index + indent : media.index }; }); } raiseError(message, element) { const parseAttrAsInt = (attrName) => { const attrValue = element.attr(attrName); return attrValue !== undefined && !isNaN(parseInt(attrValue, 10)) ? parseInt(attrValue, 10) : undefined; }; const emptyOutput = { output: '', multimedia: [], mappings: [] }; if (element.length === 0) { // Ignore the error if the element is not even ready return emptyOutput; } ErrorCollection.add(new WriteError(message, parseAttrAsInt('original-start-index'), parseAttrAsInt('original-end-index'), element[0].sourcePath, element[0].startIndex, element[0].endIndex, this.ir)); return emptyOutput; } writeElementTree(element, $) { throw new SystemError('Method not implemented.'); } /** * Convert an IR string into {@link RichContent} without exposing mapping information. * * The method delegates to {@link writeWithSourceMap} and then collapses the * returned segments back into a single rich content value. */ write(ir) { const segments = this.writeWithSourceMap(ir); return richContentFromSourceMap(segments); } /** * Convert an IR string into an array of speaker messages. * * It internally uses {@link writeMessagesWithSourceMap} and removes the * mapping information from each message. */ writeMessages(ir) { const messages = this.writeMessagesWithSourceMap(ir); return messages.map(m => ({ speaker: m.speaker, content: richContentFromSourceMap(m.content) })); } assignSpeakers(result, $) { const speakers = []; let defaultSpeaker = 'system'; let systemSpeakerSpecified = false; const segments = []; const querySegmentFromMapping = (startIndex, endIndex) => { return result.mappings.find(segment => segment.inputStart === startIndex && segment.inputEnd === endIndex); }; const getSpecifiedSpeaker = (element) => { const speaker = element.attr('speaker'); if (speaker && !ValidSpeakers.includes(speaker)) { this.raiseError(`"${speaker}" is not a valid speaker.`, element); return undefined; } return speaker; }; const assignSpeakerForElement = (element, inheritedSpeaker) => { let specifiedSpeaker = getSpecifiedSpeaker(element); if (specifiedSpeaker === 'system') { systemSpeakerSpecified = true; } // When human has appeared, the default speaker becomes human. if (specifiedSpeaker == 'human' && defaultSpeaker == 'system') { defaultSpeaker = 'human'; } if (element.length === 0) { return; } const segment = querySegmentFromMapping(element[0].startIndex, element[0].endIndex); if (specifiedSpeaker && !segment) { console.warn(`Speaker is specified but no exact corresponding output can be found in ${element.html()}`); } const speaker = specifiedSpeaker || inheritedSpeaker || defaultSpeaker; if (segment) { segments.push({ start: segment.outputStart, end: segment.outputEnd, speaker }); } if (specifiedSpeaker) { inheritedSpeaker = specifiedSpeaker; } element.children().each((_, child) => { const speaker = getSpecifiedSpeaker($(child)); if (speaker) { inheritedSpeaker = speaker; } assignSpeakerForElement($(child), inheritedSpeaker); }); }; assignSpeakerForElement(this.getRoot($), undefined); const allIndicesSet = new Set(); segments.forEach(segment => { allIndicesSet.add(segment.start); allIndicesSet.add(segment.end); }); const essentialIndices = Array.from(allIndicesSet).sort((a, b) => a - b); const colorSpeakers = new Array(essentialIndices.length).fill('system'); segments.forEach(segment => { const startIndex = essentialIndices.findIndex(index => index == segment.start); const endIndex = essentialIndices.findIndex(index => index == segment.end); for (let i = startIndex; i <= endIndex; i++) { colorSpeakers[i] = segment.speaker; } }); let currentStart = undefined; for (let i = 0; i < essentialIndices.length; i++) { const speaker = colorSpeakers[i]; if (i === 0 || (i > 0 && speaker !== colorSpeakers[i - 1])) { currentStart = essentialIndices[i]; } if (i === essentialIndices.length - 1 || (i < essentialIndices.length - 1 && speaker !== colorSpeakers[i + 1])) { // time to end this segment if (currentStart === undefined) { throw new SystemError('currentStart is not expected to be undefined'); } speakers.push({ start: currentStart, end: essentialIndices[i], speaker: speaker }); } } // If there's only one speaker and it's system, change it to human. if (speakers.length == 1 && speakers[0].speaker == 'system' && !systemSpeakerSpecified) { speakers[0].speaker = 'human'; } return speakers; } /** * Render the IR string and return detailed mapping for each produced content * segment. * * Each returned {@link SourceMapRichContent} describes the slice of the input * IR that generated the piece of output. */ writeWithSourceMap(ir) { const result = this.generateWriterResult(ir); const segments = this.buildSourceMap(result); return segments.map(s => ({ startIndex: s.inputStart, endIndex: s.inputEnd, irStartIndex: s.irStart, irEndIndex: s.irEnd, content: s.content })); } /** * Similar to {@link writeWithSourceMap} but groups the segments into speaker * messages. */ writeMessagesWithSourceMap(ir) { const result = this.generateWriterResult(ir); const segments = this.buildSourceMap(result); return result.speakers .map(sp => { const msgSegs = segments.filter(seg => seg.outStart >= sp.start && seg.outEnd <= sp.end); const nonWs = msgSegs.filter(seg => !(typeof seg.content === 'string' && seg.content.trim() === '')); // Use only non-whitespace segments when computing the overall source range // for this message so that trailing or leading padding does not expand the // reported span. If the message contains nothing but whitespace we fall // back to considering all segments. const relevant = nonWs.length ? nonWs : msgSegs; if (!relevant.length) { // If there are no relevant segments, we cannot produce an empty message. return { startIndex: 0, // in this case, we cannot determine the start index endIndex: 0, irStartIndex: 0, irEndIndex: 0, speaker: sp.speaker, content: [] }; } return { startIndex: Math.min(...relevant.map(seg => seg.inputStart)), endIndex: Math.max(...relevant.map(seg => seg.inputEnd)), irStartIndex: Math.min(...relevant.map(seg => seg.irStart)), irEndIndex: Math.max(...relevant.map(seg => seg.irEnd)), speaker: sp.speaker, content: msgSegs.map(seg => ({ startIndex: seg.inputStart, endIndex: seg.inputEnd, irStartIndex: seg.irStart, irEndIndex: seg.irEnd, content: seg.content })) }; }) .filter(msg => msg !== undefined); } /** * Transform a {@link WriterResult} into discrete source map segments. * * The segments are ordered so that rich content can be reconstructed in * the correct visual order while preserving multimedia positioning. */ buildSourceMap(result) { // Collect every boundary within the output that could signify a change in // source location. These come from the input/output mappings as well as // multimedia positions. Splitting the output on these boundaries ensures // each segment corresponds to a single source range. const boundaries = new Set(); result.mappings.forEach(m => { boundaries.add(m.outputStart); boundaries.add(m.outputEnd + 1); }); result.multimedia.forEach(m => { boundaries.add(m.index); boundaries.add(m.index + 1); }); boundaries.add(0); boundaries.add(result.output.length); const points = Array.from(boundaries).sort((a, b) => a - b); // `top` multimedia should appear before all textual content while `bottom` // multimedia should come last. We therefore keep three buckets and merge // them at the end. const topSegments = []; const middleSegments = []; const bottomSegments = []; const originalStartIndices = result.mappings .map(m => m.originalStart) .filter(m => m !== undefined); const sourceStartIndex = originalStartIndices.length > 0 ? Math.min(...originalStartIndices) : 0; const originalEndIndices = result.mappings.map(m => m.originalEnd).filter(m => m !== undefined); const sourceEndIndex = originalEndIndices.length > 0 ? Math.max(...originalEndIndices) : 0; for (let i = 0; i < points.length - 1; i++) { const start = points[i]; const end = points[i + 1]; if (start >= end) { continue; } const slice = result.output.slice(start, end); // Find the most specific mapping that covers this slice. This allows the // resulting segment to map back to the tightest IR range responsible for // the output. let chosen; // The chosen IR might not have a precise original start or end index, so we // choose a fallback based on the original mappings. let chosenOriginal; for (const m of result.mappings) { if (start >= m.outputStart && end - 1 <= m.outputEnd) { if (!chosen || m.outputEnd - m.outputStart < chosen.outputEnd - chosen.outputStart) { chosen = m; } if (m.originalStart !== undefined && m.originalEnd !== undefined && (!chosenOriginal || m.originalEnd - m.originalStart < chosenOriginal.originalEnd - chosenOriginal.originalStart)) { chosenOriginal = m; } } } if (!chosen) { // Mappings must be non-empty here because the points are derived from the // mappings. If we cannot find a mapping, use the first one as a fallback. chosen = result.mappings[0]; } // If a multimedia item starts at this boundary, emit it instead of text. const media = result.multimedia.find(m => m.index === start); if (media) { const { position, index, ...rest } = media; const segment = { outStart: start, outEnd: end - 1, irStart: chosen.inputStart, irEnd: chosen.inputEnd, inputStart: chosenOriginal?.originalStart ?? sourceStartIndex, inputEnd: chosenOriginal?.originalEnd ?? sourceEndIndex, content: [rest] }; if (position === 'top') { topSegments.push(segment); } else if (position === 'bottom') { bottomSegments.push(segment); } else { middleSegments.push(segment); } } else if (slice !== SPECIAL_CHARACTER && slice.length > 0) { // Normal textual slice. middleSegments.push({ outStart: start, outEnd: end - 1, irStart: chosen.inputStart, irEnd: chosen.inputEnd, inputStart: chosenOriginal?.originalStart ?? sourceStartIndex, inputEnd: chosenOriginal?.originalEnd ?? sourceEndIndex, content: slice }); } } middleSegments.sort((a, b) => a.outStart - b.outStart); // Order the buckets so that `top` items are emitted before any textual // content and `bottom` items are emitted last. When filtering these // segments by speaker boundaries, each top or bottom item still appears // within the correct message. return [...topSegments, ...middleSegments, ...bottomSegments]; } /** * Execute the main writing logic and gather mapping, multimedia and speaker * information before it is broken down into smaller segments. */ generateWriterResult(ir) { this.reset(ir); const $ = cheerio.load(ir, { scriptingEnabled: false, xml: { xmlMode: true, withStartIndices: true, withEndIndices: true } }, false); const partialResult = this.writeElementTree(this.getRoot($), $); return { input: ir, output: partialResult.output, mappings: partialResult.mappings, multimedia: partialResult.multimedia, speakers: this.assignSpeakers(partialResult, $) }; } getRoot($) { return $($.root().children()[0]); } } class EnvironmentDispatcher extends Writer { writeElementTree(element, $) { if (element.is('env')) { let options = undefined; try { const optionsString = element.attr('writer-options'); if (optionsString) { options = JSON.parse(optionsString); } } catch (e) { this.raiseError(`Invalid JSON for writer-options: ${element.attr('writer-options')}`, element); } if (element.attr('presentation') === 'markup') { const markupLanguage = element.attr('markup-lang') || 'markdown'; if (markupLanguage === 'markdown') { return new MarkdownWriter(this.ir, options).writeElementTree(element, $); } else if (markupLanguage === 'html') { return new HtmlWriter(this.ir, options).writeElementTree(element, $); } else if (markupLanguage === 'csv') { return new CsvWriter(this.ir, options).writeElementTree(element, $); } else if (markupLanguage === 'tsv') { return new TsvWriter(this.ir, options).writeElementTree(element, $); } else { return this.raiseError(`Invalid markup language: ${markupLanguage}`, element); } } else if (element.attr('presentation') === 'serialize') { const serializer = element.attr('serializer') || 'json'; if (serializer === 'json') { return new JsonWriter(this.ir, options).writeElementTree(element, $); } else if (serializer === 'yaml') { return new YamlWriter(this.ir, options).writeElementTree(element, $); } else if (serializer === 'xml') { return new XmlWriter(this.ir, options).writeElementTree(element, $); } else { return this.raiseError(`Invalid serializer: ${serializer}`, element); } } else if (element.attr('presentation') === 'free') { return new FreeWriter(this.ir, options).writeElementTree(element, $); } else if (element.attr('presentation') === 'multimedia') { return new MultiMediaWriter(this.ir, options).writeElementTree(element, $); } else { return this.raiseError(`Invalid presentation: ${element}`, element); } } else { // Not even an environment, consider writing it as a markdown return new MarkdownWriter(this.ir).writeElementTree(element, $); } } } class MarkdownWriter extends Writer { initializeOptions(options) { options = options || {}; return { markdownBaseHeaderLevel: options.markdownBaseHeaderLevel ?? 1, markdownTableCollapse: options.markdownTableCollapse ?? false, csvSeparator: options.csvSeparator ?? ',', csvHeader: options.csvHeader ?? true, truncateMarker: options.truncateMarker ?? ' (...truncated)', truncateDirection: options.truncateDirection ?? 'end', tokenEncodingModel: options.tokenEncodingModel ?? 'gpt-4o' }; } raiseErrorAndReturnEmpty(message, element) { this.raiseError(message, element); return { text: '', before: '', after: '', mappings: [], multimedia: [] }; } makeBox(text, layout, element) { const newBeforeAfter = layout === 'block' ? '\n\n' : layout === 'newline' ? '\n' : ''; const charLimitAttr = element.attr('char-limit'); const tokenLimitAttr = element.attr('token-limit'); const priorityAttr = element.attr('priority'); const charLimit = charLimitAttr !== undefined ? parseInt(charLimitAttr, 10) : undefined; const tokenLimit = tokenLimitAttr !== undefined ? parseInt(tokenLimitAttr, 10) : undefined; const priority = priorityAttr !== undefined ? parseFloat(priorityAttr) : undefined; if (typeof text === 'string') { const truncated = this.truncateText(text, charLimit, tokenLimit, this.options); return { text: truncated, before: newBeforeAfter, after: newBeforeAfter, mappings: [this.createMappingNode(element, truncated.length)], multimedia: [], priority }; } else { const combinedText = text.text; const truncated = this.truncateText(combinedText, charLimit, tokenLimit, this.options); return { text: truncated, before: this.consolidateSpace(newBeforeAfter, text.before), after: this.consolidateSpace(text.after, newBeforeAfter), mappings: [...text.mappings, this.createMappingNode(element, truncated.length)], multimedia: text.multimedia, priority }; } } wrapBox(box, wrapBefore, wrapAfter, element) { const text = wrapBefore + box.text + wrapAfter; const mappings = this.indentMappings(box.mappings, wrapBefore.length, 0); if (element) { mappings.push(this.createMappingNode(element, text.length)); } return { text: text, before: box.before, after: box.after, mappings: mappings, multimedia: this.indentMultiMedia(box.multimedia, wrapBefore.length, 0) }; } wrapBoxEveryLine(box, wrapBefore, wrapAfter) { const lines = box.text.split('\n'); let accumulatedLength = 0; let mappings = box.mappings; let multimedia = box.multimedia; const text = lines .map(line => { const result = wrapBefore + line + wrapAfter; mappings = this.indentMappings(mappings, wrapBefore.length, accumulatedLength); multimedia = this.indentMultiMedia(multimedia, wrapBefore.length, accumulatedLength); accumulatedLength += result.length + 1; // length of '\n' return result; }) .join('\n'); return { text: text, before: box.before, after: box.after, mappings: mappings, multimedia: multimedia }; } consolidateSpace(space1, space2) { let result = space1 + space2; for (let i = 1; i <= Math.min(space1.length, space2.length); i++) { if (space1.slice(-i) === space2.slice(0, i)) { result = space1 + space2.slice(i); } } return result; } reduceBoxesByLimit(boxes, charLimit, tokenLimit) { if (boxes.length === 0 || (charLimit === undefined && tokenLimit === undefined)) { return boxes; } const tokenModel = this.options.tokenEncodingModel || 'gpt-4o'; const getTokenLength = (t) => { if (tokenLimit === undefined) { return 0; } // Optimization: Use byte count as conservative estimate before tokenizing const byteCount = Buffer.byteLength(t, 'utf8'); const BYTES_PER_TOKEN_ESTIMATE = 4; // If byte count is small enough, we can estimate it's within token limits // This is a heuristic - for very short strings, byte count ≈ token count if (byteCount <= tokenLimit) { return Math.ceil(byteCount / BYTES_PER_TOKEN_ESTIMATE); // Conservative estimate } let enc = this.tokenizerCache[tokenModel]; if (!enc) { enc = encodingForModel(tokenModel); this.tokenizerCache[tokenModel] = enc; } return enc.encode(t).length; }; const totalChars = (arr) => arr.reduce((a, b) => a + b.text.length, 0); const totalTokens = (arr) => arr.reduce((a, b) => a + getTokenLength(b.text), 0); let current = [...boxes]; while (current.length > 0) { const exceeds = (charLimit !== undefined && totalChars(current) > charLimit) || (tokenLimit !== undefined && totalTokens(current) > tokenLimit); if (!exceeds) { break; } const priorities = current.map(b => b.priority ?? 0); const minP = Math.min(...priorities); if (current.every(b => (b.priority ?? 0) === minP)) { break; } current = current.filter(b => (b.priority ?? 0) !== minP); } return current; } concatMarkdownBoxes(boxes, element) { const charLimitAttr = element?.attr('char-limit'); const tokenLimitAttr = element?.attr('token-limit'); const charLimit = charLimitAttr !== undefined ? parseInt(charLimitAttr, 10) : undefined; const tokenLimit = tokenLimitAttr !== undefined ? parseInt(tokenLimitAttr, 10) : undefined; const multimedia = []; // Remove all spaces children before and after block elements // or between two multimedia-only nodes so images do not create // stray blank lines when placed consecutively. let removedSpace = boxes; while (true) { let afterRemoveSpace = removedSpace.filter((child, i) => { const afterBlock = i > 0 && (removedSpace[i - 1].after.includes('\n') || /^\n+$/.test(removedSpace[i - 1].text)); const beforeBlock = i < removedSpace.length - 1 && (removedSpace[i + 1].before.includes('\n') || /^\n+$/.test(removedSpace[i + 1].text)); // When a whitespace-only box is sandwiched between two multimedia // boxes (e.g., two consecutive images), we treat it like the spaces // around a block element so it doesn't generate a blank line. const afterMedia = i > 0 && removedSpace[i - 1].multimedia.length > 0 && removedSpace[i - 1].multimedia.length === removedSpace[i - 1].text.length; const beforeMedia = i < removedSpace.length - 1 && removedSpace[i + 1].multimedia.length > 0 && removedSpace[i + 1].multimedia.length === removedSpace[i + 1].text.length; return !((afterBlock || beforeBlock || afterMedia || beforeMedia) && /^[ \t]*$/.test(child.text)); }); if (afterRemoveSpace.length === removedSpace.length) { break; } // Repeat until no more space can be removed removedSpace = afterRemoveSpace; } removedSpace = this.reduceBoxesByLimit(removedSpace, charLimit, tokenLimit); // When concatenating, we handle 3 cases. // 1. If both ends are text, the same space characters will be overlapped and consolidated. // 2. If one end is text and the other end is multimedia (floated), the multimedia will be as if it doesn't exist. // This case is only handled when it only contains multimedia. If there's text in between, we assume it's already handled. // 3. If one end is text and the other end is multimedia (adhered), the multimedia will eat up the space characters. const enumerate = (boxes) => { return boxes.map((box, i) => { return { box, index: i }; }); }; // See the comment above for the explanation. const asIfNotExist = (box) => { return (box.multimedia.length > 0 && box.multimedia.length === box.text.length && box.multimedia.every(media => media.position !== 'here')); }; const textBoxQueue = enumerate(removedSpace).filter(({ box }) => !asIfNotExist(box)); const multimediaQueue = enumerate(removedSpace).filter(({ box }) => asIfNotExist(box)); const mappings = []; // When concatenating, make sure all multimedia boxes are skipped. // Multimedia boxes are instead directly adhered to the previous box. // Kinda like a merge sort. let text = ''; let before = ''; let after = ''; let i = 0, j = 0; while (i < textBoxQueue.length || j < multimediaQueue.length) { if (i === textBoxQueue.length || (j < multimediaQueue.length && multimediaQueue[j].index < textBoxQueue[i].index)) { const multimediaBox = multimediaQueue[j].box; mappings.push(...this.indentMappings(multimediaBox.mappings, text.length, 0)); multimedia.push(...this.indentMultiMedia(multimediaBox.multimedia, text.length, 0)); text += multimediaBox.text; j++; } else { const box = textBoxQueue[i].box; if (i === 0) { before = box.before; } mappings.push(...this.indentMappings(box.mappings, text.length, 0)); // It still could contain inner multimedia multimedia.push(...this.indentMultiMedia(box.multimedia, text.length, 0)); text += box.text; if (i === textBoxQueue.length - 1) { after = box.after; } else { let thisAfter; if (box.multimedia.filter(media => media.position === 'here' && media.index + 1 === box.text.length).length > 0) { // Has an adhered multimedia at the end thisAfter = ''; } else if (textBoxQueue[i + 1].box.multimedia.filter(media => media.position === 'here' && media.index === 0).length > 0) { thisAfter = ''; } else { thisAfter = this.consolidateSpace(box.after, textBoxQueue[i + 1].box.before); } text += thisAfter; } i++; } } let finalText = text; if (charLimit !== undefined || tokenLimit !== undefined) { finalText = this.truncateText(finalText, charLimit, tokenLimit, this.options); } return { text: finalText, before, after, mappings, multimedia }; } indentText(text, indent, firstLineIndent) { const lines = text.split('\n'); return lines .map((line, i) => { if (!line) { return line; } else if (i === 0) { return ' '.repeat(firstLineIndent) + line; } else { return ' '.repeat(indent) + line; } }) .join('\n'); } handleParagraph = (innerParagraphs, element, indent, firstLineIndent, blankLine) => { innerParagraphs.text = this.indentText(innerParagraphs.text, indent ?? 0, Math.max(0, (firstLineIndent ?? 0) + (indent ?? 0))); if (element.attr('blank-line') === 'true') { blankLine = true; } else if (element.attr('blank-line') === 'false') { blankLine = false; } if (blankLine || blankLine === undefined) { return this.makeBox(innerParagraphs, 'block', element); } else { return this.makeBox(innerParagraphs, 'newline', element); } }; writeElementTrees(elements, $, element) { const children = elements .toArray() .filter(element => element.type !== 'comment') .map(element => { if (element.type === 'text') { return { text: element.data, before: '', after: '', mappings: [], multimedia: [] }; } else { return this.writeElementTreeImpl($(element), $); } }); return this.concatMarkdownBoxes(children, element); } handleList(listStyle, listSelf, $) { let indexIncrement = 0; const renderListItem = (item) => { const selectedItem = $(item); if (item.type === 'text') { return this.makeBox(item.data, 'inline', selectedItem); } if (!selectedItem.is('item')) { return this.writeElementTreeImpl(selectedItem, $); } let bullet; ++indexIncrement; switch (listStyle) { case 'star': bullet = '* '; break; case 'dash': bullet = '- '; break; case 'plus': bullet = '+ '; break; case 'decimal': bullet = `${indexIncrement}. `; break; case 'latin': bullet = String.fromCharCode(0x61 + indexIncrement - 1) + '. '; break; default: this.raiseError(`Invalid list style: ${listStyle}`, selectedItem); return this.makeBox('', 'block', selectedItem); } const paragraph = this.writeElementTrees(selectedItem.contents(), $); const paragraphWithBullet = this.wrapBox(paragraph, bullet, '', selectedItem); const doubleNewLine = paragraphWithBullet.text.includes('\n\n'); return this.handleParagraph(paragraphWithBullet, selectedItem, bullet.length, -bullet.length, doubleNewLine); }; const items = listSelf .contents() .toArray() .map(item => renderListItem(item)); return this.handleParagraph(this.concatMarkdownBoxes(items, listSelf), listSelf); } processMultipleTableRows(elements, $) { const escapeInTable = (text) => { return text.replace(/\|/g, '\\|'); }; return elements .contents() .toArray() .map(element => { if (!$(element).is('trow')) { this.raiseError(`Invalid table head, expect trow: ${element}`, $(element)); return []; } return $(element) .contents() .toArray() .map(cell => { if (!$(cell).is('tcell')) { this.raiseError(`Invalid table cell, expect tcell: ${cell}`, $(element)); return ''; } return escapeInTable(this.writeElementTrees($(cell).contents(), $).text); }); }); } handleTable(tableHeadElements, tableBodyElements, tableElement, $) { const tableHead = this.processMultipleTableRows(tableHeadElements, $); const tableBody = this.processMultipleTableRows(tableBodyElements, $); const numberOfColumns = Math.max(...tableHead.map(row => row.length), ...tableBody.map(row => row.length)); const columnWidths = [...Array(numberOfColumns).keys()].map(i => { return Math.max(...tableHead.map(row => (row[i] ? row[i].length : 0)), ...tableBody.map(row => (row[i] ? row[i].length : 0))); }); // TODO: alignment and collapse config // Currently follows the format here: https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/organizing-information-with-tables const makeRow = (row, isHeader) => { if (isHeader && row.length !== numberOfColumns) { row = [...row, ...[...Array(numberOfColumns - row.length).keys()].map(() => '')]; } return ('| ' + row .map((cell, i) => { if (this.options.markdownTableCollapse) { return cell + ' |'; } else { return cell.padEnd(columnWidths[i]) + ' |'; } }) .join(' ')); }; const makeSeparator = () => { return ('| ' + columnWidths .map(width => '-'.repeat(this.options.markdownTableCollapse && width >= 3 ? 3 : width)) .join(' | ') + ' |'); }; const renderedTable = [ ...tableHead.map(row => makeRow(row, true)), makeSeparator(), ...tableBody.map(row => makeRow(row, false)) ]; return this.makeBox(renderedTable.join('\n'), 'block', tableElement); } writeElementTreeImpl(element, $) { if (element.is('p')) { let paragraphs = this.writeElementTrees(element.contents(), $, element); return this.handleParagraph(paragraphs, element); } else if (element.is('span')) { return this.makeBox(this.writeElementTrees(element.contents(), $, element), 'inline', element); } else if (element.is('nl')) { const nlText = '\n'.repeat(parseInt(element.attr('count') || '1')); return { text: nlText, before: '', after: '', mappings: [this.createMappingNode(element, nlText.length)], multimedia: [] }; } else if (element.is('h')) { let paragraphs = this.writeElementTrees(element.contents(), $, element); const level = parseInt(element.attr('level') || '1') + this.options.markdownBaseHeaderLevel - 1; return this.handleParagraph(this.wrapBoxEveryLine(paragraphs, '#'.repeat(level) + ' ', ''), element); } else if (element.is('b')) { return this.wrapBox(this.writeElementTrees(element.contents(), $, element), '**', '**', element); } else if (element.is('i')) { return this.wrapBox(this.writeElementTrees(element.contents(), $, element), '*', '*', element); } else if (element.is('s')) { return this.wrapBox(this.writeElementTrees(element.contents(), $, element), '~~', '~~', element); } else if (element.is('u')) { return this.wrapBox(this.writeElementTrees(element.contents(), $, element), '__', '__', element); } else if (element.is('code')) { let paragraphs; if (element.attr('inline') === 'false') { const lang = element.attr('lang') || ''; paragraphs = this.wrapBox(this.writeElementTrees(element.contents(), $, element), '```' + lang + '\n', '\n```'); return this.handleParagraph(paragraphs, element); } else { // inline = true or undefined return this.wrapBox(this.writeElementTrees(element.contents(), $, element), '`', '`', element); } } else if (element.is('table')) { const contents = element.contents(); if (contents.length !== 2 || (!contents.first().is('thead') && !contents.first().is('tbody'))) { return this.raiseErrorAndReturnEmpty(`Invalid table, expect two children thead and tbody: ${element}`, element); } const [tableHeadElements, tableBodyElements] = contents.toArray(); return this.handleParagraph(this.handleTable($(tableHeadElements), $(tableBodyElements), $(element), $), element); } else if (element.is('thead') || element.is('tbody') || element.is('trow') || element.is('tcell')) { return this.raiseErrorAndReturnEmpty('thead, tbody, trow, tcell do not appear alone without a table context', element); } else if (element.is('list')) { const listStyle = element.attr('list-style'); return this.handleList(listStyle || 'dash', element, $); } else if (element.is('item')) { return this.raiseErrorAndReturnEmpty('item does not appear alone without a list context', element); } else if (element.is('env')) { if (element.attr('presentation') === 'markup' && element.attr('markup-lang') === this.markupLanguage()) { return this.makeBox(this.writeElementTrees(element.contents(), $, element), 'inline', element); } else { const content = new EnvironmentDispatcher(this.ir).writeElementTree(element, $); const { output, mappings, multimedia } = content; return this.makeBox({ text: output, before: '', after: '', mappings, multimedia }, 'inline', $(element)); } } else { return this.raiseErrorAndReturnEmpty(`Not implemented element type ${element}`, element); } } writeElementTree(element, $) { const markdownBox = this.writeElementTreeImpl(element, $); return { output: markdownBox.text, mappings: markdownBox.mappings, multimedia: markdownBox.multimedia }; } markupLanguage() { return 'markdown'; } } class HtmlWriter extends Writer { inTableHead = false; initializeOptions(options) { return { htmlPrettyPrint: options?.htmlPrettyPrint ?? true, htmlIndent: options?.htmlIndent ?? ' ' }; } handleTableHeadBody(document, element, $) { if (!(element.is('thead') || element.is('tbody') || element.is('tcell') || element.is('trow'))) { this.raiseError(`Only thead, tbody and tcell should be handled, not ${element}`, element); return; } const originalTableHead = this.inTableHead; if (element.is('thead')) { this.inTableHead = true; } if (element.is('tcell')) { if (this.inTableHead) { this.fillNodeContents(document.ele('th'), element, $); } else { this.fillNodeContents(document.ele('td'), element, $); } } else if (element.is('trow')) { this.fillNodeContents(document.ele('tr'), element, $); } else { const tagName = element.is('thead') ? 'thead' : 'tbody'; this.fillNodeContents(document.ele(tagName), element, $); } this.inTableHead = originalTableHead; } fillNodeContents(document, element, $) { element .contents() .toArray() .forEach(child => { if (child.type === 'text') { document.txt(child.data); } else { this.addNode(document, $(child), $); } }); } addNode(document, element, $) { if (element.is('h')) { const level = element.attr('level') || '1'; const tagName = `h${level}`; this.fillNodeContents(document.ele(tagName), element, $); } else if (element.is('code')) { this.fillNodeContents(document.ele('pre').ele('code'), element, $); } else if (element.is('nl')) { const count = parseInt(element.attr('count') || '1'); for (let i = 0; i < count; i++) { document.ele('br'); } } else if (element.is('thead') || element.is('tbody') || element.is('trow') || element.is('tcell')) { this.handleTableHeadBody(document, element, $); } else if (element.is('env')) { if (element.attr('presentation') === 'markup' && element.attr('markup-lang') === 'html') { this.fillNodeContents(document, element, $); } else { const inner = new EnvironmentDispatcher(this.ir).writeElementTree(element, $); if (inner.multimedia.length > 0) { this.raiseError('Multimedia cannot be nested in HTML.', element); } document.txt(inner.output); } } else { const tagName = element.prop('tagName')?.toLowerCase() || 'div'; this.fillNodeContents(document.ele(tagName), element, $); } } writeElementTree(element, $) { const document = xmlbuilder.create(); this.addNode(document, element, $); const html = document.end({ prettyPrint: this.options.htmlPrettyPrint, indent: this.options.htmlIndent, headless: true }); return { output: html, mappings: [this.createMappingNode(element, html.length)], multimedia: [] }; } } class CsvWriter extends MarkdownWriter { handleTable(tableHeadElements, tableBodyElements, tableElement, $) { const tableHead = this.processMultipleTableRows(tableHeadElements, $); const tableBody = this.processMultipleTableRows(tableBodyElements, $); const makeCell = (cell) => { if (cell.includes(this.options.csvSeparator)) { if (cell.includes('"')) { cell = cell.replace(/"/g, '""'); } cell = '"' + cell + '"'; } return cell; }; const makeRow = (row) => { return row.map(makeCell).join(this.options.csvSeparator); }; let renderedTable; if (this.options.csvHeader) { renderedTable = [...tableHead.map(makeRow), ...tableBody.map(makeRow)]; } else { renderedTable = [...tableBody.map(makeRow)]; } return this.makeBox(renderedTable.join('\n'), 'block', tableElement); } writeElementTreeImpl(element, $) { if (element.is('table') || element.is('thead') || element.is('tbody') || element.is('trow') || element.is('tcell') || element.is('env')) { return super.writeElementTreeImpl(element, $); } else { return this.raiseErrorAndReturnEmpty(`Not implemented element type in csv ${element}`, element); } } ma