// pomljs
// Version: (not recorded)
// Prompt Orchestration Markup Language
// 1,138 lines (1,137 loc) • 69.2 kB
// JavaScript
import * as cheerio from 'cheerio';
import * as xmlbuilder from 'xmlbuilder2';
import { ErrorCollection, WriteError, SystemError, richContentFromSourceMap, ValidSpeakers } from './base.js';
import yaml from 'js-yaml';
import { encodingForModel } from 'js-tiktoken';
// Placeholder character that stands in for a multimedia item inside the
// rendered output text. When the source map is built, an output slice that
// consists solely of this character is not emitted as a text segment.
const SPECIAL_CHARACTER = 'À';
/**
 * Base class of the IR writers.
 *
 * A writer parses an IR (XML) string, renders it through
 * {@link Writer.writeElementTree} (implemented by subclasses) and turns the
 * result into rich content, speaker messages and source-map segments.
 */
class Writer {
  ir = '';
  options;
  // Tokenizer instances keyed by model name, so each encoding is created at
  // most once per writer instance.
  tokenizerCache = {};

  /**
   * @param ir - Optional IR string to remember for error reporting.
   * @param options - Writer options; normalized by {@link initializeOptions}.
   */
  constructor(ir, options) {
    if (ir) {
      this.reset(ir);
    }
    this.options = this.initializeOptions(options);
  }

  /**
   * Normalize the user supplied options. Subclasses override this to fill in
   * their own defaults.
   */
  initializeOptions(options) {
    return options || {};
  }

  /** Replace the IR string this writer is working on. */
  reset(ir) {
    this.ir = ir;
  }

  /**
   * Truncate `text` so it respects a character limit and/or a token limit.
   *
   * @param text - The text to truncate.
   * @param charLimit - Maximum number of characters, or `undefined` for no limit.
   * @param tokenLimit - Maximum number of tokens, or `undefined` for no limit.
   * @param options - Overrides for `truncateMarker`, `truncateDirection`
   *   (`'start'`, `'middle'` or anything else for `'end'`) and
   *   `tokenEncodingModel`. Falls back to the writer options.
   * @returns The original text when nothing was cut; otherwise the remaining
   *   text with the marker placed where content was removed.
   */
  truncateText(text, charLimit, tokenLimit, options) {
    const {
      truncateMarker = ' (...truncated)',
      truncateDirection = 'end',
      tokenEncodingModel = 'gpt-4o'
    } = options || this.options;
    let truncated = text;
    let changed = false;
    // For 'middle' truncation: offset inside `truncated` where the kept head
    // ends and the kept tail begins, i.e. where the marker belongs.
    let seam;

    if (charLimit !== undefined && truncated.length > charLimit) {
      changed = true;
      if (truncateDirection === 'start') {
        truncated = truncated.slice(truncated.length - charLimit);
      } else if (truncateDirection === 'middle') {
        const head = Math.ceil(charLimit / 2);
        const tail = charLimit - head;
        truncated = truncated.slice(0, head) + truncated.slice(truncated.length - tail);
        seam = head;
      } else {
        truncated = truncated.slice(0, charLimit);
      }
    }

    // A NaN limit can never be exceeded, so skip it before touching the tokenizer.
    if (tokenLimit !== undefined && !Number.isNaN(tokenLimit)) {
      // Optimization: a token covers at least one byte, so a text whose byte
      // count is within the limit cannot exceed it and needs no tokenizer.
      const byteCount = Buffer.byteLength(truncated, 'utf8');
      if (byteCount > tokenLimit) {
        let enc = this.tokenizerCache[tokenEncodingModel];
        if (!enc) {
          enc = encodingForModel(tokenEncodingModel);
          this.tokenizerCache[tokenEncodingModel] = enc;
        }
        const tokens = enc.encode(truncated);
        if (tokens.length > tokenLimit) {
          changed = true;
          if (truncateDirection === 'start') {
            truncated = enc.decode(tokens.slice(tokens.length - tokenLimit));
          } else if (truncateDirection === 'middle') {
            const head = Math.ceil(tokenLimit / 2);
            const tail = tokenLimit - head;
            const headTokens = tokens.slice(0, head);
            truncated = enc.decode(headTokens.concat(tokens.slice(tokens.length - tail)));
            // Head and tail tokens rarely decode to the same number of
            // characters, so measure the head instead of assuming the middle.
            seam = Math.min(enc.decode(headTokens).length, truncated.length);
          } else {
            truncated = enc.decode(tokens.slice(0, tokenLimit));
          }
        }
      }
    }

    if (!changed) {
      return text;
    }
    if (truncateDirection === 'start') {
      return truncateMarker + truncated;
    }
    if (truncateDirection === 'middle') {
      const mid = seam ?? Math.ceil(truncated.length / 2);
      return truncated.slice(0, mid) + truncateMarker + truncated.slice(mid);
    }
    return truncated + truncateMarker;
  }

  /**
   * Create the mapping node for `element`, covering an output of
   * `outputLength` characters that starts at offset 0.
   *
   * `originalStart` / `originalEnd` come from the `original-start-index` and
   * `original-end-index` attributes and are `undefined` when the attribute is
   * missing or not an integer.
   */
  createMappingNode(element, outputLength) {
    const parseAttrAsInt = attrName => {
      const attrValue = element.attr(attrName);
      const parsed = attrValue === undefined ? NaN : Number.parseInt(attrValue, 10);
      return Number.isNaN(parsed) ? undefined : parsed;
    };
    return {
      originalStart: parseAttrAsInt('original-start-index'),
      originalEnd: parseAttrAsInt('original-end-index'),
      inputStart: element[0].startIndex,
      inputEnd: element[0].endIndex,
      outputStart: 0,
      outputEnd: outputLength - 1
    };
  }

  /**
   * Add an offset to mapping nodes.
   *
   * A mapping that starts at or after `ignoreBefore` is moved as a whole. A
   * mapping that starts before `ignoreBefore` but ends at or after it keeps
   * its start and only has its end moved, because the inserted text lies
   * inside the range it covers.
   *
   * @param mappings - Original mappings.
   * @param indent - The offset amount.
   * @param ignoreBefore - Ignore the mappings before this index.
   * @returns - The new mappings.
   */
  indentMappings(mappings, indent, ignoreBefore) {
    return mappings.map(mapping => {
      const shiftStart = mapping.outputStart >= ignoreBefore;
      // `shiftStart` alone also covers zero-length mappings, whose end is start - 1.
      const shiftEnd = shiftStart || mapping.outputEnd >= ignoreBefore;
      return {
        ...mapping,
        outputStart: shiftStart ? mapping.outputStart + indent : mapping.outputStart,
        outputEnd: shiftEnd ? mapping.outputEnd + indent : mapping.outputEnd
      };
    });
  }

  /**
   * Add an offset to the index of every multimedia item located at or after
   * `ignoreBefore`.
   */
  indentMultiMedia(multimedia, indent, ignoreBefore) {
    return multimedia.map(media => ({
      ...media,
      index: media.index >= ignoreBefore ? media.index + indent : media.index
    }));
  }

  /**
   * Record a {@link WriteError} for `element` in the global error collection.
   *
   * @returns An empty writer result, so callers can `return this.raiseError(...)`.
   */
  raiseError(message, element) {
    const emptyOutput = {
      output: '',
      multimedia: [],
      mappings: []
    };
    if (element.length === 0) {
      // Ignore the error if the element is not even ready
      return emptyOutput;
    }
    const parseAttrAsInt = attrName => {
      const attrValue = element.attr(attrName);
      const parsed = attrValue === undefined ? NaN : Number.parseInt(attrValue, 10);
      return Number.isNaN(parsed) ? undefined : parsed;
    };
    ErrorCollection.add(new WriteError(message, parseAttrAsInt('original-start-index'), parseAttrAsInt('original-end-index'), element[0].sourcePath, element[0].startIndex, element[0].endIndex, this.ir));
    return emptyOutput;
  }

  /**
   * Render `element` and its descendants. Must be implemented by subclasses.
   *
   * @throws SystemError always, in the base class.
   */
  writeElementTree(element, $) {
    throw new SystemError('Method not implemented.');
  }

  /**
   * Convert an IR string into {@link RichContent} without exposing mapping information.
   *
   * The method delegates to {@link writeWithSourceMap} and then collapses the
   * returned segments back into a single rich content value.
   */
  write(ir) {
    const segments = this.writeWithSourceMap(ir);
    return richContentFromSourceMap(segments);
  }

  /**
   * Convert an IR string into an array of speaker messages.
   *
   * It internally uses {@link writeMessagesWithSourceMap} and removes the
   * mapping information from each message.
   */
  writeMessages(ir) {
    const messages = this.writeMessagesWithSourceMap(ir);
    return messages.map(m => ({
      speaker: m.speaker,
      content: richContentFromSourceMap(m.content)
    }));
  }

  /**
   * Split the output into consecutive ranges, each owned by one speaker.
   *
   * The element tree is walked from the root. An element speaks with its own
   * `speaker` attribute, else with the speaker inherited from its ancestors or
   * preceding siblings, else with the default speaker (`system` until a
   * `human` speaker has been seen, `human` afterwards).
   *
   * @param result - Partial writer result; only `mappings` is read.
   * @param $ - The cheerio instance loaded with the IR.
   * @returns Ranges `{ start, end, speaker }` in output coordinates.
   */
  assignSpeakers(result, $) {
    const speakers = [];
    let defaultSpeaker = 'system';
    let systemSpeakerSpecified = false;
    const segments = [];
    const querySegmentFromMapping = (startIndex, endIndex) => {
      return result.mappings.find(segment => segment.inputStart === startIndex && segment.inputEnd === endIndex);
    };
    const getSpecifiedSpeaker = element => {
      const speaker = element.attr('speaker');
      if (speaker && !ValidSpeakers.includes(speaker)) {
        this.raiseError(`"${speaker}" is not a valid speaker.`, element);
        return undefined;
      }
      return speaker;
    };
    const assignSpeakerForElement = (element, inheritedSpeaker) => {
      const specifiedSpeaker = getSpecifiedSpeaker(element);
      if (specifiedSpeaker === 'system') {
        systemSpeakerSpecified = true;
      }
      // When human has appeared, the default speaker becomes human.
      if (specifiedSpeaker === 'human' && defaultSpeaker === 'system') {
        defaultSpeaker = 'human';
      }
      if (element.length === 0) {
        return;
      }
      const segment = querySegmentFromMapping(element[0].startIndex, element[0].endIndex);
      if (specifiedSpeaker && !segment) {
        console.warn(`Speaker is specified but no exact corresponding output can be found in ${element.html()}`);
      }
      const speaker = specifiedSpeaker || inheritedSpeaker || defaultSpeaker;
      if (segment) {
        segments.push({ start: segment.outputStart, end: segment.outputEnd, speaker });
      }
      if (specifiedSpeaker) {
        inheritedSpeaker = specifiedSpeaker;
      }
      // `inheritedSpeaker` is deliberately carried from one child to the next:
      // a child that names a speaker also sets it for its following siblings.
      element.children().each((_, child) => {
        const childSpeaker = getSpecifiedSpeaker($(child));
        if (childSpeaker) {
          inheritedSpeaker = childSpeaker;
        }
        assignSpeakerForElement($(child), inheritedSpeaker);
      });
    };
    assignSpeakerForElement(this.getRoot($), undefined);

    // Every distinct segment boundary, in ascending order.
    const allIndicesSet = new Set();
    segments.forEach(segment => {
      allIndicesSet.add(segment.start);
      allIndicesSet.add(segment.end);
    });
    const essentialIndices = Array.from(allIndicesSet).sort((a, b) => a - b);
    const positionOf = new Map(essentialIndices.map((index, position) => [index, position]));

    // Paint each boundary with the speaker of the segments covering it. Later
    // segments (descendants, visited after their ancestors) overwrite earlier ones.
    const colorSpeakers = new Array(essentialIndices.length).fill('system');
    segments.forEach(segment => {
      const startIndex = positionOf.get(segment.start);
      const endIndex = positionOf.get(segment.end);
      for (let i = startIndex; i <= endIndex; i++) {
        colorSpeakers[i] = segment.speaker;
      }
    });

    // Merge runs of boundaries that share a speaker into one range.
    let currentStart = undefined;
    for (let i = 0; i < essentialIndices.length; i++) {
      const speaker = colorSpeakers[i];
      if (i === 0 || speaker !== colorSpeakers[i - 1]) {
        currentStart = essentialIndices[i];
      }
      if (i === essentialIndices.length - 1 || speaker !== colorSpeakers[i + 1]) {
        // time to end this segment
        if (currentStart === undefined) {
          throw new SystemError('currentStart is not expected to be undefined');
        }
        speakers.push({ start: currentStart, end: essentialIndices[i], speaker: speaker });
      }
    }
    // If there's only one speaker and it's system, change it to human.
    if (speakers.length === 1 && speakers[0].speaker === 'system' && !systemSpeakerSpecified) {
      speakers[0].speaker = 'human';
    }
    return speakers;
  }

  /**
   * Render the IR string and return detailed mapping for each produced content
   * segment.
   *
   * Each returned {@link SourceMapRichContent} describes the slice of the input
   * IR that generated the piece of output.
   */
  writeWithSourceMap(ir) {
    const result = this.generateWriterResult(ir);
    const segments = this.buildSourceMap(result);
    return segments.map(s => ({
      startIndex: s.inputStart,
      endIndex: s.inputEnd,
      irStartIndex: s.irStart,
      irEndIndex: s.irEnd,
      content: s.content
    }));
  }

  /**
   * Similar to {@link writeWithSourceMap} but groups the segments into speaker
   * messages.
   */
  writeMessagesWithSourceMap(ir) {
    const result = this.generateWriterResult(ir);
    const segments = this.buildSourceMap(result);
    return result.speakers.map(sp => {
      const msgSegs = segments.filter(seg => seg.outStart >= sp.start && seg.outEnd <= sp.end);
      const nonWs = msgSegs.filter(seg => !(typeof seg.content === 'string' && seg.content.trim() === ''));
      // Use only non-whitespace segments when computing the overall source range
      // for this message so that trailing or leading padding does not expand the
      // reported span. If the message contains nothing but whitespace we fall
      // back to considering all segments.
      const relevant = nonWs.length ? nonWs : msgSegs;
      if (!relevant.length) {
        // No segment falls inside this speaker range, so no source position
        // can be determined: emit an empty message with zeroed indices.
        return {
          startIndex: 0,
          endIndex: 0,
          irStartIndex: 0,
          irEndIndex: 0,
          speaker: sp.speaker,
          content: []
        };
      }
      return {
        startIndex: Math.min(...relevant.map(seg => seg.inputStart)),
        endIndex: Math.max(...relevant.map(seg => seg.inputEnd)),
        irStartIndex: Math.min(...relevant.map(seg => seg.irStart)),
        irEndIndex: Math.max(...relevant.map(seg => seg.irEnd)),
        speaker: sp.speaker,
        content: msgSegs.map(seg => ({
          startIndex: seg.inputStart,
          endIndex: seg.inputEnd,
          irStartIndex: seg.irStart,
          irEndIndex: seg.irEnd,
          content: seg.content
        }))
      };
    });
  }

  /**
   * Transform a {@link WriterResult} into discrete source map segments.
   *
   * The segments are ordered so that rich content can be reconstructed in
   * the correct visual order while preserving multimedia positioning.
   */
  buildSourceMap(result) {
    // Collect every boundary within the output that could signify a change in
    // source location. These come from the input/output mappings as well as
    // multimedia positions. Splitting the output on these boundaries ensures
    // each segment corresponds to a single source range.
    const boundaries = new Set();
    result.mappings.forEach(m => {
      boundaries.add(m.outputStart);
      boundaries.add(m.outputEnd + 1);
    });
    result.multimedia.forEach(m => {
      boundaries.add(m.index);
      boundaries.add(m.index + 1);
    });
    boundaries.add(0);
    boundaries.add(result.output.length);
    const points = Array.from(boundaries).sort((a, b) => a - b);
    // `top` multimedia should appear before all textual content while `bottom`
    // multimedia should come last. We therefore keep three buckets and merge
    // them at the end.
    const topSegments = [];
    const middleSegments = [];
    const bottomSegments = [];
    // Widest known original range; used when a slice has no mapping that
    // carries original indices of its own.
    const originalStartIndices = result.mappings
      .map(m => m.originalStart)
      .filter(m => m !== undefined);
    const sourceStartIndex = originalStartIndices.length > 0 ? Math.min(...originalStartIndices) : 0;
    const originalEndIndices = result.mappings.map(m => m.originalEnd).filter(m => m !== undefined);
    const sourceEndIndex = originalEndIndices.length > 0 ? Math.max(...originalEndIndices) : 0;
    for (let i = 0; i < points.length - 1; i++) {
      const start = points[i];
      const end = points[i + 1];
      if (start >= end) {
        continue;
      }
      const slice = result.output.slice(start, end);
      // Find the most specific mapping that covers this slice. This allows the
      // resulting segment to map back to the tightest IR range responsible for
      // the output.
      let chosen;
      // The chosen IR might not have a precise original start or end index, so we
      // choose a fallback based on the original mappings.
      let chosenOriginal;
      for (const m of result.mappings) {
        if (start >= m.outputStart && end - 1 <= m.outputEnd) {
          if (!chosen || m.outputEnd - m.outputStart < chosen.outputEnd - chosen.outputStart) {
            chosen = m;
          }
          if (m.originalStart !== undefined &&
            m.originalEnd !== undefined &&
            (!chosenOriginal ||
              m.originalEnd - m.originalStart <
                chosenOriginal.originalEnd - chosenOriginal.originalStart)) {
            chosenOriginal = m;
          }
        }
      }
      if (!chosen) {
        // Mappings must be non-empty here because the points are derived from the
        // mappings. If we cannot find a mapping, use the first one as a fallback.
        chosen = result.mappings[0];
      }
      // If a multimedia item starts at this boundary, emit it instead of text.
      const media = result.multimedia.find(m => m.index === start);
      if (media) {
        const { position, index, ...rest } = media;
        const segment = {
          outStart: start,
          outEnd: end - 1,
          irStart: chosen.inputStart,
          irEnd: chosen.inputEnd,
          inputStart: chosenOriginal?.originalStart ?? sourceStartIndex,
          inputEnd: chosenOriginal?.originalEnd ?? sourceEndIndex,
          content: [rest]
        };
        if (position === 'top') {
          topSegments.push(segment);
        } else if (position === 'bottom') {
          bottomSegments.push(segment);
        } else {
          middleSegments.push(segment);
        }
      } else if (slice !== SPECIAL_CHARACTER && slice.length > 0) {
        // Normal textual slice.
        middleSegments.push({
          outStart: start,
          outEnd: end - 1,
          irStart: chosen.inputStart,
          irEnd: chosen.inputEnd,
          inputStart: chosenOriginal?.originalStart ?? sourceStartIndex,
          inputEnd: chosenOriginal?.originalEnd ?? sourceEndIndex,
          content: slice
        });
      }
    }
    middleSegments.sort((a, b) => a.outStart - b.outStart);
    // Order the buckets so that `top` items are emitted before any textual
    // content and `bottom` items are emitted last. When filtering these
    // segments by speaker boundaries, each top or bottom item still appears
    // within the correct message.
    return [...topSegments, ...middleSegments, ...bottomSegments];
  }

  /**
   * Execute the main writing logic and gather mapping, multimedia and speaker
   * information before it is broken down into smaller segments.
   */
  generateWriterResult(ir) {
    this.reset(ir);
    const $ = cheerio.load(ir, {
      scriptingEnabled: false,
      xml: { xmlMode: true, withStartIndices: true, withEndIndices: true }
    }, false);
    const partialResult = this.writeElementTree(this.getRoot($), $);
    return {
      input: ir,
      output: partialResult.output,
      mappings: partialResult.mappings,
      multimedia: partialResult.multimedia,
      speakers: this.assignSpeakers(partialResult, $)
    };
  }

  /** Return the first child of the document root, wrapped as a cheerio selection. */
  getRoot($) {
    return $($.root().children()[0]);
  }
}
/**
 * Entry-point writer that inspects an `env` element and hands the rendering
 * over to the writer matching its `presentation` (and `markup-lang` or
 * `serializer`) attributes.
 */
class EnvironmentDispatcher extends Writer {
  /**
   * Render `element` with the writer selected by its attributes.
   *
   * Elements other than `env` are rendered as markdown with default options.
   * An unknown presentation, markup language or serializer is reported via
   * `raiseError` and yields an empty result.
   */
  writeElementTree(element, $) {
    if (!element.is('env')) {
      // Not even an environment, consider writing it as a markdown
      return new MarkdownWriter(this.ir).writeElementTree(element, $);
    }

    // Writer options travel as a JSON string; a malformed value is reported
    // and rendering continues without options.
    let options = undefined;
    const rawOptions = element.attr('writer-options');
    if (rawOptions) {
      try {
        options = JSON.parse(rawOptions);
      } catch {
        this.raiseError(`Invalid JSON for writer-options: ${rawOptions}`, element);
      }
    }

    const delegateTo = WriterClass => new WriterClass(this.ir, options).writeElementTree(element, $);

    switch (element.attr('presentation')) {
      case 'markup': {
        const markupLanguage = element.attr('markup-lang') || 'markdown';
        switch (markupLanguage) {
          case 'markdown':
            return delegateTo(MarkdownWriter);
          case 'html':
            return delegateTo(HtmlWriter);
          case 'csv':
            return delegateTo(CsvWriter);
          case 'tsv':
            return delegateTo(TsvWriter);
          default:
            return this.raiseError(`Invalid markup language: ${markupLanguage}`, element);
        }
      }
      case 'serialize': {
        const serializer = element.attr('serializer') || 'json';
        switch (serializer) {
          case 'json':
            return delegateTo(JsonWriter);
          case 'yaml':
            return delegateTo(YamlWriter);
          case 'xml':
            return delegateTo(XmlWriter);
          default:
            return this.raiseError(`Invalid serializer: ${serializer}`, element);
        }
      }
      case 'free':
        return delegateTo(FreeWriter);
      case 'multimedia':
        return delegateTo(MultiMediaWriter);
      default:
        return this.raiseError(`Invalid presentation: ${element}`, element);
    }
  }
}
/**
 * Writer that renders the IR as markdown.
 *
 * Rendering works on "boxes": `{ text, before, after, mappings, multimedia,
 * priority? }`, where `before` / `after` hold the whitespace the box wants
 * around it. Boxes are combined by {@link MarkdownWriter.concatMarkdownBoxes}.
 */
class MarkdownWriter extends Writer {
  /** Fill in the markdown specific defaults for every option not provided. */
  initializeOptions(options) {
    options = options || {};
    return {
      markdownBaseHeaderLevel: options.markdownBaseHeaderLevel ?? 1,
      markdownTableCollapse: options.markdownTableCollapse ?? false,
      csvSeparator: options.csvSeparator ?? ',',
      csvHeader: options.csvHeader ?? true,
      truncateMarker: options.truncateMarker ?? ' (...truncated)',
      truncateDirection: options.truncateDirection ?? 'end',
      tokenEncodingModel: options.tokenEncodingModel ?? 'gpt-4o'
    };
  }

  /** Record an error for `element` and return an empty box. */
  raiseErrorAndReturnEmpty(message, element) {
    this.raiseError(message, element);
    return { text: '', before: '', after: '', mappings: [], multimedia: [] };
  }

  /**
   * Turn a string or an existing box into a box owned by `element`.
   *
   * The text is truncated according to the element's `char-limit` and
   * `token-limit` attributes, and the element's `priority` is attached.
   *
   * @param text - A plain string or a box.
   * @param layout - `'block'` (blank line around), `'newline'` (line break
   *   around) or anything else for inline.
   * @param element - The cheerio element the box belongs to.
   */
  makeBox(text, layout, element) {
    const newBeforeAfter = layout === 'block' ? '\n\n' : layout === 'newline' ? '\n' : '';
    const charLimitAttr = element.attr('char-limit');
    const tokenLimitAttr = element.attr('token-limit');
    const priorityAttr = element.attr('priority');
    const charLimit = charLimitAttr !== undefined ? parseInt(charLimitAttr, 10) : undefined;
    const tokenLimit = tokenLimitAttr !== undefined ? parseInt(tokenLimitAttr, 10) : undefined;
    const priority = priorityAttr !== undefined ? parseFloat(priorityAttr) : undefined;
    if (typeof text === 'string') {
      const truncated = this.truncateText(text, charLimit, tokenLimit, this.options);
      return {
        text: truncated,
        before: newBeforeAfter,
        after: newBeforeAfter,
        mappings: [this.createMappingNode(element, truncated.length)],
        multimedia: [],
        priority
      };
    }
    const truncated = this.truncateText(text.text, charLimit, tokenLimit, this.options);
    return {
      text: truncated,
      before: this.consolidateSpace(newBeforeAfter, text.before),
      after: this.consolidateSpace(text.after, newBeforeAfter),
      mappings: [...text.mappings, this.createMappingNode(element, truncated.length)],
      multimedia: text.multimedia,
      priority
    };
  }

  /**
   * Surround the box text with `wrapBefore` / `wrapAfter`, moving the existing
   * mappings and multimedia by the length of the prefix. When `element` is
   * given, a mapping covering the whole wrapped text is added for it.
   */
  wrapBox(box, wrapBefore, wrapAfter, element) {
    const text = wrapBefore + box.text + wrapAfter;
    const mappings = this.indentMappings(box.mappings, wrapBefore.length, 0);
    if (element) {
      mappings.push(this.createMappingNode(element, text.length));
    }
    return {
      text: text,
      before: box.before,
      after: box.after,
      mappings: mappings,
      multimedia: this.indentMultiMedia(box.multimedia, wrapBefore.length, 0)
    };
  }

  /**
   * Surround every line of the box text with `wrapBefore` / `wrapAfter`.
   *
   * Mappings and multimedia are moved by the prefix length once per line, for
   * everything located at or after the start of that line.
   */
  wrapBoxEveryLine(box, wrapBefore, wrapAfter) {
    let accumulatedLength = 0;
    let mappings = box.mappings;
    let multimedia = box.multimedia;
    const wrappedLines = [];
    for (const line of box.text.split('\n')) {
      const wrapped = wrapBefore + line + wrapAfter;
      mappings = this.indentMappings(mappings, wrapBefore.length, accumulatedLength);
      multimedia = this.indentMultiMedia(multimedia, wrapBefore.length, accumulatedLength);
      accumulatedLength += wrapped.length + 1; // length of '\n'
      wrappedLines.push(wrapped);
    }
    return {
      text: wrappedLines.join('\n'),
      before: box.before,
      after: box.after,
      mappings: mappings,
      multimedia: multimedia
    };
  }

  /**
   * Join two whitespace runs, merging the longest suffix of `space1` that is
   * also a prefix of `space2` so it appears only once.
   */
  consolidateSpace(space1, space2) {
    let result = space1 + space2;
    const maxOverlap = Math.min(space1.length, space2.length);
    // Later (longer) overlaps overwrite shorter ones, so the longest wins.
    for (let i = 1; i <= maxOverlap; i++) {
      if (space1.slice(-i) === space2.slice(0, i)) {
        result = space1 + space2.slice(i);
      }
    }
    return result;
  }

  /**
   * Drop the lowest-priority boxes until the boxes fit the limits.
   *
   * On each round every box sharing the current minimum priority (missing
   * priority counts as 0) is removed. The loop stops once the limits are met
   * or all remaining boxes share the same priority.
   */
  reduceBoxesByLimit(boxes, charLimit, tokenLimit) {
    if (boxes.length === 0 || (charLimit === undefined && tokenLimit === undefined)) {
      return boxes;
    }
    const tokenModel = this.options.tokenEncodingModel || 'gpt-4o';
    // Token lengths depend on the text only; remember them so each round of
    // the loop below does not tokenize the surviving boxes again.
    const tokenLengthCache = new Map();
    const getTokenLength = t => {
      if (tokenLimit === undefined) {
        return 0;
      }
      const cached = tokenLengthCache.get(t);
      if (cached !== undefined) {
        return cached;
      }
      let length;
      // Optimization: Use byte count as conservative estimate before tokenizing
      const byteCount = Buffer.byteLength(t, 'utf8');
      const BYTES_PER_TOKEN_ESTIMATE = 4;
      // Heuristic: a text whose byte count is within the token limit is not
      // tokenized; its length is estimated from the byte count instead.
      if (byteCount <= tokenLimit) {
        length = Math.ceil(byteCount / BYTES_PER_TOKEN_ESTIMATE);
      } else {
        let enc = this.tokenizerCache[tokenModel];
        if (!enc) {
          enc = encodingForModel(tokenModel);
          this.tokenizerCache[tokenModel] = enc;
        }
        length = enc.encode(t).length;
      }
      tokenLengthCache.set(t, length);
      return length;
    };
    const totalChars = arr => arr.reduce((a, b) => a + b.text.length, 0);
    const totalTokens = arr => arr.reduce((a, b) => a + getTokenLength(b.text), 0);
    let current = [...boxes];
    while (current.length > 0) {
      const exceeds = (charLimit !== undefined && totalChars(current) > charLimit) ||
        (tokenLimit !== undefined && totalTokens(current) > tokenLimit);
      if (!exceeds) {
        break;
      }
      const priorities = current.map(b => b.priority ?? 0);
      const minP = Math.min(...priorities);
      if (current.every(b => (b.priority ?? 0) === minP)) {
        break;
      }
      current = current.filter(b => (b.priority ?? 0) !== minP);
    }
    return current;
  }

  /**
   * Concatenate child boxes into one box.
   *
   * @param boxes - The boxes to join, in document order.
   * @param element - Optional parent element; its `char-limit` and
   *   `token-limit` attributes bound the combined text.
   */
  concatMarkdownBoxes(boxes, element) {
    const charLimitAttr = element?.attr('char-limit');
    const tokenLimitAttr = element?.attr('token-limit');
    const charLimit = charLimitAttr !== undefined ? parseInt(charLimitAttr, 10) : undefined;
    const tokenLimit = tokenLimitAttr !== undefined ? parseInt(tokenLimitAttr, 10) : undefined;
    const multimedia = [];
    // Remove all spaces children before and after block elements
    // or between two multimedia-only nodes so images do not create
    // stray blank lines when placed consecutively.
    const isNewlineOnly = box => /^\n+$/.test(box.text);
    const isMultimediaOnly = box => box.multimedia.length > 0 && box.multimedia.length === box.text.length;
    let removedSpace = boxes;
    while (true) {
      const afterRemoveSpace = removedSpace.filter((child, i) => {
        const previous = i > 0 ? removedSpace[i - 1] : undefined;
        const next = i < removedSpace.length - 1 ? removedSpace[i + 1] : undefined;
        const afterBlock = previous !== undefined && (previous.after.includes('\n') || isNewlineOnly(previous));
        const beforeBlock = next !== undefined && (next.before.includes('\n') || isNewlineOnly(next));
        // When a whitespace-only box is sandwiched between two multimedia
        // boxes (e.g., two consecutive images), we treat it like the spaces
        // around a block element so it doesn't generate a blank line.
        const afterMedia = previous !== undefined && isMultimediaOnly(previous);
        const beforeMedia = next !== undefined && isMultimediaOnly(next);
        return !((afterBlock || beforeBlock || afterMedia || beforeMedia) &&
          /^[ \t]*$/.test(child.text));
      });
      if (afterRemoveSpace.length === removedSpace.length) {
        break;
      }
      // Repeat until no more space can be removed
      removedSpace = afterRemoveSpace;
    }
    removedSpace = this.reduceBoxesByLimit(removedSpace, charLimit, tokenLimit);
    // When concatenating, we handle 3 cases.
    // 1. If both ends are text, the same space characters will be overlapped and consolidated.
    // 2. If one end is text and the other end is multimedia (floated), the multimedia will be as if it doesn't exist.
    //    This case is only handled when it only contains multimedia. If there's text in between, we assume it's already handled.
    // 3. If one end is text and the other end is multimedia (adhered), the multimedia will eat up the space characters.
    const enumerate = list => list.map((box, index) => ({ box, index }));
    // See the comment above for the explanation.
    const asIfNotExist = box => {
      return (box.multimedia.length > 0 &&
        box.multimedia.length === box.text.length &&
        box.multimedia.every(media => media.position !== 'here'));
    };
    const textBoxQueue = enumerate(removedSpace).filter(({ box }) => !asIfNotExist(box));
    const multimediaQueue = enumerate(removedSpace).filter(({ box }) => asIfNotExist(box));
    const mappings = [];
    // When concatenating, make sure all multimedia boxes are skipped.
    // Multimedia boxes are instead directly adhered to the previous box.
    // Kinda like a merge sort.
    let text = '';
    let before = '';
    let after = '';
    let i = 0;
    let j = 0;
    while (i < textBoxQueue.length || j < multimediaQueue.length) {
      if (i === textBoxQueue.length ||
        (j < multimediaQueue.length && multimediaQueue[j].index < textBoxQueue[i].index)) {
        const multimediaBox = multimediaQueue[j].box;
        mappings.push(...this.indentMappings(multimediaBox.mappings, text.length, 0));
        multimedia.push(...this.indentMultiMedia(multimediaBox.multimedia, text.length, 0));
        text += multimediaBox.text;
        j++;
      } else {
        const box = textBoxQueue[i].box;
        if (i === 0) {
          before = box.before;
        }
        mappings.push(...this.indentMappings(box.mappings, text.length, 0));
        // It still could contain inner multimedia
        multimedia.push(...this.indentMultiMedia(box.multimedia, text.length, 0));
        text += box.text;
        if (i === textBoxQueue.length - 1) {
          after = box.after;
        } else {
          const nextBox = textBoxQueue[i + 1].box;
          let thisAfter;
          if (box.multimedia.some(media => media.position === 'here' && media.index + 1 === box.text.length)) {
            // Has an adhered multimedia at the end
            thisAfter = '';
          } else if (nextBox.multimedia.some(media => media.position === 'here' && media.index === 0)) {
            // The next box starts with an adhered multimedia
            thisAfter = '';
          } else {
            thisAfter = this.consolidateSpace(box.after, nextBox.before);
          }
          text += thisAfter;
        }
        i++;
      }
    }
    let finalText = text;
    if (charLimit !== undefined || tokenLimit !== undefined) {
      finalText = this.truncateText(finalText, charLimit, tokenLimit, this.options);
    }
    return { text: finalText, before, after, mappings, multimedia };
  }

  /**
   * Indent every non-empty line by `indent` spaces; the first line uses
   * `firstLineIndent` spaces instead. Empty lines are left untouched.
   */
  indentText(text, indent, firstLineIndent) {
    return text
      .split('\n')
      .map((line, i) => {
        if (!line) {
          return line;
        }
        return ' '.repeat(i === 0 ? firstLineIndent : indent) + line;
      })
      .join('\n');
  }

  /**
   * Finish a paragraph-like box: indent its text and wrap it as a block
   * (blank line around) or newline box.
   *
   * The element's `blank-line` attribute (`'true'` / `'false'`) overrides the
   * `blankLine` argument; when neither is given a blank line is used.
   * Note: the text of `innerParagraphs` is modified in place.
   */
  handleParagraph = (innerParagraphs, element, indent, firstLineIndent, blankLine) => {
    innerParagraphs.text = this.indentText(innerParagraphs.text, indent ?? 0, Math.max(0, (firstLineIndent ?? 0) + (indent ?? 0)));
    if (element.attr('blank-line') === 'true') {
      blankLine = true;
    } else if (element.attr('blank-line') === 'false') {
      blankLine = false;
    }
    if (blankLine || blankLine === undefined) {
      return this.makeBox(innerParagraphs, 'block', element);
    }
    return this.makeBox(innerParagraphs, 'newline', element);
  };

  /**
   * Render a list of sibling nodes (comments are skipped, text nodes are used
   * verbatim) and concatenate the resulting boxes.
   */
  writeElementTrees(elements, $, element) {
    const children = elements
      .toArray()
      .filter(node => node.type !== 'comment')
      .map(node => {
        if (node.type === 'text') {
          return { text: node.data, before: '', after: '', mappings: [], multimedia: [] };
        }
        return this.writeElementTreeImpl($(node), $);
      });
    return this.concatMarkdownBoxes(children, element);
  }

  /**
   * Render a `list` element.
   *
   * @param listStyle - `'star'`, `'dash'`, `'plus'`, `'decimal'` or `'latin'`.
   * @param listSelf - The list element.
   */
  handleList(listStyle, listSelf, $) {
    let indexIncrement = 0;
    const renderListItem = item => {
      const selectedItem = $(item);
      if (item.type === 'text') {
        return this.makeBox(item.data, 'inline', selectedItem);
      }
      if (!selectedItem.is('item')) {
        return this.writeElementTreeImpl(selectedItem, $);
      }
      let bullet;
      ++indexIncrement;
      switch (listStyle) {
        case 'star':
          bullet = '* ';
          break;
        case 'dash':
          bullet = '- ';
          break;
        case 'plus':
          bullet = '+ ';
          break;
        case 'decimal':
          bullet = `${indexIncrement}. `;
          break;
        case 'latin':
          // 'a' for the first item, 'b' for the second, ...
          bullet = String.fromCharCode(0x61 + indexIncrement - 1) + '. ';
          break;
        default:
          this.raiseError(`Invalid list style: ${listStyle}`, selectedItem);
          return this.makeBox('', 'block', selectedItem);
      }
      const paragraph = this.writeElementTrees(selectedItem.contents(), $);
      const paragraphWithBullet = this.wrapBox(paragraph, bullet, '', selectedItem);
      const doubleNewLine = paragraphWithBullet.text.includes('\n\n');
      // Continuation lines are indented by the bullet width; the negative
      // first-line indent cancels it for the line that carries the bullet.
      return this.handleParagraph(paragraphWithBullet, selectedItem, bullet.length, -bullet.length, doubleNewLine);
    };
    const items = listSelf
      .contents()
      .toArray()
      .map(item => renderListItem(item));
    return this.handleParagraph(this.concatMarkdownBoxes(items, listSelf), listSelf);
  }

  /**
   * Render the rows of a `thead` / `tbody` element.
   *
   * @returns One array of cell texts per row, with `|` escaped. Invalid rows
   *   yield an empty array and invalid cells an empty string, after an error
   *   has been recorded.
   */
  processMultipleTableRows(elements, $) {
    const escapeInTable = text => text.replace(/\|/g, '\\|');
    return elements
      .contents()
      .toArray()
      .map(element => {
        if (!$(element).is('trow')) {
          // Interpolate the cheerio wrapper: a raw node prints "[object Object]".
          this.raiseError(`Invalid table head, expect trow: ${$(element)}`, $(element));
          return [];
        }
        return $(element)
          .contents()
          .toArray()
          .map(cell => {
            if (!$(cell).is('tcell')) {
              this.raiseError(`Invalid table cell, expect tcell: ${$(cell)}`, $(element));
              return '';
            }
            return escapeInTable(this.writeElementTrees($(cell).contents(), $).text);
          });
      });
  }

  /**
   * Render a table as a pipe table: header rows, a separator line, body rows.
   *
   * Columns are padded to the widest cell unless the `markdownTableCollapse`
   * option is set. A table without any row renders only the separator line.
   */
  handleTable(tableHeadElements, tableBodyElements, tableElement, $) {
    const tableHead = this.processMultipleTableRows(tableHeadElements, $);
    const tableBody = this.processMultipleTableRows(tableBodyElements, $);
    // The leading 0 keeps Math.max finite when there are no rows at all.
    const numberOfColumns = Math.max(0, ...tableHead.map(row => row.length), ...tableBody.map(row => row.length));
    const columnWidths = [...Array(numberOfColumns).keys()].map(i => {
      return Math.max(0, ...tableHead.map(row => (row[i] ? row[i].length : 0)), ...tableBody.map(row => (row[i] ? row[i].length : 0)));
    });
    // TODO: alignment and collapse config
    // Currently follows the format here: https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/organizing-information-with-tables
    const makeRow = (row, isHeader) => {
      // Only header rows are padded with empty cells up to the column count.
      const cells = isHeader && row.length !== numberOfColumns
        ? [...row, ...Array(numberOfColumns - row.length).fill('')]
        : row;
      return ('| ' +
        cells
          .map((cell, i) => {
            if (this.options.markdownTableCollapse) {
              return cell + ' |';
            }
            return cell.padEnd(columnWidths[i]) + ' |';
          })
          .join(' '));
    };
    const makeSeparator = () => {
      return ('| ' +
        columnWidths
          .map(width => '-'.repeat(this.options.markdownTableCollapse && width >= 3 ? 3 : width))
          .join(' | ') +
        ' |');
    };
    const renderedTable = [
      ...tableHead.map(row => makeRow(row, true)),
      makeSeparator(),
      ...tableBody.map(row => makeRow(row, false))
    ];
    return this.makeBox(renderedTable.join('\n'), 'block', tableElement);
  }

  /**
   * Render a single element into a box, dispatching on the element name.
   * Unknown elements are reported and produce an empty box.
   */
  writeElementTreeImpl(element, $) {
    if (element.is('p')) {
      const paragraphs = this.writeElementTrees(element.contents(), $, element);
      return this.handleParagraph(paragraphs, element);
    }
    if (element.is('span')) {
      return this.makeBox(this.writeElementTrees(element.contents(), $, element), 'inline', element);
    }
    if (element.is('nl')) {
      const nlText = '\n'.repeat(parseInt(element.attr('count') || '1', 10));
      return {
        text: nlText,
        before: '',
        after: '',
        mappings: [this.createMappingNode(element, nlText.length)],
        multimedia: []
      };
    }
    if (element.is('h')) {
      const paragraphs = this.writeElementTrees(element.contents(), $, element);
      const level = parseInt(element.attr('level') || '1', 10) + this.options.markdownBaseHeaderLevel - 1;
      return this.handleParagraph(this.wrapBoxEveryLine(paragraphs, '#'.repeat(level) + ' ', ''), element);
    }
    if (element.is('b')) {
      return this.wrapBox(this.writeElementTrees(element.contents(), $, element), '**', '**', element);
    }
    if (element.is('i')) {
      return this.wrapBox(this.writeElementTrees(element.contents(), $, element), '*', '*', element);
    }
    if (element.is('s')) {
      return this.wrapBox(this.writeElementTrees(element.contents(), $, element), '~~', '~~', element);
    }
    if (element.is('u')) {
      return this.wrapBox(this.writeElementTrees(element.contents(), $, element), '__', '__', element);
    }
    if (element.is('code')) {
      if (element.attr('inline') === 'false') {
        const lang = element.attr('lang') || '';
        // No element is passed to wrapBox: handleParagraph adds the mapping.
        const paragraphs = this.wrapBox(this.writeElementTrees(element.contents(), $, element), '```' + lang + '\n', '\n```');
        return this.handleParagraph(paragraphs, element);
      }
      // inline = true or undefined
      return this.wrapBox(this.writeElementTrees(element.contents(), $, element), '`', '`', element);
    }
    if (element.is('table')) {
      const contents = element.contents();
      // NOTE(review): only the first child is type-checked (and `tbody` is
      // accepted for it); the second child is not checked here. Confirm
      // whether the intent was "first is thead and second is tbody".
      if (contents.length !== 2 ||
        (!contents.first().is('thead') && !contents.first().is('tbody'))) {
        return this.raiseErrorAndReturnEmpty(`Invalid table, expect two children thead and tbody: ${element}`, element);
      }
      const [tableHeadElements, tableBodyElements] = contents.toArray();
      return this.handleParagraph(this.handleTable($(tableHeadElements), $(tableBodyElements), $(element), $), element);
    }
    if (element.is('thead') ||
      element.is('tbody') ||
      element.is('trow') ||
      element.is('tcell')) {
      return this.raiseErrorAndReturnEmpty('thead, tbody, trow, tcell do not appear alone without a table context', element);
    }
    if (element.is('list')) {
      const listStyle = element.attr('list-style');
      return this.handleList(listStyle || 'dash', element, $);
    }
    if (element.is('item')) {
      return this.raiseErrorAndReturnEmpty('item does not appear alone without a list context', element);
    }
    if (element.is('env')) {
      // Apply the same default as EnvironmentDispatcher. Without it, an env
      // lacking `markup-lang` is dispatched back to a MarkdownWriter, which
      // dispatches it again, and so on without end.
      const markupLanguage = element.attr('markup-lang') || 'markdown';
      if (element.attr('presentation') === 'markup' && markupLanguage === this.markupLanguage()) {
        return this.makeBox(this.writeElementTrees(element.contents(), $, element), 'inline', element);
      }
      const content = new EnvironmentDispatcher(this.ir).writeElementTree(element, $);
      const { output, mappings, multimedia } = content;
      return this.makeBox({ text: output, before: '', after: '', mappings, multimedia }, 'inline', $(element));
    }
    return this.raiseErrorAndReturnEmpty(`Not implemented element type ${element}`, element);
  }

  /** Render `element` and return the writer result (`output`, `mappings`, `multimedia`). */
  writeElementTree(element, $) {
    const markdownBox = this.writeElementTreeImpl(element, $);
    return {
      output: markdownBox.text,
      mappings: markdownBox.mappings,
      multimedia: markdownBox.multimedia
    };
  }

  /** The `markup-lang` value this writer renders inline. */
  markupLanguage() {
    return 'markdown';
  }
}
/**
 * Writer that serializes an element tree to HTML by building an xmlbuilder2
 * document.
 *
 * Table parts (`thead`, `tbody`, `trow`, `tcell`) are mapped to their HTML
 * counterparts; `h`, `code`, `nl` and `env` get dedicated handling; any other
 * element is emitted under its own lower-cased tag name.
 */
class HtmlWriter extends Writer {
    /** True while the subtree being emitted sits inside a `thead`; makes `tcell` render as `th`. */
    inTableHead = false;

    /**
     * Builds the option object used by this writer. Only the two HTML options
     * are kept; each falls back to a default when it is null or undefined.
     *
     * @param {object} [options] - Caller-supplied options.
     * @returns {{htmlPrettyPrint: *, htmlIndent: *}} Resolved options.
     */
    initializeOptions(options) {
        return {
            htmlPrettyPrint: options?.htmlPrettyPrint ?? true,
            htmlIndent: options?.htmlIndent ?? ' '
        };
    }

    /**
     * Emits one table part (`thead`, `tbody`, `trow` or `tcell`) under
     * `document` and recurses into its children.
     *
     * Any other element is reported through `raiseError` and nothing is
     * emitted for it.
     *
     * @param document - Builder node that receives the new child.
     * @param element - Wrapped table-part element.
     * @param $ - Wrapper function for the document the element belongs to.
     */
    handleTableHeadBody(document, element, $) {
        const isHead = element.is('thead');
        const isCell = element.is('tcell');
        const isRow = element.is('trow');
        if (!(isHead || isCell || isRow || element.is('tbody'))) {
            this.raiseError(`Only thead, tbody, trow and tcell should be handled, not ${element}`, element);
            return;
        }
        const originalTableHead = this.inTableHead;
        if (isHead) {
            this.inTableHead = true;
        }
        try {
            let tagName;
            if (isCell) {
                // Cells become header cells only while a thead is being written.
                tagName = this.inTableHead ? 'th' : 'td';
            }
            else if (isRow) {
                tagName = 'tr';
            }
            else {
                tagName = isHead ? 'thead' : 'tbody';
            }
            this.fillNodeContents(document.ele(tagName), element, $);
        }
        finally {
            // Restore the flag even when a nested write throws; otherwise a
            // reused writer would keep emitting <th> outside any header.
            this.inTableHead = originalTableHead;
        }
    }

    /**
     * Appends every child of `element` to `document`: text nodes are added as
     * text, everything else is handed to `addNode`.
     *
     * @param document - Builder node that receives the children.
     * @param element - Wrapped element whose contents are written.
     * @param $ - Wrapper function for the document the element belongs to.
     */
    fillNodeContents(document, element, $) {
        for (const child of element.contents().toArray()) {
            if (child.type === 'text') {
                document.txt(child.data);
            }
            else {
                this.addNode(document, $(child), $);
            }
        }
    }

    /**
     * Appends the HTML rendering of `element` (and its subtree) to `document`.
     *
     * @param document - Builder node that receives the rendering.
     * @param element - Wrapped element to render.
     * @param $ - Wrapper function for the document the element belongs to.
     */
    addNode(document, element, $) {
        if (element.is('h')) {
            // The level attribute is used verbatim as the heading number.
            const level = element.attr('level') || '1';
            this.fillNodeContents(document.ele(`h${level}`), element, $);
        }
        else if (element.is('code')) {
            // NOTE(review): the `inline` attribute is not consulted here, unlike
            // the markdown rendering of `code`; confirm <pre><code> is intended
            // for inline code as well.
            this.fillNodeContents(document.ele('pre').ele('code'), element, $);
        }
        else if (element.is('nl')) {
            // Explicit radix so the count is always read as decimal. A
            // non-numeric count yields NaN and therefore no <br> at all.
            const count = Number.parseInt(element.attr('count') || '1', 10);
            for (let i = 0; i < count; i++) {
                document.ele('br');
            }
        }
        else if (element.is('thead') ||
            element.is('tbody') ||
            element.is('trow') ||
            element.is('tcell')) {
            this.handleTableHeadBody(document, element, $);
        }
        else if (element.is('env')) {
            if (element.attr('presentation') === 'markup' && element.attr('markup-lang') === 'html') {
                // Same presentation and language: write the contents in place.
                this.fillNodeContents(document, element, $);
            }
            else {
                // Any other environment is rendered by its own writer and
                // embedded as plain text.
                const inner = new EnvironmentDispatcher(this.ir).writeElementTree(element, $);
                if (inner.multimedia.length > 0) {
                    this.raiseError('Multimedia cannot be nested in HTML.', element);
                }
                document.txt(inner.output);
            }
        }
        else {
            // Generic element: reuse its own tag name, or `div` when it has none.
            const tagName = element.prop('tagName')?.toLowerCase() || 'div';
            this.fillNodeContents(document.ele(tagName), element, $);
        }
    }

    /**
     * Renders `element` into an HTML string.
     *
     * @param element - Wrapped root element to render.
     * @param $ - Wrapper function for the document the element belongs to.
     * @returns {{output: string, mappings: Array, multimedia: Array}} The
     *   serialized HTML, a single mapping node covering the whole output, and
     *   an empty multimedia list.
     */
    writeElementTree(element, $) {
        const document = xmlbuilder.create();
        this.addNode(document, element, $);
        const html = document.end({
            prettyPrint: this.options.htmlPrettyPrint,
            indent: this.options.htmlIndent,
            headless: true
        });
        return {
            output: html,
            mappings: [this.createMappingNode(element, html.length)],
            multimedia: []
        };
    }
}
class CsvWriter extends MarkdownWriter {
handleTable(tableHeadElements, tableBodyElements, tableElement, $) {
const tableHead = this.processMultipleTableRows(tableHeadElements, $);
const tableBody = this.processMultipleTableRows(tableBodyElements, $);
const makeCell = (cell) => {
if (cell.includes(this.options.csvSeparator)) {
if (cell.includes('"')) {
cell = cell.replace(/"/g, '""');
}
cell = '"' + cell + '"';
}
return cell;
};
const makeRow = (row) => {
return row.map(makeCell).join(this.options.csvSeparator);
};
let renderedTable;
if (this.options.csvHeader) {
renderedTable = [...tableHead.map(makeRow), ...tableBody.map(makeRow)];
}
else {
renderedTable = [...tableBody.map(makeRow)];
}
return this.makeBox(renderedTable.join('\n'), 'block', tableElement);
}
writeElementTreeImpl(element, $) {
if (element.is('table') ||
element.is('thead') ||
element.is('tbody') ||
element.is('trow') ||
element.is('tcell') ||
element.is('env')) {
return super.writeElementTreeImpl(element, $);
}
else {
return this.raiseErrorAndReturnEmpty(`Not implemented element type in csv ${element}`, element);
}
}
ma