@paroicms/site-generator-plugin
Version:
ParoiCMS Site Generator Plugin
207 lines (206 loc) • 9.15 kB
JavaScript
/**
 * Parse an LLM response where each output tag describes one property of a
 * single result object.
 *
 * @param {string} llmResponse - Raw text returned by the LLM.
 * @param {Array<{tagName: string, key: string, format: string, optional?: boolean}>} outputTags
 *   Tag definitions: `tagName` is searched in the response, `key` is the
 *   property name used in the returned object.
 * @returns {Object} An object with one property per tag found in the response.
 *   Optional tags that are absent from the response are simply left out.
 * @throws {Error} When a required tag is missing, when a tag appears more than
 *   once, or when the tags are malformed (raised by `parseLlmRawTags`).
 */
export function parseLlmResponseAsProperties(llmResponse, outputTags) {
  const rawTags = parseLlmRawTags(
    llmResponse,
    outputTags.map((tag) => tag.tagName),
  );
  const rawTagByName = new Map(rawTags.map((tag) => [tag.tagName, tag]));

  // Only required tags count as missing: comparing raw lengths would reject
  // every response that legitimately omits an optional tag.
  const missingTags = outputTags.filter(
    (tag) => !tag.optional && !rawTagByName.has(tag.tagName),
  );
  if (missingTags.length > 0) {
    throw new Error(`Missing tags: ${missingTags.map((tag) => tag.tagName).join(", ")}`);
  }

  // A tag that appears several times is ambiguous for a single object; report
  // it by name instead of an empty "Missing tags" message.
  if (rawTagByName.size !== rawTags.length) {
    const duplicatedNames = rawTags
      .map((tag) => tag.tagName)
      .filter((name, index, names) => names.indexOf(name) !== index);
    throw new Error(`Duplicate tags: ${[...new Set(duplicatedNames)].join(", ")}`);
  }

  const resultObj = {};
  for (const outputTag of outputTags) {
    const raw = rawTagByName.get(outputTag.tagName);
    if (!raw) {
      // Necessarily an optional tag: required ones were checked above.
      continue;
    }
    resultObj[outputTag.key] = formatRawContent(raw.content, outputTag);
  }
  return resultObj;
}
/**
 * Parse an LLM response made of a repeated sequence of tags into a list of
 * objects. A new item begins as soon as a tag is seen whose key is already set
 * on the item being built.
 *
 * @param {string} llmResponse - Raw text returned by the LLM.
 * @param {Array<{tagName: string, key: string, format: string, optional?: boolean}>} outputTags
 *   Definitions of the tags that make up one item.
 * @param {Object} [options]
 * @param {{errorMessages: string[]}} [options.tolerateErrors] - When provided,
 *   problems are appended to `errorMessages` and the faulty item is dropped
 *   instead of throwing.
 * @param {string} [options.titleFromMarkdownHeading] - Key of the output tag
 *   that may be recovered from a Markdown heading when its tag is absent.
 * @returns {Object[]} The completed items, in order of appearance.
 * @throws {Error} On an empty or missing required tag when errors are not tolerated.
 */
export function parseLlmResponseAsList(llmResponse, outputTags, options = {}) {
  const { tolerateErrors, titleFromMarkdownHeading } = options;
  const tagNames = outputTags.map(({ tagName }) => tagName);
  const rawTags = parseLlmRawTags(llmResponse, tagNames, options);
  if (rawTags.length === 0) {
    return [];
  }

  const definitionByTagName = new Map();
  for (const definition of outputTags) {
    definitionByTagName.set(definition.tagName, definition);
  }
  let titleOutputTag;
  if (titleFromMarkdownHeading) {
    titleOutputTag = outputTags.find((definition) => definition.key === titleFromMarkdownHeading);
  }

  const items = [];
  let pending = {};
  // Position of the first tag of the pending item; undefined until one is seen.
  let pendingStart;
  // Start of the text window searched for a fallback heading. After each
  // completed item it is moved to that item's first tag, so the window of an
  // item runs from the first tag of the previous item to its own first tag.
  let gapStart = 0;

  // Validate the pending item and keep it when it is complete.
  const flushPending = (gapEnd) => {
    const item = tryCompleteItem(
      pending,
      outputTags,
      titleOutputTag,
      llmResponse,
      gapStart,
      gapEnd,
      options,
    );
    if (item) {
      items.push(item);
    }
  };

  for (const rawTag of rawTags) {
    const definition = definitionByTagName.get(rawTag.tagName);
    if (!definition) {
      // The raw parser only returns requested names, so this is a real bug.
      throw new Error(`Unexpected output tag "${rawTag.tagName}"`);
    }
    if (pendingStart === undefined) {
      pendingStart = rawTag.startPosition;
    }

    if (definition.key in pending) {
      // Same key seen twice: the pending item is over, a new one starts here.
      flushPending(pendingStart);
      gapStart = pendingStart;
      pendingStart = rawTag.startPosition;
      pending = {};
    }

    if (rawTag.content.trim() !== "") {
      pending[definition.key] = formatRawContent(rawTag.content, definition);
      continue;
    }
    if (definition.optional) {
      // An empty optional tag is treated like an absent one.
      continue;
    }

    const message = `Empty tag <${definition.tagName}>`;
    if (!tolerateErrors) {
      throw new Error(message);
    }
    tolerateErrors.errorMessages.push(message);
    // Discard what was collected so far for this item.
    pending = {};
    pendingStart = undefined;
  }

  // The last item has no following tag to trigger its completion.
  if (Object.keys(pending).length > 0) {
    flushPending(pendingStart ?? 0);
  }
  return items;
}
/**
 * Finalize an item: fill in a missing title from a Markdown heading when
 * possible, then verify that every required property is present.
 *
 * The heading is searched in `llmResponse.substring(gapStart, gapEnd)`, and
 * only when `titleOutputTag` is given and its key is not yet set on `obj`.
 * `obj` is modified in place.
 *
 * @returns The same `obj` when complete; `undefined` when a required property
 *   is missing and `options.tolerateErrors` is set (the message is appended to
 *   its `errorMessages`).
 * @throws {Error} When a required property is missing and errors are not tolerated.
 */
function tryCompleteItem(obj, outputTags, titleOutputTag, llmResponse, gapStart, gapEnd, options) {
  const { tolerateErrors: errorSink } = options;

  const needsTitleFallback = Boolean(titleOutputTag) && !(titleOutputTag.key in obj);
  if (needsTitleFallback) {
    const fallbackTitle = extractMarkdownTitle(llmResponse.substring(gapStart, gapEnd));
    if (fallbackTitle) {
      obj[titleOutputTag.key] = fallbackTitle;
    }
  }

  // Only the first missing required tag is reported.
  const firstMissing = outputTags.find((tag) => !tag.optional && !(tag.key in obj));
  if (firstMissing === undefined) {
    return obj;
  }

  const message = `Missing tag <${firstMissing.tagName}>`;
  if (!errorSink) {
    throw new Error(message);
  }
  errorSink.errorMessages.push(message);
  return undefined;
}
/**
 * Convert the raw text of a tag into its final value according to `tag.format`.
 *
 * @param {string} rawContent - Text found between the opening and closing tag.
 * @param {{format: string, optional?: boolean}} tag - Definition of the tag.
 * @returns The parsed value for "json", the text itself for "markdown" and
 *   "text", or `undefined` when the content is empty and the tag is optional.
 * @throws {Error} When the format is unknown; `JSON.parse` errors propagate as is.
 */
function formatRawContent(rawContent, tag) {
  const { format, optional } = tag;

  // An empty optional value is reported as "no value", whatever the format.
  if (optional && !rawContent) {
    return undefined;
  }
  if (format === "json") {
    return JSON.parse(rawContent);
  }
  if (format === "markdown" || format === "text") {
    return rawContent;
  }
  throw new Error(`Unknown format "${format}"`);
}
/**
 * Extract the text of the first Markdown ATX heading found in a text snippet.
 *
 * Handles any heading depth (`#` to `######`), drops a leading "Page N:" label
 * when something follows it, and removes bold markers (`**`).
 *
 * @param {string} text - Snippet to search.
 * @returns {string | undefined} The cleaned heading text, or `undefined` when
 *   there is no heading or the heading is empty once cleaned.
 */
export function extractMarkdownTitle(text) {
  // The separator after the `#` marks is limited to same-line whitespace
  // (`[^\S\r\n]`): with a plain `\s+`, an empty heading such as "##" followed by
  // a line break would swallow the break and return the next paragraph as title.
  const headingMatch = text.match(/^#{1,6}[^\S\r\n]+(.+)$/m);
  if (!headingMatch) {
    return undefined;
  }
  let title = headingMatch[1].trim();

  // Handle patterns like "Page 1: **Actual Title**" - keep just the title part.
  // The label is only dropped when at least one character follows it.
  const colonMatch = title.match(/^(?:Page\s+\d+\s*:\s*)?(.+)$/i);
  if (colonMatch) {
    title = colonMatch[1].trim();
  }

  // Remove every bold marker, whether it wraps the whole title or a part of it.
  title = title.replace(/\*\*/g, "").trim();
  return title || undefined;
}
/**
 * Scan an LLM response for `<name>…</name>` pairs of the given tag names and
 * return their trimmed contents in order of appearance.
 *
 * An opening tag is paired with the very next known tag found in the text:
 * nesting is not supported. Closing tags that follow no opening tag are ignored.
 *
 * @param {string} llmResponse - Raw text returned by the LLM.
 * @param {string[]} tagNames - Names of the tags to look for (matched literally).
 * @param {Object} [options]
 * @param {{errorMessages: string[]}} [options.tolerateErrors] - When provided,
 *   malformed tags are reported in `errorMessages` instead of throwing.
 * @returns {Array<{tagName: string, content: string, startPosition: number}>}
 *   `startPosition` is the index of the opening tag in `llmResponse`.
 * @throws {Error} On an unclosed, unterminated or mismatched tag when errors
 *   are not tolerated.
 */
export function parseLlmRawTags(llmResponse, tagNames, options = {}) {
  const { tolerateErrors } = options;
  const tagList = [];
  // Without names there is nothing to search; an empty alternation would
  // otherwise match the literal texts "<>" and "</>".
  if (tagNames.length === 0) {
    return tagList;
  }

  // Throw, or record the message when the caller tolerates errors.
  const reportError = (message) => {
    if (!tolerateErrors) {
      throw new Error(message);
    }
    tolerateErrors.errorMessages.push(message);
  };

  // Escape regex metacharacters so that names such as "a.b" or "c++" are
  // matched literally instead of being interpreted as a pattern.
  const tagNamesPattern = tagNames
    .map((name) => name.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))
    .join("|");
  const pattern = new RegExp(`<(${tagNamesPattern})>|</(${tagNamesPattern})>`, "g");

  // Find all tags (opening and closing) and store them with their positions
  const matches = [];
  let match;
  // biome-ignore lint/suspicious/noAssignInExpressions: regex exec requires assignment in loop
  while ((match = pattern.exec(llmResponse)) !== null) {
    const isOpening = match[1] !== undefined;
    const tagName = isOpening ? match[1] : match[2];
    matches.push({ isOpening, tagName, position: match.index });
  }

  // Pair each opening tag with the tag that immediately follows it
  for (let i = 0; i < matches.length; ++i) {
    const current = matches[i];
    if (!current.isOpening) {
      // Closing tag without an opening one: ignored.
      continue;
    }
    const next = matches[i + 1];
    if (next === undefined) {
      // The opening tag is the last tag of the response.
      reportError(`Unclosed tag <${current.tagName}>`);
      continue;
    }
    if (next.isOpening) {
      // Another tag opens before this one is closed: skip this opening tag
      // and let the loop handle the next one normally.
      reportError(`Missing closing tag for <${current.tagName}>`);
      continue;
    }
    if (next.tagName !== current.tagName) {
      // When tolerated, the text up to the foreign closing tag is still
      // returned under the name of the opening tag.
      reportError(`Mismatched tags: opening <${current.tagName}>, closing </${next.tagName}>`);
    }
    const contentStart = current.position + `<${current.tagName}>`.length;
    const content = llmResponse.substring(contentStart, next.position).trim();
    tagList.push({
      tagName: current.tagName,
      content,
      startPosition: current.position,
    });
    // The closing tag has been consumed.
    ++i;
  }
  return tagList;
}