UNPKG

@paroicms/site-generator-plugin

Version:

ParoiCMS Site Generator Plugin

gitlab.com/paroi/opensource/paroicms

207 lines (206 loc) • 9.15 kB

JavaScript

/**
 * Report a parsing problem.
 *
 * When `tolerateErrors` is provided, the message is appended to its
 * `errorMessages` array and the caller carries on; otherwise an `Error` is thrown.
 *
 * @param {string} message - Description of the problem.
 * @param {{ errorMessages: string[] } | undefined} tolerateErrors - Optional error collector.
 * @throws {Error} When no collector is provided.
 */
function reportParseError(message, tolerateErrors) {
  if (!tolerateErrors) throw new Error(message);
  tolerateErrors.errorMessages.push(message);
}

/**
 * Parse an LLM response in which each expected tag appears at most once and
 * build a single object from it.
 *
 * Each entry of `outputTags` is read for `tagName` (the tag to look for),
 * `key` (the property name in the result), `format` ("json", "markdown" or
 * "text") and `optional`.
 *
 * @param {string} llmResponse - Raw text returned by the LLM.
 * @param {Array<{ tagName: string, key: string, format: string, optional?: boolean }>} outputTags
 * @returns {Object} One property per tag found, keyed by `outputTag.key`.
 *   Optional tags that are absent from the response are simply left out.
 * @throws {Error} When a required tag is missing, when a tag appears more than
 *   once, or when the tags are malformed (see `parseLlmRawTags`).
 */
export function parseLlmResponseAsProperties(llmResponse, outputTags) {
  const rawTags = parseLlmRawTags(
    llmResponse,
    outputTags.map((tag) => tag.tagName),
  );

  const rawTagByName = new Map();
  const duplicateNames = new Set();
  for (const rawTag of rawTags) {
    if (rawTagByName.has(rawTag.tagName)) duplicateNames.add(rawTag.tagName);
    rawTagByName.set(rawTag.tagName, rawTag);
  }

  // Only required tags count as missing: an absent optional tag is legitimate.
  const missingTags = outputTags.filter((tag) => !tag.optional && !rawTagByName.has(tag.tagName));
  if (missingTags.length > 0) {
    throw new Error(`Missing tags: ${missingTags.map((tag) => tag.tagName).join(", ")}`);
  }
  if (duplicateNames.size > 0) {
    throw new Error(`Duplicate tags: ${[...duplicateNames].join(", ")}`);
  }

  const resultObj = {};
  for (const outputTag of outputTags) {
    const raw = rawTagByName.get(outputTag.tagName);
    if (!raw) continue; // absent optional tag
    resultObj[outputTag.key] = formatRawContent(raw.content, outputTag);
  }
  return resultObj;
}

/**
 * Parse an LLM response containing a repeated group of tags and build one
 * object per group.
 *
 * A new item starts as soon as a tag is seen whose key is already set on the
 * item being built.
 *
 * @param {string} llmResponse - Raw text returned by the LLM.
 * @param {Array<{ tagName: string, key: string, format: string, optional?: boolean }>} outputTags
 * @param {Object} [options]
 * @param {{ errorMessages: string[] }} [options.tolerateErrors] - When provided,
 *   problems are pushed to `errorMessages` and the faulty item is dropped
 *   instead of throwing.
 * @param {string} [options.titleFromMarkdownHeading] - Key of the output tag
 *   that may be recovered from a Markdown heading when its tag is missing.
 * @returns {Object[]} The completed items, in order of appearance.
 * @throws {Error} On malformed, empty or missing required tags when
 *   `tolerateErrors` is not provided.
 */
export function parseLlmResponseAsList(llmResponse, outputTags, options = {}) {
  const { tolerateErrors, titleFromMarkdownHeading } = options;
  const rawTags = parseLlmRawTags(
    llmResponse,
    outputTags.map((tag) => tag.tagName),
    options,
  );
  if (rawTags.length === 0) return [];

  const outputTagMap = new Map(outputTags.map((tag) => [tag.tagName, tag]));
  const titleOutputTag = titleFromMarkdownHeading
    ? outputTags.find((tag) => tag.key === titleFromMarkdownHeading)
    : undefined;

  const result = [];
  let current = {};
  let currentItemStartPosition;
  // Where the text searched for a fallback Markdown title begins.
  let gapStartPosition = 0;

  for (const rawTag of rawTags) {
    const outputTag = outputTagMap.get(rawTag.tagName);
    // parseLlmRawTags only matches the names it was given, so this is a programming error.
    if (!outputTag) throw new Error(`Unexpected output tag "${rawTag.tagName}"`);

    // The first tag seen for an item marks where that item starts.
    if (currentItemStartPosition === undefined) {
      currentItemStartPosition = rawTag.startPosition;
    }

    if (outputTag.key in current) {
      // A repeated key means a new item begins: try to complete the previous one.
      const completedItem = tryCompleteItem(
        current,
        outputTags,
        titleOutputTag,
        llmResponse,
        gapStartPosition,
        currentItemStartPosition,
        options,
      );
      if (completedItem) result.push(completedItem);

      // The text searched for the next item's heading runs from the first tag
      // of the item just completed up to the first tag of the next item, so it
      // also spans the completed item's own tags.
      gapStartPosition = currentItemStartPosition;
      currentItemStartPosition = rawTag.startPosition;
      current = {};
    }

    if (rawTag.content.trim() === "") {
      if (!outputTag.optional) {
        reportParseError(`Empty tag <${outputTag.tagName}>`, tolerateErrors);
        // Tolerated: discard what was collected so far for this item.
        current = {};
        currentItemStartPosition = undefined;
      }
      continue;
    }

    current[outputTag.key] = formatRawContent(rawTag.content, outputTag);
  }

  // Handle the last item.
  if (Object.keys(current).length > 0) {
    const completedItem = tryCompleteItem(
      current,
      outputTags,
      titleOutputTag,
      llmResponse,
      gapStartPosition,
      currentItemStartPosition ?? 0,
      options,
    );
    if (completedItem) result.push(completedItem);
  }
  return result;
}

/**
 * Try to complete an item by ensuring all required properties are present.
 * If the title is missing, attempt to extract it from the first Markdown
 * heading found in `llmResponse` between `gapStart` and `gapEnd`.
 *
 * @param {Object} obj - Item being built; mutated when a title is recovered.
 * @param {Array} outputTags - All expected tags.
 * @param {Object | undefined} titleOutputTag - Tag whose value may come from a heading.
 * @param {string} llmResponse - Full LLM response.
 * @param {number} gapStart - Start offset of the text searched for a heading.
 * @param {number} gapEnd - End offset (exclusive) of the text searched for a heading.
 * @param {Object} options - Only `tolerateErrors` is read.
 * @returns {Object | undefined} `obj` when complete; `undefined` when a
 *   required property is missing and errors are tolerated.
 * @throws {Error} When a required property is missing and errors are not tolerated.
 */
function tryCompleteItem(obj, outputTags, titleOutputTag, llmResponse, gapStart, gapEnd, options) {
  const { tolerateErrors } = options;

  if (titleOutputTag && !(titleOutputTag.key in obj)) {
    const extractedTitle = extractMarkdownTitle(llmResponse.substring(gapStart, gapEnd));
    if (extractedTitle) {
      obj[titleOutputTag.key] = extractedTitle;
    }
  }

  for (const tag of outputTags) {
    if (tag.optional || tag.key in obj) continue;
    reportParseError(`Missing tag <${tag.tagName}>`, tolerateErrors);
    return;
  }
  return obj;
}

/**
 * Convert the raw text content of a tag according to the tag's `format`.
 *
 * @param {string} rawContent - Text found between the opening and closing tag.
 * @param {{ format: string, optional?: boolean }} tag
 * @returns {*} The parsed JSON value for "json", the text itself for
 *   "markdown" and "text", or `undefined` for an empty optional tag.
 * @throws {SyntaxError} From `JSON.parse` when "json" content is invalid.
 * @throws {Error} When `format` is not one of the known formats.
 */
function formatRawContent(rawContent, tag) {
  const { format, optional } = tag;
  if (!rawContent && optional) return;
  switch (format) {
    case "json":
      return JSON.parse(rawContent);
    case "markdown":
    case "text":
      return rawContent;
    default:
      throw new Error(`Unknown format "${format}"`);
  }
}

/**
 * Extract the first Markdown heading from a text snippet.
 * Handles any heading depth (#, ##, ###, etc.), drops a leading "Page N:"
 * prefix and strips bold markers (**).
 *
 * @param {string} text - Text to search.
 * @returns {string | undefined} The heading text, or `undefined` if no
 *   heading with non-empty text is found.
 */
export function extractMarkdownTitle(text) {
  // The separator after the hashes must be horizontal whitespace only; a bare
  // `\s+` would also match line breaks and let an empty heading line swallow
  // the following line as its title.
  const headingMatch = text.match(/^#{1,6}[^\S\r\n]+(.+)$/m);
  if (!headingMatch) return undefined;
  let title = headingMatch[1].trim();

  // Handle patterns like "Page 1: **Actual Title**" - keep just the title part.
  const colonMatch = title.match(/^(?:Page\s+\d+\s*:\s*)?(.+)$/i);
  if (colonMatch) {
    title = colonMatch[1].trim();
  }

  // Strip wrapping bold markers, then any ** left over.
  title = title.replace(/^\*\*(.+)\*\*$/, "$1");
  title = title.replace(/\*\*/g, "").trim();
  return title || undefined;
}

/**
 * Find `<name>…</name>` pairs for the given tag names in an LLM response.
 *
 * Tags are not nested: an opening tag must be directly followed (among the
 * recognised tags) by a closing tag. Closing tags without a preceding opening
 * tag are ignored.
 *
 * @param {string} llmResponse - Raw text returned by the LLM.
 * @param {string[]} tagNames - Tag names to look for; they are inserted as-is
 *   into a regular expression.
 * @param {Object} [options]
 * @param {{ errorMessages: string[] }} [options.tolerateErrors] - When provided,
 *   problems are pushed to `errorMessages` instead of being thrown.
 * @returns {Array<{ tagName: string, content: string, startPosition: number }>}
 *   One entry per tag pair, with trimmed content and the offset of the opening tag.
 * @throws {Error} On unclosed or mismatched tags when `tolerateErrors` is not provided.
 */
export function parseLlmRawTags(llmResponse, tagNames, options = {}) {
  const { tolerateErrors } = options;
  // Without names the pattern below would degenerate to matching "<>" and "</>".
  if (tagNames.length === 0) return [];

  const tagNamesPattern = tagNames.join("|");
  const pattern = new RegExp(`<(${tagNamesPattern})>|</(${tagNamesPattern})>`, "g");

  // Collect every opening and closing tag with its position.
  const matches = [];
  for (const match of llmResponse.matchAll(pattern)) {
    const isOpening = match[1] !== undefined;
    matches.push({
      isOpening,
      tagName: isOpening ? match[1] : match[2],
      position: match.index,
    });
  }

  const tagList = [];
  for (let i = 0; i < matches.length; ++i) {
    const current = matches[i];
    if (!current.isOpening) continue;

    const next = matches[i + 1];
    if (!next) {
      reportParseError(`Unclosed tag <${current.tagName}>`, tolerateErrors);
      continue;
    }
    if (next.isOpening) {
      // Tolerated: this opening tag is skipped; the next one is handled on the next turn.
      reportParseError(`Missing closing tag for <${current.tagName}>`, tolerateErrors);
      continue;
    }
    if (next.tagName !== current.tagName) {
      // Tolerated: the content up to the mismatched closing tag is still kept.
      reportParseError(
        `Mismatched tags: opening <${current.tagName}>, closing </${next.tagName}>`,
        tolerateErrors,
      );
    }

    const contentStart = current.position + `<${current.tagName}>`.length;
    tagList.push({
      tagName: current.tagName,
      content: llmResponse.substring(contentStart, next.position).trim(),
      startPosition: current.position,
    });
    // Skip the closing tag that was just consumed.
    i += 1;
  }
  return tagList;
}