llm-code-format
Version:
Parsing and serialization of multiple code files in Markdown for LLMs
93 lines (92 loc) • 3.59 kB
JavaScript
/**
 * Extracts named code files from a Markdown string in which each file is
 * introduced by a "heading" line followed by a fenced code block.
 *
 * Formats are tried in a fixed order and the first format that yields at
 * least one file wins; its matches are returned and later formats are not
 * consulted. When the same file name occurs more than once, the last
 * occurrence wins. File names and code are whitespace-trimmed.
 *
 * The opening fence line may carry any backtick-free info string (`js`,
 * `c++`, `jsx title="x"`, ...) and may end in `\r\n`, so documents with
 * Windows line endings parse as well. Line endings inside the captured code
 * are left untouched.
 *
 * @param {string} markdownString - Markdown text to scan.
 * @param {string} [format] - Optional format key restricting parsing to one
 *   format: "backtick-heading", "file-bold", "numbered-backtick",
 *   "heading-bold", "standard-heading", "colon", "bold", "hash" or
 *   "numbered-bold". When omitted (or falsy) the format is auto-detected.
 * @returns {{ files: Object<string, string>, format: string }} The parsed
 *   files keyed by name, plus the human-readable name of the format that
 *   matched ("Unknown Format" when nothing matched).
 * @throws {Error} If `format` is given but is not one of the supported keys.
 */
export function parseMarkdownFiles(markdownString, format) {
  // Every pattern ends with the same fenced-block tail:
  //   ```<info string>\n<code>```
  // The info string is "[^\n`]*" rather than "\w*" so that tags such as
  // "c++" and a trailing "\r" (CRLF input) do not prevent a match.
  //
  // Order matters: it is the auto-detection priority. In particular
  // "heading-bold" must precede "standard-heading", which would otherwise
  // capture the surrounding asterisks as part of the file name.
  //
  // The regex literals are evaluated on every call, so each /g regex starts
  // with lastIndex === 0 and no state leaks between calls.
  const formatTable = [
    {
      // ### `file.js`
      regex: /^\s*###\s*`([^`]+)`\s*\n```[^\n`]*\n([\s\S]*?)```/gm,
      format: "Backtick-Heading Format",
      key: "backtick-heading",
    },
    {
      // ### File: **file.js**
      regex: /^\s*###\s*File:\s*\*\*(.+?)\*\*\s*\n```[^\n`]*\n([\s\S]*?)```/gm,
      format: "File Bold Format",
      key: "file-bold",
    },
    {
      // ### 1. `file.js`  (optionally followed by descriptive lines)
      regex: /^\s*###\s*\d+\.\s*`([^`]+)`(?:[^\n]*\n)*?\s*```[^\n`]*\n([\s\S]*?)```/gm,
      format: "Numbered Backtick Format",
      key: "numbered-backtick",
    },
    {
      // ### **file.js**
      regex: /^### \*\*([^\n`]+?)\*\*\s*\n```[^\n`]*\n([\s\S]*?)```/gm,
      format: "Heading Bold Format",
      key: "heading-bold",
    },
    {
      // ### file.js
      regex: /^\s*###\s*(?!`|File:|\d+\.)\s*([^\n`]+?)\s*\n```[^\n`]*\n([\s\S]*?)```/gm,
      format: "Standard Heading Format",
      key: "standard-heading",
    },
    {
      // file.js:
      regex: /^\s*(?!###|\*\*|`)([^\n#*`]+?):\s*\n```[^\n`]*\n([\s\S]*?)```/gm,
      format: "Colon Format",
      key: "colon",
    },
    {
      // **file.js** optional trailing text
      regex: /^\s*(?!###)\*\*([^\n*`]+?)\*\*(?:[^\n]*)\s*\n```[^\n`]*\n([\s\S]*?)```/gm,
      format: "Bold Format",
      key: "bold",
    },
    {
      // # file.js
      regex: /^\s*# ([^\n`]+?)\s*\n```[^\n`]*\n([\s\S]*?)```/gm,
      format: "Hash Format",
      key: "hash",
    },
    {
      // 1. **file.js** optional trailing text / lines
      regex: /^\s*\d+\.\s*\*\*([^\n`]+?)\*\*[\s\S]*?\n```[^\n`]*\n([\s\S]*?)```/gm,
      format: "Numbered Bold Format",
      key: "numbered-bold",
    },
  ];

  // Restrict to the requested format, or try all of them in priority order.
  let candidates = formatTable;
  if (format) {
    const requested = formatTable.find((entry) => entry.key === format);
    if (!requested) {
      throw new Error(`Unsupported format: ${format}`);
    }
    candidates = [requested];
  }

  for (const { regex, format: label } of candidates) {
    const found = {};
    let match;
    while ((match = regex.exec(markdownString)) !== null) {
      let name = match[1].trim();
      if (label === "Bold Format") {
        // Bold names may carry an annotation such as "file.js (updated)";
        // drop the parenthetical and anything after it.
        name = name.replace(/\s*\([^)]*\).*$/, "").trim();
      }
      found[name] = match[2].trim();
    }

    // The first format that produces any file decides the result.
    if (Object.keys(found).length > 0) {
      return { files: found, format: label };
    }
  }

  return { files: {}, format: "Unknown Format" };
}