llm-code-format
Version:
Parsing and serialization of multiple code files in Markdown for LLMs
99 lines (98 loc) • 3.83 kB
JavaScript
"use strict";
// Define an `__esModule` property with value `true` on the CommonJS exports object.
// NOTE(review): this looks like the usual transpiler interop marker — confirm against the build setup.
Object.defineProperty(exports, "__esModule", { value: true });
// Create the export slot up front with the value `undefined`; it is assigned
// the class itself after the class definition further down in this file.
exports.StreamingMarkdownParser = void 0;
/**
 * Line-oriented parser for Markdown that arrives in arbitrary chunks.
 *
 * Text is buffered until complete lines are available. Each line is then
 * classified and reported through the callbacks object given to the
 * constructor:
 *   - onFileNameChange(fileName, format): a file header line was found
 *     outside a code fence.
 *   - onCodeLine(line): a line inside a code fence (fence markers excluded).
 *   - onNonCodeLine(line): any other line outside a code fence. This callback
 *     is only invoked when it is present on the callbacks object.
 */
class StreamingMarkdownParser {
  // Text received so far that has not yet been terminated by "\n".
  buffer = "";
  // True while the parser is between an opening and a closing fence marker.
  insideCodeFence = false;
  // File name from the most recent header match, or null before the first one.
  currentFileName = null;
  // Format label of the most recent header match.
  detectedFormat = "Unknown Format";
  // Callback object supplied to the constructor.
  callbacks;
  /**
   * An array of regex patterns for detecting file headers.
   * Each entry pairs a regex (capture group 1 is the file name) with the
   * format label reported for it. Only Bold Format is defined here; more
   * entries can be appended.
   */
  headerPatterns = [
    // Matches: **filename.js** (optionally followed by more text on the line)
    {
      regex: /^\s*\*\*([^\n*`]+?)\*\*(?:[^\n]*)\s*$/,
      format: "Bold Format",
    },
  ];

  /**
   * @param callbacks - Object holding onFileNameChange, onCodeLine and,
   *   optionally, onNonCodeLine.
   */
  constructor(callbacks) {
    this.callbacks = callbacks;
  }

  /**
   * Processes an incoming chunk from the stream.
   * The chunk is appended to the buffer, and every complete line (terminated
   * by '\n') is removed from the buffer and processed in order. A trailing
   * partial line stays buffered until more data arrives or flushRemaining()
   * is called.
   * @param chunk - A string chunk from the stream.
   */
  async processChunk(chunk) {
    this.buffer += chunk;
    let newlineIndex = this.buffer.indexOf("\n");
    while (newlineIndex !== -1) {
      const line = this.buffer.slice(0, newlineIndex);
      // Drop the line and its newline from the buffer before awaiting, so the
      // buffer never contains text that has already been handed out.
      this.buffer = this.buffer.slice(newlineIndex + 1);
      await this.processLine(line);
      newlineIndex = this.buffer.indexOf("\n");
    }
  }

  /**
   * Flushes any remaining content in the buffer as one final line.
   * Should be called once after the stream has ended.
   */
  async flushRemaining() {
    if (this.buffer.length === 0) {
      return;
    }
    await this.processLine(this.buffer);
    this.buffer = "";
  }

  /**
   * Processes a single line.
   * A line whose trimmed text starts with "```" toggles the code fence state
   * and is not reported. Inside a fence every line goes to onCodeLine.
   * Outside a fence a header line updates currentFileName/detectedFormat and
   * goes to onFileNameChange; every other line goes to onNonCodeLine when
   * that callback is present.
   * @param line - A single line of text, without its trailing newline.
   */
  async processLine(line) {
    // Fence marker: either "```" or "```lang".
    if (line.trim().startsWith("```")) {
      this.insideCodeFence = !this.insideCodeFence;
      return; // The fence marker itself is not emitted as code content.
    }
    if (this.insideCodeFence) {
      await this.callbacks.onCodeLine(line);
      return;
    }
    const header = this.matchFileHeader(line);
    if (header !== null) {
      this.currentFileName = header.fileName;
      this.detectedFormat = header.format;
      await this.callbacks.onFileNameChange(header.fileName, header.format);
      return;
    }
    if (this.callbacks.onNonCodeLine) {
      await this.callbacks.onNonCodeLine(line);
    }
  }

  /**
   * Tries each header pattern once against a line.
   * @param line - A line that is outside any code fence.
   * @returns {{ fileName: string, format: string } | null} The first match
   *   that yields a non-empty file name, or null when the line is not a
   *   file header.
   */
  matchFileHeader(line) {
    for (const { regex, format } of this.headerPatterns) {
      const match = regex.exec(line);
      if (match === null) {
        continue;
      }
      let fileName = match[1].trim();
      if (format === "Bold Format") {
        // Drop a parenthesised annotation and everything after it,
        // e.g. "app.js (new file)" becomes "app.js".
        fileName = fileName.replace(/\s*\([^)]*\).*$/, "").trim();
      }
      if (fileName.length === 0) {
        // Nothing usable is left (e.g. "**(draft)**"), so this is not a
        // file header; an empty name must never be reported.
        continue;
      }
      return { fileName, format };
    }
    return null;
  }
}
// Assign the class to the `StreamingMarkdownParser` property of the CommonJS exports object.
exports.StreamingMarkdownParser = StreamingMarkdownParser;