// genaiscript
// A CLI for GenAIScript, a generative AI scripting framework.
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
import replaceExt from "replace-ext";
import { readFile, writeFile } from "node:fs/promises";
import { CONSOLE_TOKEN_COLORS, MD_REGEX, PROMPTY_REGEX, CSVStringify, DOCXTryParse, HTMLToMarkdown, HTMLToText, INIStringify, JSON5Stringify, JSONLTryParse, YAMLStringify, chunkMarkdown, dataToMarkdownTable, dataTryParse, ellipse, estimateTokens, expandFiles, extractFenced, genaiscriptDebug, isJSONLFilename, jinjaRender, logVerbose, normalizeInt, parsePdf, prettyBytes, promptyParse, promptyToGenAIScript, readText, redactSecrets, resolveFileContent, resolveTokenEncoder, splitMarkdown, stdout, terminalSize, tryReadText, wrapColor, writeText, } from "@genaiscript/core";
import { basename, join } from "node:path";
import { parseOptionsVars } from "@genaiscript/core";
const dbg = genaiscriptDebug("cli:parse");
/**
* This module provides various parsing utilities for different file types such
* as PDF, DOCX, HTML, JSONL, and more. It includes functions to extract and
* convert data, estimate tokens, and transform file formats.
*/
/**
 * Finds all fenced code blocks tagged with a given language inside a file
 * and prints their bodies to stdout, separated by blank lines.
 * @param language - Fence language tag to keep (e.g. "json", "python").
 * @param file - Path of the file whose content is resolved and scanned.
 */
export async function parseFence(language, file) {
    const resolved = await resolveFileContent({ filename: file });
    const matching = extractFenced(resolved.content || "").filter(
        (block) => block.language === language
    );
    const rendered = matching.map((block) => block.content).join("\n\n");
    console.log(rendered);
}
/**
 * Extracts the text (and optionally page images) of a PDF file.
 *
 * Without an output directory the extracted text is printed to stdout.
 * With `out` set, the text is written to `<out>/<basename>.txt` and each
 * rendered page image to `<out>/<basename>.page<index>.png`, logging each
 * write as it happens.
 * @param file - Path of the PDF file to parse.
 * @param options - Parsing options.
 * @param options.images - When true, render pages as images as well.
 * @param options.out - Target directory for the generated files.
 */
export async function parsePDF(file, options) {
    const { images, out } = options;
    const { content, pages } = await parsePdf(file, { renderAsImage: images });
    if (!out) {
        // No output directory: dump the extracted text to the console.
        console.log(content || "");
        return;
    }
    const base = basename(file);
    const textTarget = join(out, base + ".txt");
    console.log(`writing ${textTarget}`);
    await writeText(textTarget, content || "");
    for (const page of pages) {
        if (!page.image) continue;
        const imageTarget = join(out, base + ".page" + page.index + ".png");
        console.log(`writing ${imageTarget}`);
        await writeFile(imageTarget, page.image);
    }
}
/**
 * Extracts the text of a DOCX file via DOCXTryParse and prints it.
 * On a parse failure, the error is reported to stderr instead.
 * @param file - Path of the DOCX file to parse.
 * @param options - Options forwarded to DOCXTryParse.
 */
export async function parseDOCX(file, options) {
    const result = await DOCXTryParse(file, options);
    if (result.error) {
        console.error(result.error);
    } else {
        console.log(result.file.content);
    }
}
/**
 * Converts an HTML file or URL to markdown (default) or plain text.
 * The result is written to `out` when given, otherwise printed to stdout.
 * @param fileOrUrl - HTML file path or URL to convert.
 * @param options - Optional settings: `format` ("markdown" | "text") and
 *   `out` (output file path).
 */
export async function parseHTMLToText(fileOrUrl, options) {
    const { format = "markdown", out } = options || {};
    const file = { filename: fileOrUrl };
    await resolveFileContent(file);
    const text =
        format === "markdown"
            ? await HTMLToMarkdown(file.content)
            : await HTMLToText(file.content);
    if (out) {
        logVerbose(`writing ${out}`);
        await writeText(out, text);
    } else {
        console.log(text);
    }
}
/**
 * Renders a Jinja2 template file with user-supplied variables and prints
 * the result.
 *
 * Prompty files are reduced to their content section first; markdown files
 * have their frontmatter stripped. Variables come from `options.vars`
 * ("key=value" pairs) merged with the process environment; values that
 * parse as numbers (via parseFloat) are coerced to numbers before
 * rendering.
 * @param file - Path of the template file.
 * @param options - Object with a `vars` array of "key=value" strings.
 */
export async function parseJinja2(file, options) {
    let template = await readFile(file, { encoding: "utf-8" });
    if (PROMPTY_REGEX.test(file)) {
        template = promptyParse(file, template).content;
    } else if (MD_REGEX.test(file)) {
        template = splitMarkdown(template).content;
    }
    const vars = parseOptionsVars(options.vars, process.env);
    // Coerce numeric-looking values so templates can do arithmetic on them.
    for (const key of Object.keys(vars)) {
        const numeric = parseFloat(vars[key]);
        if (!isNaN(numeric)) vars[key] = numeric;
    }
    console.log(jinjaRender(template, vars));
}
/**
 * Reads a structured data file and prints it converted to the requested
 * format.
 *
 * Supported `options.format` values (case-insensitive): "yaml", "ini",
 * "csv" (with header row), "md"/"markdown" (table), "json5"; anything
 * else falls back to pretty-printed JSON.
 * @param file - Path of the data file to read and parse.
 * @param options - Optional `{ format }` selecting the output encoding.
 * @throws Error when the input data format cannot be determined.
 */
export async function parseAnyToJSON(file, options) {
    const data = await dataTryParse({ filename: file });
    if (!data) throw new Error(`Unknown data format for ${file}`);
    const format = options?.format?.toLowerCase() || "";
    let serialized;
    if (format === "yaml") serialized = YAMLStringify(data);
    else if (format === "ini") serialized = INIStringify(data);
    else if (format === "csv") serialized = CSVStringify(data, { header: true });
    else if (format === "md" || format === "markdown")
        serialized = dataToMarkdownTable(data);
    else if (format === "json5") serialized = JSON5Stringify(data, null, 2);
    else serialized = JSON.stringify(data, null, 2);
    console.log(serialized);
}
/**
 * Converts JSONL files to pretty-printed JSON siblings with a ".json"
 * extension.
 *
 * Expands the given files/globs (ignoring .gitignore), skips anything that
 * is not a JSONL file (logging the skip), and logs each conversion.
 * @param files - File paths or glob patterns to process.
 */
export async function jsonl2json(files) {
    const resolved = await expandFiles(files, { applyGitIgnore: false });
    for (const file of resolved) {
        if (!isJSONLFilename(file)) {
            console.log(`skipping ${file}`);
            continue;
        }
        const text = await tryReadText(file);
        // `repair: true` tolerates slightly malformed JSONL lines.
        const records = await JSONLTryParse(text, { repair: true });
        const target = replaceExt(file, ".json");
        await writeText(target, JSON.stringify(records, null, 2));
        console.log(`${file} -> ${target}`);
    }
}
/**
 * Estimates the token count of each matched file and prints the results.
 * Each file is logged as "<file>, <tokens>" as it is processed, and the
 * full summary is printed again at the end.
 * @param filesGlobs - File paths or glob patterns to process.
 * @param options - Expansion options plus `model` (name used to pick the
 *   token encoder); may also carry `excludedFiles` / `ignoreGitIgnore`.
 */
export async function parseTokens(filesGlobs, options) {
    const { model } = options || {};
    const { encode: encoder } = await resolveTokenEncoder(model);
    const files = await expandFiles(filesGlobs, options);
    console.log(`parsing ${files.length} files`);
    const summary = [];
    for (const file of files) {
        const content = await readText(file);
        if (!content) continue;
        const tokens = estimateTokens(content, encoder);
        console.log(`${file}, ${tokens}`);
        summary.push(`${file}, ${tokens}\n`);
    }
    // Re-emit the aggregated "<file>, <tokens>" lines as a single report.
    console.log(summary.join(""));
}
/**
 * Tokenizes a file's content with the encoder resolved for `model` and
 * writes each token to stdout as "(text, xHEX)", colorized with the
 * alternating console token colors.
 *
 * @param file - Path to the file to tokenize.
 * @param options - Object with `model`, the name used to resolve the
 *   token encoder/decoder.
 */
export async function parseTokenize(file, options) {
    const text = await readText(file);
    dbg(`text: %s`, text);
    const { model } = options || {};
    const { model: tokenModel, encode: encoder, decode: decoder } = await resolveTokenEncoder(model);
    // Route through the module debug channel (dbg) like the rest of this
    // file, instead of unconditionally printing with console.debug.
    dbg(`model: %s`, tokenModel);
    const tokens = encoder(text);
    for (const token of tokens) {
        stdout.write(`(${wrapColor(CONSOLE_TOKEN_COLORS[0], decoder([token]))}, x${wrapColor(CONSOLE_TOKEN_COLORS[1], token.toString(16))})`);
    }
}
/**
 * Converts "prompty" files into GenAI script files (".genai.mts").
 *
 * Each converted file is written next to its source (extension replaced)
 * or, when `options.out` is set, into that directory using the source's
 * basename. Every conversion is logged as "<src> -> <dst>".
 * @param files - File paths or glob patterns to process.
 * @param options - Object with `out`, the optional output directory.
 */
export async function prompty2genaiscript(files, options) {
    const { out } = options;
    for (const source of await expandFiles(files)) {
        const target = out
            ? join(out, replaceExt(basename(source), ".genai.mts"))
            : replaceExt(source, ".genai.mts");
        console.log(`${source} -> ${target}`);
        const text = await readText(source);
        const parsed = promptyParse(source, text);
        await writeText(target, promptyToGenAIScript(parsed));
    }
}
/**
 * Scans files for secrets using the secret-redaction detector.
 *
 * For each file where secrets are found, prints the file name and a
 * "kind (count)" summary of every secret type detected; finishes with a
 * warning giving the number of affected files, if any.
 * @param files - File paths or glob patterns to scan.
 */
export async function parseSecrets(files) {
    const resolved = await expandFiles(files);
    let flagged = 0;
    for (const file of resolved) {
        const content = await readText(file);
        const { found } = redactSecrets(content);
        const entries = Object.entries(found);
        if (entries.length === 0) continue;
        flagged++;
        const summary = entries
            .map(([kind, count]) => `${kind} (${count})`)
            .join(", ");
        console.log(`${file}: ${summary}`);
    }
    if (flagged > 0)
        console.warn(`found secrets in ${flagged} of ${resolved.length} files`);
}
/**
 * Splits a markdown file into token-bounded chunks and logs a one-line
 * preview of each chunk, truncated to the terminal width.
 *
 * @param filename - The markdown file to parse.
 * @param options - Parsing options:
 *   - model: model name used to resolve the token encoder.
 *   - maxTokens: maximum tokens per chunk (defaults to 1024).
 *   - disableFallback: disable encoder fallback resolution.
 */
export async function parseMarkdown(filename, options) {
    const maxTokens = normalizeInt(options.maxTokens) || 1024;
    const file = { filename };
    await resolveFileContent(file);
    if (file.size)
        console.debug(`file: ${prettyBytes(file.size)}`);
    const encoding = await resolveTokenEncoder(options?.model, {
        // Honor the documented option instead of hardcoding `false`;
        // callers not passing it keep the previous behavior.
        disableFallback: options?.disableFallback ?? false,
    });
    const res = await chunkMarkdown(file, (text) => encoding.encode(text).length, {
        maxTokens,
    });
    const cols = terminalSize().columns;
    for (const { content, filename, lineStart, lineEnd } of res) {
        const prefix = `${basename(filename)} (${lineStart}-${lineEnd}): `;
        console.log(`${prefix}${ellipse(content.replace(/\n/g, " "), cols - prefix.length)}`);
    }
}
//# sourceMappingURL=parse.js.map