UNPKG

genaiscript

Version:

A CLI for GenAIScript, a generative AI scripting framework.

304 lines 13 kB
// Copyright (c) Microsoft Corporation. // Licensed under the MIT License. import replaceExt from "replace-ext"; import { readFile, writeFile } from "node:fs/promises"; import { CONSOLE_TOKEN_COLORS, MD_REGEX, PROMPTY_REGEX, CSVStringify, DOCXTryParse, HTMLToMarkdown, HTMLToText, INIStringify, JSON5Stringify, JSONLTryParse, YAMLStringify, chunkMarkdown, dataToMarkdownTable, dataTryParse, ellipse, estimateTokens, expandFiles, extractFenced, genaiscriptDebug, isJSONLFilename, jinjaRender, logVerbose, normalizeInt, parsePdf, prettyBytes, promptyParse, promptyToGenAIScript, readText, redactSecrets, resolveFileContent, resolveTokenEncoder, splitMarkdown, stdout, terminalSize, tryReadText, wrapColor, writeText, } from "@genaiscript/core"; import { basename, join } from "node:path"; import { parseOptionsVars } from "@genaiscript/core"; const dbg = genaiscriptDebug("cli:parse"); /** * This module provides various parsing utilities for different file types such * as PDF, DOCX, HTML, JSONL, and more. It includes functions to extract and * convert data, estimate tokens, and transform file formats. */ /** * Extracts and logs fenced code blocks of a specific language from a file. * Filters the fenced blocks by the specified language and logs their content. * @param language - The language to filter the fenced blocks by. * @param file - The file to parse and extract fenced code blocks from. */ export async function parseFence(language, file) { const res = await resolveFileContent({ filename: file }); const fences = extractFenced(res.content || "").filter((f) => f.language === language); // Logs the content of the filtered fenced blocks console.log(fences.map((f) => f.content).join("\n\n")); } /** * Parses the contents of a PDF file and outputs them in text format. * Optionally writes the content and page images to the specified output directory. * If an output directory is specified, the text content is saved as a .txt file, * and page images (if any) are saved as .png files. * Logs the writing process for each file. * If no output directory is specified, logs the text content to the console. * @param file - The PDF file to parse. * @param options - Options to include images and specify the output directory. * - images: Whether to include page images in the output. * - out: The output directory where files will be saved. */ export async function parsePDF(file, options) { const { images, out } = options; const { content, pages } = await parsePdf(file, { renderAsImage: images }); if (out) { const fn = basename(file); console.log(`writing ${join(out, fn + ".txt")}`); await writeText(join(out, fn + ".txt"), content || ""); for (const page of pages) { if (page.image) { const n = join(out, fn + ".page" + page.index + ".png"); console.log(`writing ${n}`); await writeFile(n, page.image); } } } else { console.log(content || ""); } } /** * Parses the contents of a DOCX file and logs the extracted text. * If an error occurs during parsing, it logs the error. * Uses DOCXTryParse to extract text from the DOCX file. * @param file - The path to the DOCX file to parse. * @param options - Options for parsing the DOCX file. */ export async function parseDOCX(file, options) { // Uses DOCXTryParse to extract text from the DOCX file const res = await DOCXTryParse(file, options); if (res.error) console.error(res.error); else console.log(res.file.content); } /** * Converts HTML content to text and logs it or writes it to a file. * @param fileOrUrl - The HTML file or URL to convert. * @param options - Options to specify the output format ("markdown" or "text") and the output file path. */ export async function parseHTMLToText(fileOrUrl, options) { const { format = "markdown", out } = options || {}; const file = { filename: fileOrUrl }; await resolveFileContent(file); // Converts HTML to plain text let text; if (format === "markdown") text = await HTMLToMarkdown(file.content); else text = await HTMLToText(file.content); if (out) { logVerbose(`writing ${out}`); await writeText(out, text); } else console.log(text); } /** * Parses a Jinja2 file, substitutes variables, and logs the rendered output. * * @param file - The path to the Jinja2 template file to parse. * @param options - An object containing the following properties: * - vars: An array of key-value pairs in the format "key=value" to replace variables in the template. * * The function reads the template file, processes it based on its type (Prompty or Markdown), * substitutes the provided variables, and renders the output. Variable values are converted * to numbers if possible. Environment variables are also considered during substitution. */ export async function parseJinja2(file, options) { let src = await readFile(file, { encoding: "utf-8" }); if (PROMPTY_REGEX.test(file)) src = promptyParse(file, src).content; else if (MD_REGEX.test(file)) src = splitMarkdown(src).content; const vars = parseOptionsVars(options.vars, process.env); for (const k in vars) { const i = parseFloat(vars[k]); if (!isNaN(i)) vars[k] = i; } const res = jinjaRender(src, vars); console.log(res); } /** * Parses the input file and converts its data into a specified format. * * @param file - Path to the file to be read and parsed. * @param options - Configuration options for the output format. * @param options.format - The target format for the output. Supported formats include: * - "yaml": Converts data to YAML format. * - "ini": Converts data to INI format. * - "csv": Converts data into a CSV format with a header row. * - "md" or "markdown": Converts data into a Markdown table. * - "json5": Converts data into JSON5 format. * - Default: Outputs data as a prettified JSON string. * * Logs the converted data to the console. * Throws an error if the data format cannot be determined. */ export async function parseAnyToJSON(file, options) { const data = await dataTryParse({ filename: file }); if (!data) throw new Error(`Unknown data format for ${file}`); let out; switch (options?.format?.toLowerCase() || "") { case "yaml": out = YAMLStringify(data); break; case "ini": out = INIStringify(data); break; case "csv": out = CSVStringify(data, { header: true }); break; case "md": case "markdown": out = dataToMarkdownTable(data); break; case "json5": out = JSON5Stringify(data, null, 2); break; default: out = JSON.stringify(data, null, 2); break; } console.log(out); } /** * Converts JSONL files to JSON files. * Processes an array of files or glob patterns, skipping non-JSONL files, * and writes the converted JSON content to new files with a ".json" extension. * Logs the conversion process for each file. * @param files - An array of files or glob patterns to process. */ export async function jsonl2json(files) { for (const file of await expandFiles(files, { applyGitIgnore: false })) { if (!isJSONLFilename(file)) { // Skips files that are not JSONL console.log(`skipping ${file}`); continue; } const content = await tryReadText(file); const objs = await JSONLTryParse(content, { repair: true }); const out = replaceExt(file, ".json"); await writeText(out, JSON.stringify(objs, null, 2)); console.log(`${file} -> ${out}`); } } /** * Estimates the number of tokens in the content of files and logs the results. * @param filesGlobs - An array of files or glob patterns to process. * @param options - Options for processing files. * - excludedFiles - A list of files to exclude from processing. * - model - The name of the model used for token encoding. * - ignoreGitIgnore - Whether to ignore .gitignore rules when expanding files. */ export async function parseTokens(filesGlobs, options) { const { model } = options || {}; const { encode: encoder } = await resolveTokenEncoder(model); const files = await expandFiles(filesGlobs, options); console.log(`parsing ${files.length} files`); let text = ""; for (const file of files) { const content = await readText(file); if (content) { const tokens = estimateTokens(content, encoder); console.log(`${file}, ${tokens}`); text += `${file}, ${tokens}\n`; } } // Logs the aggregated text with file names and token estimates console.log(text); } /** * Tokenizes the content of a specified file using a provided model and logs the tokens. * * @param file - Path to the file to tokenize. * @param options - Object containing the following properties: * - model - The name of the model used for token encoding. * * The function reads the content of the file, tokenizes it using the given model, * and logs each token along with its hexadecimal representation. * Debug information about the process is also logged. */ export async function parseTokenize(file, options) { const text = await readText(file); dbg(`text: %s`, text); const { model } = options || {}; const { model: tokenModel, encode: encoder, decode: decoder } = await resolveTokenEncoder(model); console.debug(`model: %s`, tokenModel); const tokens = encoder(text); for (const token of tokens) { stdout.write(`(${wrapColor(CONSOLE_TOKEN_COLORS[0], decoder([token]))}, x${wrapColor(CONSOLE_TOKEN_COLORS[1], token.toString(16))})`); } } /** * Converts "prompty" format files to GenAI script files. * * @param files - An array of file paths to process. * @param options - An object containing the following properties: * - out: The output directory where the converted files will be written. * * Logs the conversion process and writes the output files to the specified directory or replaces the extension in place if no directory is provided. */ export async function prompty2genaiscript(files, options) { const { out } = options; const fs = await expandFiles(files); for (const f of fs) { const gf = out ? join(out, replaceExt(basename(f), ".genai.mts")) : replaceExt(f, ".genai.mts"); console.log(`${f} -> ${gf}`); const content = await readText(f); const doc = promptyParse(f, content); const script = promptyToGenAIScript(doc); await writeText(gf, script); } } /** * Scans a list of files for sensitive information or secrets. * Logs each file containing secrets and the types of secrets found. * Issues a warning if secrets are found in any files. * * @param files - A list of file paths or glob patterns to scan. * Logs the file name and the types of secrets found in each file. * Warns if secrets are found in any of the scanned files. */ export async function parseSecrets(files) { const fs = await expandFiles(files); let n = 0; for (const f of fs) { const content = await readText(f); const { found } = redactSecrets(content); const entries = Object.entries(found); if (entries.length) { n++; console.log(`${f}: ${entries.map(([k, v]) => `${k} (${v})`).join(", ")}`); } } if (n > 0) console.warn(`found secrets in ${n} of ${fs.length} files`); } /** * Parses a markdown file, breaks it into chunks based on token limits, and logs a preview of each chunk. * * @param filename - The name of the markdown file to parse. * @param options - Object containing parsing options. * - model - The model name used for token encoding. * - maxTokens - The maximum number of tokens allowed per chunk. * - disableFallback - Whether to disable fallback for token encoding. */ export async function parseMarkdown(filename, options) { const maxTokens = normalizeInt(options.maxTokens) || 1024; const file = { filename }; await resolveFileContent(file); if (file.size) console.debug(`file: ${prettyBytes(file.size)}`); const encoding = await resolveTokenEncoder(options?.model, { disableFallback: false, }); const res = await chunkMarkdown(file, (text) => encoding.encode(text).length, { maxTokens, }); const cols = terminalSize().columns; for (const { content, filename, lineStart, lineEnd } of res) { const prefix = `${basename(filename)} (${lineStart}-${lineEnd}): `; console.log(`${prefix}${ellipse(content.replace(/\n/g, " "), cols - prefix.length)}`); } } //# sourceMappingURL=parse.js.map