UNPKG

@harutakax/html-rag-optimizer

Version:

HTML optimization tool for RAG (Retrieval-Augmented Generation) systems

176 lines (171 loc) 7.34 kB
//#region dependencies
// Every member this file reads (fs.promises, path.*, commander's Command) is an
// own property of the required CommonJS module, so plain require() calls are
// used instead of the bundler's ESM-interop wrappers. `commander` is required
// lazily inside runCli() so library consumers never load the CLI framework.
const node_fs = require("node:fs");
const node_path = require("node:path");
//#endregion

//#region src/optimizer.ts
/** Defaults merged underneath the caller-supplied options in optimizeHtml(). */
const DEFAULT_OPTIONS = Object.freeze({
  keepAttributes: false,
  removeEmpty: true,
  preserveWhitespace: false,
  excludeTags: [],
  removeComments: true,
  minifyText: true,
});

/**
 * Escape regular-expression metacharacters so `text` matches literally.
 * @param {string} text
 * @returns {string}
 */
function escapeRegExp(text) {
  return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Turn a caller-supplied `excludeTags` value into a clean list of tag names.
 * Non-arrays become an empty list; non-string, empty and whitespace-only
 * entries are dropped. An empty entry would otherwise add an empty branch to
 * the exclusion regex, which makes the lookahead reject every tag.
 * @param {unknown} excludeTags
 * @returns {string[]}
 */
function normalizeExcludeTags(excludeTags) {
  if (!Array.isArray(excludeTags)) return [];
  return excludeTags
    .filter((tag) => typeof tag === "string")
    .map((tag) => tag.trim())
    .filter((tag) => tag !== "");
}

/**
 * Strip an HTML string down to its structural tags and text.
 *
 * @param {string} html - Markup to optimize.
 * @param {object} [options] - Overrides for DEFAULT_OPTIONS (keepAttributes,
 *   removeEmpty, preserveWhitespace, excludeTags, removeComments, minifyText).
 * @returns {string} The optimized markup; "" when `html` is blank.
 * @throws {TypeError} When `html` is not a string.
 */
function optimizeHtml(html, options = {}) {
  if (typeof html !== "string") {
    throw new TypeError(`optimizeHtml expects a string, received ${html === null ? "null" : typeof html}`);
  }
  const opts = { ...DEFAULT_OPTIONS, ...options };
  if (!html.trim()) return "";
  return optimizeWithRegex(html, opts);
}

/**
 * Regex-based implementation behind optimizeHtml().
 *
 * Steps, in order: drop DOCTYPE; drop <script>/<style>/<meta> unless listed in
 * `excludeTags`; drop comments; strip attributes (tags in `excludeTags` keep
 * theirs); collapse whitespace; repeatedly remove empty elements; rewrite
 * `<x/>` as `<x>`; trim.
 *
 * @param {string} html
 * @param {object} opts - Fully merged options.
 * @returns {string}
 */
function optimizeWithRegex(html, opts) {
  const excludeTags = normalizeExcludeTags(opts.excludeTags);
  // Tag names are case-insensitive in HTML and every regex below uses the `i`
  // flag, so the membership checks are case-insensitive as well.
  const excluded = new Set(excludeTags.map((tag) => tag.toLowerCase()));

  let result = html.replace(/<!DOCTYPE[^>]*>/gi, "");
  if (!excluded.has("script")) {
    result = result.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, "");
  }
  if (!excluded.has("style")) {
    result = result.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, "");
  }
  if (!excluded.has("meta")) {
    result = result.replace(/<meta\b[^>]*>/gi, "");
  }
  if (opts.removeComments) {
    result = result.replace(/<!--[\s\S]*?-->/g, "");
  }

  if (!opts.keepAttributes) {
    let openingTagWithAttributes = /<(\w+)\s[^>]*>/g;
    if (excludeTags.length > 0) {
      // Names are escaped so unusual input cannot break or hijack the pattern.
      const alternation = excludeTags.map(escapeRegExp).join("|");
      openingTagWithAttributes = new RegExp(`<((?!(?:${alternation})\\b)\\w+)\\s[^>]*>`, "gi");
    }
    result = result.replace(openingTagWithAttributes, "<$1>");
  }

  if (opts.minifyText && !opts.preserveWhitespace) {
    result = result
      .replace(/>\s+</g, "><")
      .replace(/\s+/g, " ")
      .replace(/>(\s+)/g, ">")
      .replace(/(\s+)</g, "<");
  }

  if (opts.removeEmpty) {
    // Removing an empty element can leave its parent empty, so iterate until
    // nothing changes.
    let previous;
    do {
      previous = result;
      result = result.replace(/<(\w+)>\s*<\/\1>/g, "");
    } while (result !== previous);
  }

  result = result.replace(/<(\w+)\s*\/>/g, "<$1>");
  return result.trim();
}
//#endregion

//#region src/utils/file-handler.ts
/**
 * Optimize one HTML file and write the result, creating the output directory
 * if needed.
 *
 * @param {string} inputPath
 * @param {string} outputPath
 * @param {object} [options] - Passed through to optimizeHtml().
 * @returns {Promise<void>}
 * @throws {Error} "Input file not found: <path>" when the input is missing;
 *   any other filesystem error is rethrown unchanged.
 */
async function optimizeHtmlFile(inputPath, outputPath, options) {
  let inputContent;
  // Only the read is guarded: an ENOENT raised while writing the output must
  // not be reported as a missing input file.
  try {
    inputContent = await node_fs.promises.readFile(inputPath, "utf-8");
  } catch (error) {
    if (error.code === "ENOENT") {
      throw new Error(`Input file not found: ${inputPath}`, { cause: error });
    }
    throw error;
  }
  const optimizedContent = optimizeHtml(inputContent, options);
  await node_fs.promises.mkdir(node_path.dirname(outputPath), { recursive: true });
  await node_fs.promises.writeFile(outputPath, optimizedContent, "utf-8");
}

/**
 * Optimize every `.html` file under `inputDir`, mirroring the directory tree
 * under `outputDir`.
 *
 * @param {string} inputDir
 * @param {string} outputDir
 * @param {object} [options] - Passed through to optimizeHtml().
 * @returns {Promise<void>}
 */
async function optimizeHtmlDir(inputDir, outputDir, options) {
  await processDirectory(inputDir, outputDir, options);
}

/**
 * Recursive worker for optimizeHtmlDir().
 *
 * @param {string} inputDir - Root of the input tree.
 * @param {string} outputDir - Root of the output tree.
 * @param {object} [options] - Passed through to optimizeHtml().
 * @param {string} [relativePath] - Sub-path currently being processed.
 * @returns {Promise<void>}
 */
async function processDirectory(inputDir, outputDir, options, relativePath = "") {
  const currentInputDir = node_path.join(inputDir, relativePath);
  const currentOutputDir = node_path.join(outputDir, relativePath);
  // Read first so a missing input directory fails before anything is created.
  const entries = await node_fs.promises.readdir(currentInputDir, { withFileTypes: true });
  await node_fs.promises.mkdir(currentOutputDir, { recursive: true });
  const outputRoot = node_path.resolve(outputDir);

  for (const entry of entries) {
    const inputPath = node_path.join(currentInputDir, entry.name);
    if (entry.isDirectory()) {
      // When the output tree lives inside the input tree, descending into it
      // would re-process generated files and recurse without end.
      if (node_path.resolve(inputPath) === outputRoot) continue;
      await processDirectory(inputDir, outputDir, options, node_path.join(relativePath, entry.name));
    } else if (entry.isFile() && node_path.extname(entry.name).toLowerCase() === ".html") {
      await optimizeHtmlFile(inputPath, node_path.join(currentOutputDir, entry.name), options);
    }
  }
}
//#endregion

//#region src/cli.ts
/**
 * Parse command-line arguments and run the requested optimization.
 * On failure the error message is printed and the process exits with code 1.
 *
 * @param {string[]} [args] - Full argv array; defaults to process.argv.
 * @returns {Promise<void>}
 */
async function runCli(args = process.argv) {
  const { Command } = require("commander");
  const program = new Command();
  const packageJsonPath = node_path.resolve(__dirname, "../package.json");
  const packageJson = JSON.parse(await node_fs.promises.readFile(packageJsonPath, "utf-8"));
  program
    .name("html-rag-optimizer")
    .description("HTML optimization tool for RAG (Retrieval-Augmented Generation) systems")
    .version(packageJson.version)
    .argument("[input]", "Input HTML file")
    .option("-o, --output <path>", "Output file or directory")
    .option("--input-dir <path>", "Input directory")
    .option("--output-dir <path>", "Output directory")
    .option("--keep-attributes", "Keep tag attributes")
    .option("--exclude-tags <tags>", "Exclude tags from removal (comma-separated)")
    .option("--preserve-whitespace", "Preserve whitespace")
    .option("--config <path>", "Configuration file path")
    .action(async (input, options) => {
      try {
        await handleCliAction(input, options);
      } catch (error) {
        console.error("Error:", error.message);
        process.exit(1);
      }
    });
  await program.parseAsync(args);
}

/**
 * Execute one CLI invocation.
 *
 * Options are built from the flags first; keys from the JSON file given via
 * `--config` are then assigned on top and therefore take precedence.
 * Directory mode runs when both `inputDir` and `outputDir` are set; otherwise
 * a single `input` file is written to `options.output`.
 *
 * @param {string|undefined} input - Positional input file.
 * @param {object} options - Parsed flags (output, inputDir, outputDir,
 *   keepAttributes, excludeTags, preserveWhitespace, config).
 * @returns {Promise<void>}
 * @throws {Error} When required paths are missing or processing fails.
 */
async function handleCliAction(input, options) {
  const optimizeOptions = {
    keepAttributes: options.keepAttributes || false,
    excludeTags: options.excludeTags
      ? options.excludeTags
          .split(",")
          .map((tag) => tag.trim())
          .filter((tag) => tag !== "")
      : [],
    preserveWhitespace: options.preserveWhitespace || false,
  };

  if (options.config) {
    const configContent = await node_fs.promises.readFile(options.config, "utf-8");
    Object.assign(optimizeOptions, JSON.parse(configContent));
  }

  if (options.inputDir && options.outputDir) {
    await optimizeHtmlDir(options.inputDir, options.outputDir, optimizeOptions);
    console.log(`Directory optimization completed: ${options.inputDir} -> ${options.outputDir}`);
    return;
  }

  if (!input) {
    if (options.inputDir || options.outputDir) {
      throw new Error("Both --input-dir and --output-dir are required for directory processing");
    }
    throw new Error("Input file or --input-dir is required");
  }
  if (!options.output) {
    throw new Error("Output file (-o) is required for single file processing");
  }

  await optimizeHtmlFile(input, options.output, optimizeOptions);
  console.log(`Optimization completed: ${input} -> ${options.output}`);
}
//#endregion

// Keep one literal defineProperty call per export: this exact shape is what
// static CommonJS export detection recognises for ESM named imports.
Object.defineProperty(exports, 'handleCliAction', {
  enumerable: true,
  get: function () {
    return handleCliAction;
  }
});
Object.defineProperty(exports, 'optimizeHtml', {
  enumerable: true,
  get: function () {
    return optimizeHtml;
  }
});
Object.defineProperty(exports, 'optimizeHtmlDir', {
  enumerable: true,
  get: function () {
    return optimizeHtmlDir;
  }
});
Object.defineProperty(exports, 'optimizeHtmlFile', {
  enumerable: true,
  get: function () {
    return optimizeHtmlFile;
  }
});
Object.defineProperty(exports, 'runCli', {
  enumerable: true,
  get: function () {
    return runCli;
  }
});