@harutakax/html-rag-optimizer
Version:
HTML optimization tool for RAG (Retrieval-Augmented Generation) systems
176 lines (171 loc) • 7.34 kB
JavaScript
//#region rolldown:runtime
// Short aliases for the Object built-ins used by the interop helpers below.
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;
/**
 * Exposes the own properties of `from` on `to` as getters that read through to `from`.
 * Nothing is copied unless `from` is a non-null object or a function.
 * Keys that `to` already owns, and the key equal to `except`, are skipped.
 * `desc` is only used as a scratch variable for the property descriptor lookup.
 * Returns `to`.
 */
var __copyProps = (to, from, except, desc) => {
if (from && typeof from === "object" || typeof from === "function") for (var keys = __getOwnPropNames(from), i = 0, n = keys.length, key; i < n; i++) {
key = keys[i];
// Define a getter instead of copying the value, so reads on `to` always reflect `from[key]`.
if (!__hasOwnProp.call(to, key) && key !== except) __defProp(to, key, {
get: ((k) => from[k]).bind(null, key),
// Enumerable when no descriptor is found; otherwise mirror the source property's enumerability.
enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable
});
}
return to;
};
/**
 * Wraps a module value in a new object for namespace-style access.
 * The new object inherits from `mod`'s prototype, or is a plain object when `mod` is null/undefined.
 * A `default` property holding `mod` itself is added when `isNodeMode` is truthy, `mod` is falsy,
 * or `mod.__esModule` is falsy. The own properties of `mod` are then exposed via `__copyProps`.
 */
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", {
value: mod,
enumerable: true
}) : target, mod));
//#endregion
const node_fs = __toESM(require("node:fs"));
const node_path = __toESM(require("node:path"));
const node_url = __toESM(require("node:url"));
const commander = __toESM(require("commander"));
//#region src/optimizer.ts
/**
 * Baseline option values. `optimizeHtml` spreads the caller's options over these,
 * so any key the caller omits falls back to the value listed here.
 */
const DEFAULT_OPTIONS = {
// When false, attributes are stripped from tags (except tags named in excludeTags).
keepAttributes: false,
// When true, elements with no content (or only whitespace) are removed repeatedly until none remain.
removeEmpty: true,
// When true, the whitespace-collapsing step is skipped even if minifyText is on.
preserveWhitespace: false,
// Tag names exempt from attribute stripping; listing "script", "style" or "meta" also keeps those elements.
excludeTags: [],
// When true, HTML comments are removed.
removeComments: true,
// When true (and preserveWhitespace is false), whitespace runs are collapsed and trimmed around tags.
minifyText: true
};
/**
 * Optimizes an HTML string by merging the caller's options over the defaults
 * and delegating to the regex-based optimizer.
 *
 * @param {string} html - HTML source text.
 * @param {object} [options] - Partial overrides for DEFAULT_OPTIONS. Keys whose
 *   value is `undefined` are ignored so they cannot erase a default.
 * @returns {string} The optimized HTML, or "" when the input is empty or whitespace-only.
 * @throws {TypeError} When `html` is not a string.
 */
function optimizeHtml(html, options = {}) {
  if (typeof html !== "string") {
    const received = html === null ? "null" : typeof html;
    throw new TypeError(`optimizeHtml expects an HTML string, received ${received}`);
  }
  // Nothing to optimize; bail out before doing any option work.
  if (!html.trim()) return "";
  // An explicit `undefined` (common with optional fields) must not overwrite a
  // default, otherwise e.g. `excludeTags: undefined` breaks the optimizer later.
  const overrides = Object.fromEntries(
    Object.entries(options || {}).filter(([, value]) => value !== undefined)
  );
  const opts = {
    ...DEFAULT_OPTIONS,
    ...overrides
  };
  return optimizeWithRegex(html, opts);
}
/**
 * Regex-based HTML reducer. Applies, in order: DOCTYPE removal, removal of
 * script/style/meta elements, comment removal, attribute stripping, whitespace
 * minification, empty-element removal, and `<x/>` to `<x>` normalization.
 *
 * This is pattern matching, not parsing: constructs such as `>` inside an
 * attribute value are not handled specially.
 *
 * @param {string} html - HTML source text.
 * @param {object} opts - Resolved options (keepAttributes, removeEmpty,
 *   preserveWhitespace, excludeTags, removeComments, minifyText).
 * @returns {string} The reduced HTML, trimmed.
 */
function optimizeWithRegex(html, opts) {
  const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

  // Normalize the exclusion list once: tolerate a missing list, ignore case and
  // surrounding blanks, and drop empty entries. An empty entry would otherwise
  // produce an empty regex alternative that matches every tag and silently
  // disables attribute stripping altogether.
  const excludeTags = (Array.isArray(opts.excludeTags) ? opts.excludeTags : [])
    .map((tag) => String(tag).trim().toLowerCase())
    .filter((tag) => tag !== "");
  const isExcluded = (tag) => excludeTags.includes(tag);

  let result = html.replace(/<!DOCTYPE[^>]*>/gi, "");

  if (!isExcluded("script")) {
    result = result.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, "");
  }
  if (!isExcluded("style")) {
    result = result.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, "");
  }
  if (!isExcluded("meta")) {
    result = result.replace(/<meta\b[^>]*>/gi, "");
  }
  if (opts.removeComments) {
    result = result.replace(/<!--[\s\S]*?-->/g, "");
  }

  if (!opts.keepAttributes) {
    if (excludeTags.length > 0) {
      // Tag names are escaped so regex metacharacters in user input cannot
      // break (or change the meaning of) the generated pattern.
      const excludePattern = excludeTags.map(escapeRegExp).join("|");
      const attributeRegex = new RegExp(`<((?!(?:${excludePattern})\\b)\\w+)\\s[^>]*>`, "gi");
      result = result.replace(attributeRegex, "<$1>");
    } else {
      result = result.replace(/<(\w+)\s[^>]*>/g, "<$1>");
    }
  }

  if (opts.minifyText && !opts.preserveWhitespace) {
    result = result.replace(/>\s+</g, "><");
    result = result.replace(/\s+/g, " ");
    result = result.replace(/>(\s+)/g, ">");
    result = result.replace(/(\s+)</g, "<");
  }

  if (opts.removeEmpty) {
    // Removing an empty element can leave its parent empty, so repeat until
    // the text stops changing.
    let previous;
    do {
      previous = result;
      result = result.replace(/<(\w+)>\s*<\/\1>/g, "");
    } while (previous !== result);
  }

  // Normalize attribute-less self-closing tags: <br/> becomes <br>.
  result = result.replace(/<(\w+)\s*\/>/g, "<$1>");
  return result.trim();
}
//#endregion
//#region src/utils/file-handler.ts
/**
 * Reads an HTML file, optimizes its content and writes the result to
 * `outputPath`, creating the output directory when needed.
 *
 * @param {string} inputPath - File to read (UTF-8).
 * @param {string} outputPath - File to write (UTF-8).
 * @param {object} [options] - Options forwarded to `optimizeHtml`.
 * @returns {Promise<void>}
 * @throws {Error} "Input file not found: <path>" when the input does not
 *   exist; any other read, optimize or write error is propagated unchanged.
 */
async function optimizeHtmlFile(inputPath, outputPath, options) {
  let inputContent;
  // Only the read is guarded: an ENOENT raised while writing the output must
  // not be misreported as a missing input file.
  try {
    inputContent = await node_fs.promises.readFile(inputPath, "utf-8");
  } catch (error) {
    if (error.code === "ENOENT") {
      throw new Error(`Input file not found: ${inputPath}`, { cause: error });
    }
    throw error;
  }
  const optimizedContent = optimizeHtml(inputContent, options);
  const outputDir = (0, node_path.dirname)(outputPath);
  await node_fs.promises.mkdir(outputDir, { recursive: true });
  await node_fs.promises.writeFile(outputPath, optimizedContent, "utf-8");
}
/**
 * Optimizes a directory tree by handing the input and output roots to
 * `processDirectory`, which starts at the top of the tree.
 *
 * @param {string} inputDir - Root directory to read from.
 * @param {string} outputDir - Root directory to write to.
 * @param {object} [options] - Options forwarded to the per-file optimizer.
 * @returns {Promise<void>} Settles once the walk has finished.
 */
async function optimizeHtmlDir(inputDir, outputDir, options) {
  // The fourth argument is omitted so the walk begins at the roots themselves.
  const walk = processDirectory(inputDir, outputDir, options);
  await walk;
}
/**
 * Recursively mirrors `inputDir` into `outputDir`, optimizing every file whose
 * extension is `.html` (case-insensitive). Other files are ignored; every
 * visited directory is created on the output side.
 *
 * @param {string} inputDir - Root directory to read from.
 * @param {string} outputDir - Root directory to write to.
 * @param {object} [options] - Options forwarded to `optimizeHtmlFile`.
 * @param {string} [relativePath=""] - Path of the directory being processed,
 *   relative to both roots. Used by the recursion.
 * @returns {Promise<void>}
 */
async function processDirectory(inputDir, outputDir, options, relativePath = "") {
  const currentInputDir = (0, node_path.join)(inputDir, relativePath);
  const currentOutputDir = (0, node_path.join)(outputDir, relativePath);
  const outputRoot = (0, node_path.resolve)(outputDir);
  await node_fs.promises.mkdir(currentOutputDir, { recursive: true });
  const entries = await node_fs.promises.readdir(currentInputDir, { withFileTypes: true });
  for (const entry of entries) {
    const inputPath = (0, node_path.join)(currentInputDir, entry.name);
    const outputPath = (0, node_path.join)(currentOutputDir, entry.name);
    if (entry.isDirectory()) {
      // When the output root lives inside the input tree, descending into it
      // would keep mirroring the output into itself (out/out/out/...) forever.
      if ((0, node_path.resolve)(inputPath) === outputRoot) continue;
      const entryRelativePath = (0, node_path.join)(relativePath, entry.name);
      await processDirectory(inputDir, outputDir, options, entryRelativePath);
    } else if (entry.isFile() && (0, node_path.extname)(entry.name).toLowerCase() === ".html") {
      await optimizeHtmlFile(inputPath, outputPath, options);
    }
  }
}
//#endregion
//#region src/cli.ts
// Absolute path of this file: the CommonJS `__filename` is converted to a file:// URL and back to a path.
const __filename$1 = (0, node_url.fileURLToPath)(require("url").pathToFileURL(__filename).href);
// Directory containing this file; `runCli` resolves "../package.json" against it.
const __dirname$1 = (0, node_path.dirname)(__filename$1);
/**
 * Builds the `html-rag-optimizer` command-line interface and runs it.
 *
 * The version string is read from the `package.json` one level above this
 * file's directory. If the action fails, the error message is printed to
 * stderr and the process exits with status 1.
 *
 * @param {string[]} [args=process.argv] - Argument vector handed to the parser.
 * @returns {Promise<void>} Settles once parsing and the action have finished.
 */
async function runCli(args = process.argv) {
  const cli = new commander.Command();
  const manifestPath = (0, node_path.resolve)(__dirname$1, "../package.json");
  const manifest = JSON.parse(await node_fs.promises.readFile(manifestPath, "utf-8"));

  // Report failures as a one-line message and a non-zero exit status.
  const onAction = async (input, options) => {
    try {
      await handleCliAction(input, options);
    } catch (error) {
      console.error("Error:", error.message);
      process.exit(1);
    }
  };

  cli
    .name("html-rag-optimizer")
    .description("HTML optimization tool for RAG (Retrieval-Augmented Generation) systems")
    .version(manifest.version)
    .argument("[input]", "Input HTML file")
    .option("-o, --output <path>", "Output file or directory")
    .option("--input-dir <path>", "Input directory")
    .option("--output-dir <path>", "Output directory")
    .option("--keep-attributes", "Keep tag attributes")
    .option("--exclude-tags <tags>", "Exclude tags from removal (comma-separated)")
    .option("--preserve-whitespace", "Preserve whitespace")
    .option("--config <path>", "Configuration file path")
    .action(onAction);

  await cli.parseAsync(args);
}
/**
 * Executes one CLI invocation: builds the optimizer options from the parsed
 * flags (and an optional JSON config file), then runs either directory mode
 * or single-file mode.
 *
 * @param {string|undefined} input - Positional input file, if given.
 * @param {object} options - Parsed CLI flags (output, inputDir, outputDir,
 *   keepAttributes, excludeTags, preserveWhitespace, config).
 * @returns {Promise<void>}
 * @throws {Error} When the required input/output arguments are missing, or
 *   when reading or parsing the config file fails.
 */
async function handleCliAction(input, options) {
  const optimizeOptions = {
    keepAttributes: options.keepAttributes || false,
    // Drop empty entries so "div," or "a,,b" do not yield a "" tag name.
    excludeTags: options.excludeTags
      ? options.excludeTags
        .split(",")
        .map((tag) => tag.trim())
        .filter((tag) => tag !== "")
      : [],
    preserveWhitespace: options.preserveWhitespace || false
  };
  if (options.config) {
    const configContent = await node_fs.promises.readFile(options.config, "utf-8");
    const config = JSON.parse(configContent);
    // Values from the config file take precedence over the CLI flags above.
    Object.assign(optimizeOptions, config);
  }
  if (options.inputDir && options.outputDir) {
    await optimizeHtmlDir(options.inputDir, options.outputDir, optimizeOptions);
    console.log(`Directory optimization completed: ${options.inputDir} -> ${options.outputDir}`);
    return;
  }
  if (!input) {
    // --input-dir was supplied, so the missing piece is --output-dir; telling
    // the user that "--input-dir is required" would be misleading.
    if (options.inputDir) throw new Error("--output-dir is required when --input-dir is used");
    throw new Error("Input file or --input-dir is required");
  }
  if (!options.output) throw new Error("Output file (-o) is required for single file processing");
  await optimizeHtmlFile(input, options.output, optimizeOptions);
  console.log(`Optimization completed: ${input} -> ${options.output}`);
}
//#endregion
// Public CommonJS API. Each function is exposed through an enumerable getter
// on `exports` rather than by plain assignment.
Object.defineProperty(exports, 'handleCliAction', {
enumerable: true,
get: function () {
return handleCliAction;
}
});
Object.defineProperty(exports, 'optimizeHtml', {
enumerable: true,
get: function () {
return optimizeHtml;
}
});
Object.defineProperty(exports, 'optimizeHtmlDir', {
enumerable: true,
get: function () {
return optimizeHtmlDir;
}
});
Object.defineProperty(exports, 'optimizeHtmlFile', {
enumerable: true,
get: function () {
return optimizeHtmlFile;
}
});
Object.defineProperty(exports, 'runCli', {
enumerable: true,
get: function () {
return runCli;
}
});