boox-cli
Version:
A command-line interface (CLI) for training and searching Boox datasets.
217 lines (216 loc) • 7.94 kB
JavaScript
import { program } from "commander";
import { readFile, access, constants, mkdir, writeFile } from "node:fs/promises";
import { resolve, relative, join, dirname, basename, extname } from "path";
import Boox from "boox";
import { oraPromise } from "ora";
import { deflate, gzip, inflate, ungzip } from "pako";
import { loadRc } from "rcfy";
/**
 * Work out a concurrency hint from the CPUs available to this process.
 *
 * When a `navigator` global with a truthy `hardwareConcurrency` exists, twice
 * that value is returned; otherwise the length of `os.cpus()` is used.
 *
 * @returns {Promise<number>} A positive number; 8 when detection throws or
 *   reports no CPUs.
 */
async function o() {
  const FALLBACK_CONCURRENCY = 8;
  try {
    if (typeof navigator !== "undefined" && navigator.hardwareConcurrency) {
      return navigator.hardwareConcurrency * 2;
    }
    const cpuCount = (await import("os")).cpus().length;
    // Guard against an empty CPU list: returning 0 would make any caller
    // that steps through an array by this amount never advance.
    if (cpuCount > 0) {
      return cpuCount;
    }
  } catch (error) {
    console.error("Error determining available CPUs:", error);
  }
  return FALLBACK_CONCURRENCY;
}
/**
 * Split an array into consecutive chunks.
 *
 * @param {Array} r - Items to split; read through `length` and `slice`.
 * @param {number} [n] - Maximum chunk length. When falsy (omitted, 0, ...)
 *   the value returned by `o()` is used instead.
 * @returns {Promise<Array<Array>>} The chunks, in original order; an empty
 *   array when `r` is empty.
 */
async function c(r, n) {
  const requestedSize = n || (await o());
  // Clamp to a positive integer: a zero, negative or NaN step would never
  // move the cursor past the end of `r`, and a fractional step would make
  // `slice` produce uneven chunks.
  const chunkSize = Math.max(1, Math.floor(requestedSize) || 1);
  const chunks = [];
  for (let start = 0; start < r.length; start += chunkSize) {
    chunks.push(r.slice(start, start + chunkSize));
  }
  return chunks;
}
/**
 * Describe how much time has passed since `startTime`.
 *
 * @param {Date} [startTime] - Moment the measurement started.
 * @returns {string} `"0s"` when no start time is given, otherwise the
 *   elapsed milliseconds rendered by `formatTime`.
 */
function getElapsedTime(startTime) {
  if (startTime) {
    const elapsedMs = Date.now() - startTime.getTime();
    return formatTime(elapsedMs);
  }
  return "0s";
}
/**
 * Render a duration as `"HHh MMm SSs"`.
 *
 * The hour and minute parts are left out when they are not greater than
 * zero; the seconds part is always present. Each number is left-padded with
 * zeros to at least two characters.
 *
 * @param {number} time - Duration in milliseconds.
 * @returns {string} The formatted duration, parts separated by one space.
 */
function formatTime(time) {
  const pad = (value) => value.toString().padStart(2, "0");
  const wholeSeconds = Math.floor(time / 1e3);
  const h = Math.floor(wholeSeconds / 3600);
  const m = Math.floor((wholeSeconds % 3600) / 60);
  const s = wholeSeconds % 60;

  const parts = [];
  if (h > 0) {
    parts.push(`${pad(h)}h`);
  }
  if (m > 0) {
    parts.push(`${pad(m)}m`);
  }
  parts.push(`${pad(s)}s`);
  return parts.join(" ");
}
/**
 * Report the size of `data` as a human-readable string.
 *
 * Strings are measured as-is; any other value is serialised with
 * `JSON.stringify` first. The size is the UTF-8 byte length, so multi-byte
 * characters are counted in full rather than as UTF-16 code units.
 *
 * @param {*} data - A string, or any JSON-serialisable value.
 * @returns {string} The size as rendered by `formatBytes`.
 */
function getDataSize(data) {
  // JSON.stringify yields undefined for values such as `undefined` or a
  // function; treat those as empty instead of throwing.
  const serialized = typeof data === "string" ? data : (JSON.stringify(data) ?? "");
  const sizeInBytes = Buffer.byteLength(serialized, "utf8");
  return formatBytes(sizeInBytes);
}
/**
 * Format a byte count using binary (1024-based) units.
 *
 * @param {number} bytes - Number of bytes.
 * @param {number} [decimals=2] - Maximum fraction digits; negative values
 *   are treated as 0. Trailing zeros are dropped.
 * @returns {string} For example `"1.5 KB"`. Zero gives `"0 Bytes"`; values
 *   whose magnitude is below 1024 (including fractions) stay in `Bytes`;
 *   values beyond the largest unit are expressed in `YB`; a non-finite input
 *   is echoed back with the `Bytes` unit.
 */
function formatBytes(bytes, decimals = 2) {
  const value = Number(bytes);
  if (value === 0) return "0 Bytes";
  if (!Number.isFinite(value)) return `${value} Bytes`;

  const k = 1024;
  const dm = decimals < 0 ? 0 : decimals;
  const sizes = ["Bytes", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"];

  // Repeated division by 1024 is exact in floating point and keeps the unit
  // index inside `sizes`, unlike flooring a ratio of logarithms.
  let magnitude = Math.abs(value);
  let unitIndex = 0;
  while (magnitude >= k && unitIndex < sizes.length - 1) {
    magnitude /= k;
    unitIndex++;
  }

  const scaled = Math.sign(value) * magnitude;
  return `${parseFloat(scaled.toFixed(dm))} ${sizes[unitIndex]}`;
}
// Value passed as the `level` option to the pako compressor (deflate or gzip)
// when the trained state is written out.
const DEFAULT_COMPRESSION_LEVEL = 6;
/**
 * Train a Boox instance on a JSON dataset and save its compressed state.
 *
 * The dataset file must contain a JSON array; every element is passed to
 * `boox.addDocumentSync`. The serialised `boox.currentState` is compressed
 * with pako (gzip by default, deflate when `isDeflate` is set) and written as
 * `<source name without extension>-trained.gz` (or `-trained.dat`).
 *
 * @param {string} src - Path to the JSON dataset; relative paths are resolved
 *   against `cwd`, absolute paths are used as given.
 * @param {string} [dest] - Output directory, resolved like `src`. Defaults to
 *   the directory of the source file.
 * @param {object} [settings]
 * @param {string} [settings.rcname="boox"] - Name handed to `loadRc`.
 * @param {string} [settings.cwd] - Base directory; defaults to
 *   `process.cwd()`.
 * @param {string} [settings.id="id"] - Forwarded to the Boox constructor.
 * @param {string[]} [settings.features=["text"]] - Forwarded to Boox.
 * @param {string[]} [settings.attributes=[]] - Forwarded to Boox.
 * @param {*} [settings.modelOptions] - Forwarded to Boox.
 * @param {boolean} [settings.isDeflate=false] - Use deflate and a `.dat`
 *   suffix instead of gzip and `.gz`.
 * @returns {Promise<void>}
 * @throws {TypeError} When the dataset file does not contain a JSON array.
 */
async function trainDataset(src, dest, { rcname = "boox", cwd, ...options } = {}) {
  const resolvedCwd = cwd ? resolve(cwd) : process.cwd();
  const userConfig = await loadRc(rcname, resolvedCwd);
  // Keys found in the rc file win over the options passed by the caller.
  const {
    id = "id",
    features = ["text"],
    attributes = [],
    modelOptions,
    isDeflate = false
  } = { ...options, ...userConfig };

  // resolve() keeps an absolute src/dest intact, whereas join() would glue it
  // onto the working directory.
  const resolvedSrc = relative(process.cwd(), resolve(resolvedCwd, src));
  const outputDir = dest
    ? relative(process.cwd(), resolve(resolvedCwd, dest))
    : dirname(resolvedSrc);
  // Strip only the trailing extension; a plain String#replace would touch the
  // first occurrence and would prepend the suffix when there is no extension.
  const trainedSuffix = isDeflate ? "-trained.dat" : "-trained.gz";
  const trainedFile = join(outputDir, `${basename(src, extname(src))}${trainedSuffix}`);

  const boox = new Boox({ id, features, attributes, modelOptions });

  const datasets = await oraPromise(
    async () => JSON.parse(await readFile(resolvedSrc, "utf8")),
    {
      text: "Reading data...",
      successText(data) {
        return `Reading ${getDataSize(data)} data!`;
      }
    }
  );
  if (!Array.isArray(datasets)) {
    // Anything else would otherwise train zero documents without warning.
    throw new TypeError(`Expected ${resolvedSrc} to contain a JSON array of documents`);
  }

  const batches = await c(datasets);
  const progress = {
    current: 0,
    length: datasets.length
  };
  const startTime = new Date();

  await oraPromise(
    async (spinner) => {
      // Documents are added synchronously, in dataset order.
      for (const batch of batches) {
        for (const dataset of batch) {
          progress.current++;
          spinner.text = `Training ${resolvedSrc} ${progress.current} of ${progress.length} - ${getElapsedTime(startTime)}`;
          spinner.render();
          boox.addDocumentSync(dataset);
        }
      }
    },
    {
      text: "Start training...",
      successText() {
        return `Trained ${progress.current} documents in ${getElapsedTime(startTime)}`;
      }
    }
  );

  const compressor = isDeflate ? deflate : gzip;
  const state = JSON.stringify(boox.currentState);
  const compressedState = compressor(state, {
    level: DEFAULT_COMPRESSION_LEVEL
  });

  await oraPromise(
    async () => {
      // With `recursive: true`, mkdir succeeds when the directory already
      // exists, so no separate existence check is needed.
      await mkdir(dirname(trainedFile), { recursive: true });
      await writeFile(trainedFile, compressedState);
    },
    {
      text: "Saving...",
      successText: `Saved ${getDataSize(state)} state to ${trainedFile}`
    }
  );
}
/**
 * Load a trained state file and run a query against it.
 *
 * The file is read, decompressed with pako (ungzip by default, inflate when
 * `isDeflate` is set), parsed as JSON and assigned to `boox.currentState`.
 * Timings for loading and searching are printed with `console.time`.
 *
 * @param {string} src - Path to the trained state file; relative paths are
 *   resolved against `cwd`, absolute paths are used as given.
 * @param {string} query - Query passed to `boox.search`.
 * @param {object} [settings]
 * @param {string} [settings.rcname="boox"] - Name handed to `loadRc`.
 * @param {string} [settings.cwd] - Base directory; defaults to
 *   `process.cwd()`.
 * @param {*} [settings.modelOptions] - Forwarded to the Boox constructor.
 * @param {boolean} [settings.isDeflate=false] - Decompress with inflate
 *   instead of ungzip.
 * @param {number|string} [settings.offset=1] - Coerced to a number and passed
 *   to `Boox.paginateSearchResults`.
 * @param {number|string} [settings.length=10] - Coerced to a number and
 *   passed to `Boox.paginateSearchResults`.
 * @returns {Promise<*>} Whatever `Boox.paginateSearchResults` returns.
 */
async function searchDataset(src, query, {
  rcname = "boox",
  cwd,
  ...options
} = {}) {
  const resolvedCwd = cwd ? resolve(cwd) : process.cwd();
  const userConfig = await loadRc(rcname, resolvedCwd);
  // Keys found in the rc file win over the options passed by the caller.
  const {
    modelOptions,
    isDeflate = false,
    offset = 1,
    length = 10
  } = { ...options, ...userConfig };

  // resolve() keeps an absolute src intact, whereas join() would glue it onto
  // the working directory.
  const resolvedSrc = relative(process.cwd(), resolve(resolvedCwd, src));
  const decompressor = isDeflate ? inflate : ungzip;

  console.time("Loading state");
  const compressedState = await readFile(resolvedSrc);
  const decompressedState = decompressor(compressedState, { to: "string" });
  const boox = new Boox({ modelOptions });
  const state = JSON.parse(decompressedState);
  boox.currentState = state;
  console.timeEnd("Loading state");
  console.info("State size:", getDataSize(decompressedState));

  // Search options come from a separate rc name, "boox-results".
  const resultsConfig = await loadRc("boox-results", resolvedCwd);
  console.time("Search in");
  const results = await boox.search(query, resultsConfig);
  console.timeEnd("Search in");
  console.log();

  return Boox.paginateSearchResults(results, +offset, +length);
}
// Register the `train` sub-command. The `--deflate` flag is handed to
// trainDataset as `isDeflate`; every other parsed option is forwarded as-is.
// Any failure is printed and the process exits with status 1.
program
  .command("train <source> [destination]")
  .description("Train a Boox dataset")
  .option("-i, --id <field>", "Field to use as document ID", "id")
  .option("-f, --features <fields...>", "Fields to index for search")
  .option("-a, --attributes <fields...>", "Fields to include as-is")
  .option("-d, --deflate", 'Compress the trained data as a ".dat" file', false)
  .option("-c, --cwd <folder>", "Working directory", process.cwd())
  .option("-r, --rcname <name>", "Name of the Boox configuration file", "boox")
  .action(async (source, destination, cliOptions) => {
    const { deflate: useDeflate, ...forwarded } = cliOptions;
    try {
      await trainDataset(source, destination, { isDeflate: useDeflate, ...forwarded });
    } catch (failure) {
      console.error(failure);
      process.exit(1);
    }
  });
// Register the `search` sub-command.
//
// Without `--context` the paginated results object returned by searchDataset
// is printed as-is. With `--context <field>` (optionally `field::maxlength`,
// maxlength defaulting to 160) each result is printed as a block with the
// requested attributes, its keywords and a text excerpt, framed by a page
// summary before and after. Any failure is printed and the process exits
// with status 1.
program
  .command("search <source> <query>")
  .description("Search a trained Boox dataset")
  .option("-o, --offset <number>", "Offset for pagination", "1")
  .option("-l, --length <number>", "Number of results per page", "10")
  .option(
    "-k, --context <field>",
    "Display the context instead of paginated results object"
  )
  .option(
    "-a, --attrs <fields...>",
    'Fields to display when "--context" is provided'
  )
  .option(
    "-d, --deflate",
    'Assume the trained data is deflated as ".dat" file',
    false
  )
  .option("-c, --cwd <folder>", "Working directory", process.cwd())
  .option("-r, --rcname <name>", "Name of the Boox configuration file", "boox")
  .action(async (src, query, { context, attrs = [], deflate: deflate2, ...options }) => {
    try {
      const paginateResults = await searchDataset(src, query, {
        isDeflate: deflate2,
        ...options
      });

      if (typeof context !== "string") {
        console.log(paginateResults);
        return;
      }

      const { currentPage, totalPages, totalResults, results } = paginateResults;
      const summary = `Page ${currentPage} of ${totalPages}, Showing ${results.length} of ${totalResults} results\n`;
      const separator = "=".repeat(30);
      // The context spec is the same for every result, so parse it once.
      const [field, maxlength = 160] = context.split("::");

      console.log(`\n${summary}`);
      console.log(separator, "\n");
      for (const result of results) {
        const { keywords, text } = result.context(field, +maxlength);
        // Drop attributes the result does not have before formatting them;
        // filtering the formatted strings would never remove anything
        // because "name: undefined" is still a non-empty string.
        const meta = attrs
          .filter((attr) => result.attributes[attr] != null)
          .map((attr) => `${attr}: ${result.attributes[attr]}`);
        console.log(...meta, Array.from(keywords), "\n");
        console.log(`${text}...`, "\n");
        console.log(separator, "\n");
      }
      console.log(summary);
    } catch (error) {
      console.error(error);
      process.exit(1);
    }
  });
// Start commander's argument parsing; this must come after the sub-commands
// above have been registered on `program`.
program.parse();