UNPKG

boox-cli

Version:

A command-line interface (CLI) for training and searching Boox datasets.

217 lines (216 loc) 7.94 kB
#!/usr/bin/env node
import { program } from "commander";
import { readFile, access, constants, mkdir, writeFile } from "node:fs/promises";
import { resolve, relative, join, dirname, basename, extname } from "path";
import Boox from "boox";
import { oraPromise } from "ora";
import { deflate, gzip, inflate, ungzip } from "pako";
import { loadRc } from "rcfy";

/** pako compression level used when saving trained state (0-9). */
const DEFAULT_COMPRESSION_LEVEL = 6;

/**
 * Best-effort detection of available CPU parallelism.
 *
 * Prefers `navigator.hardwareConcurrency` (doubled) when present, otherwise
 * falls back to `os.cpus().length`, and finally to 8 if both probes fail.
 *
 * @returns {Promise<number>} Suggested batch size for training.
 */
async function getAvailableCpus() {
  try {
    if (typeof navigator !== "undefined" && navigator.hardwareConcurrency) {
      return navigator.hardwareConcurrency * 2;
    }
    return (await import("os")).cpus().length;
  } catch (error) {
    console.error("Error determining available CPUs:", error);
  }
  return 8;
}

/**
 * Split an array into consecutive chunks.
 *
 * @param {Array} items - Items to partition.
 * @param {number} [size] - Chunk size; any falsy value (including 0, which
 *   would otherwise cause an infinite loop) falls back to the CPU count.
 * @returns {Promise<Array[]>} Array of chunks, in original order.
 */
async function createBatches(items, size) {
  const chunkSize = size || (await getAvailableCpus());
  const batches = [];
  for (let i = 0; i < items.length; i += chunkSize) {
    batches.push(items.slice(i, i + chunkSize));
  }
  return batches;
}

/**
 * Human-readable elapsed time since `startTime`.
 *
 * @param {Date} [startTime] - Moment the operation began; falsy yields "0s".
 * @returns {string} e.g. "01m 05s".
 */
function getElapsedTime(startTime) {
  if (!startTime) {
    return "0s";
  }
  const elapsedTime = new Date().getTime() - startTime.getTime();
  return formatTime(elapsedTime);
}

/**
 * Format a millisecond duration as zero-padded "HHh MMm SSs",
 * omitting leading zero components (hours/minutes shown only when > 0).
 *
 * @param {number} time - Duration in milliseconds.
 * @returns {string}
 */
function formatTime(time) {
  const totalSeconds = Math.floor(time / 1e3);
  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor((totalSeconds % 3600) / 60);
  const seconds = totalSeconds % 60;
  return [
    hours > 0 ? `${hours.toString().padStart(2, "0")}h` : "",
    minutes > 0 ? `${minutes.toString().padStart(2, "0")}m` : "",
    `${seconds.toString().padStart(2, "0")}s`
  ]
    .filter(Boolean)
    .join(" ");
}

/**
 * Human-readable size of a value's JSON/string representation.
 *
 * Fix: measures actual UTF-8 byte length via `Buffer.byteLength` — the
 * previous `string.length` counted UTF-16 code units, under-reporting any
 * non-ASCII payload.
 *
 * @param {*} data - String or JSON-serializable value.
 * @returns {string} e.g. "7.94 KB".
 */
function getDataSize(data) {
  const dataStr = typeof data === "string" ? data : JSON.stringify(data);
  return formatBytes(Buffer.byteLength(dataStr, "utf8"));
}

/**
 * Format a byte count with binary (1024) unit prefixes.
 *
 * @param {number} bytes
 * @param {number} [decimals=2] - Fraction digits; negatives clamp to 0.
 * @returns {string}
 */
function formatBytes(bytes, decimals = 2) {
  if (bytes === 0) return "0 Bytes";
  const k = 1024;
  const dm = decimals < 0 ? 0 : decimals;
  const sizes = ["Bytes", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"];
  const i = Math.floor(Math.log(bytes) / Math.log(k));
  return parseFloat((bytes / Math.pow(k, i)).toFixed(dm)) + " " + sizes[i];
}

/**
 * Train a Boox index from a JSON dataset file and write the compressed state.
 *
 * @param {string} src - Dataset path, relative to `cwd`.
 * @param {string} [dest] - Output directory; defaults to the source's directory.
 * @param {object} [options]
 * @param {string} [options.rcname="boox"] - rc-file base name loaded via rcfy.
 * @param {string} [options.cwd] - Working directory; defaults to process.cwd().
 * @param {string} [options.id="id"] - Document ID field.
 * @param {string[]} [options.features=["text"]] - Indexed fields.
 * @param {string[]} [options.attributes=[]] - Stored-as-is fields.
 * @param {object} [options.modelOptions] - Passed through to Boox.
 * @param {boolean} [options.isDeflate=false] - Use deflate (".dat") instead of gzip (".gz").
 */
async function trainDataset(src, dest, { rcname = "boox", cwd, ...options } = {}) {
  const resolvedCwd = cwd ? resolve(cwd) : process.cwd();
  const userConfig = await loadRc(rcname, resolvedCwd);
  // NOTE(review): rc-file values override CLI flags here — confirm that
  // precedence is intended (most CLIs let explicit flags win).
  const {
    id = "id",
    features = ["text"],
    attributes = [],
    modelOptions,
    isDeflate = false
  } = { ...options, ...userConfig };

  const resolvedSrc = relative(process.cwd(), join(resolvedCwd, src));
  // Fix: build "<stem>-trained.<ext>" with basename(src, ext) — the previous
  // String.replace(extname, ...) replaced the first substring match (wrong for
  // names like "a.json.b.json") and prepended the suffix when src had no
  // extension at all.
  const suffix = isDeflate ? "-trained.dat" : "-trained.gz";
  const trainedFile = join(
    dest ? relative(process.cwd(), join(resolvedCwd, dest)) : dirname(resolvedSrc),
    basename(src, extname(src)) + suffix
  );

  const boox = new Boox({ id, features, attributes, modelOptions });

  const datasets = await oraPromise(
    async () => JSON.parse(await readFile(resolvedSrc, "utf8")),
    {
      text: "Reading data...",
      successText(data) {
        return `Reading ${getDataSize(data)} data!`;
      }
    }
  );

  const batches = await createBatches(datasets);
  const progress = { current: 0, length: datasets.length };
  const startTime = new Date();

  // addDocumentSync is synchronous, so the original Promise.all over nested
  // .map calls added no concurrency — plain loops keep the identical
  // execution order and progress reporting without the fake-async wrapper.
  await oraPromise(
    async (spinner) => {
      for (const batch of batches) {
        for (const dataset of batch) {
          progress.current++;
          spinner.text = `Training ${resolvedSrc} ${progress.current} of ${progress.length} - ${getElapsedTime(startTime)}`;
          spinner.render();
          boox.addDocumentSync(dataset);
        }
      }
    },
    {
      text: "Start training...",
      successText() {
        return `Trained ${progress.current} documents in ${getElapsedTime(startTime)}`;
      }
    }
  );

  const compressor = isDeflate ? deflate : gzip;
  const state = JSON.stringify(boox.currentState);
  const compressedState = compressor(state, { level: DEFAULT_COMPRESSION_LEVEL });

  await oraPromise(
    async () => {
      // mkdir with { recursive: true } is idempotent, so the former
      // access(F_OK)-then-mkdir dance (a TOCTOU race) is unnecessary.
      await mkdir(dirname(trainedFile), { recursive: true });
      return writeFile(trainedFile, compressedState);
    },
    {
      text: "Saving...",
      successText: `Saved ${getDataSize(state)} state to ${trainedFile}`
    }
  );
}

/**
 * Load a trained Boox state and run a search query against it.
 *
 * @param {string} src - Trained state file (".gz" or ".dat"), relative to `cwd`.
 * @param {string} query - Search query.
 * @param {object} [options]
 * @param {string} [options.rcname="boox"] - rc-file base name.
 * @param {string} [options.cwd] - Working directory.
 * @param {object} [options.modelOptions] - Passed through to Boox.
 * @param {boolean} [options.isDeflate=false] - Decompress with inflate instead of ungzip.
 * @param {number|string} [options.offset=1] - Page number (1-based).
 * @param {number|string} [options.length=10] - Results per page.
 * @returns {Promise<object>} Paginated search results from Boox.
 */
async function searchDataset(src, query, { rcname = "boox", cwd, ...options } = {}) {
  const resolvedCwd = cwd ? resolve(cwd) : process.cwd();
  const userConfig = await loadRc(rcname, resolvedCwd);
  const { modelOptions, isDeflate = false, offset = 1, length = 10 } = {
    ...options,
    ...userConfig
  };
  const resolvedSrc = relative(process.cwd(), join(resolvedCwd, src));
  const decompressor = isDeflate ? inflate : ungzip;

  console.time("Loading state");
  const compressedState = await readFile(resolvedSrc);
  const decompressedState = decompressor(compressedState, { to: "string" });
  const boox = new Boox({ modelOptions });
  boox.currentState = JSON.parse(decompressedState);
  console.timeEnd("Loading state");
  console.info("State size:", getDataSize(decompressedState));

  // Optional per-project result configuration (e.g. highlighting) from rc file.
  const resultsConfig = await loadRc("boox-results", resolvedCwd);

  console.time("Search in");
  const results = await boox.search(query, resultsConfig);
  console.timeEnd("Search in");
  console.log();

  return Boox.paginateSearchResults(results, +offset, +length);
}

program
  .command("train <source> [destination]")
  .description("Train a Boox dataset")
  .option("-i, --id <field>", "Field to use as document ID", "id")
  .option("-f, --features <fields...>", "Fields to index for search")
  .option("-a, --attributes <fields...>", "Fields to include as-is")
  .option("-d, --deflate", 'Compress the trained data as a ".dat" file', false)
  .option("-c, --cwd <folder>", "Working directory", process.cwd())
  .option("-r, --rcname <name>", "Name of the Boox configuration file", "boox")
  .action(async (src, dest, { deflate: deflateFlag, ...options }) => {
    try {
      await trainDataset(src, dest, { isDeflate: deflateFlag, ...options });
    } catch (error) {
      console.error(error);
      process.exit(1);
    }
  });

program
  .command("search <source> <query>")
  .description("Search a trained Boox dataset")
  .option("-o, --offset <number>", "Offset for pagination", "1")
  .option("-l, --length <number>", "Number of results per page", "10")
  .option(
    "-k, --context <field>",
    "Display the context instead of paginated results object"
  )
  .option(
    "-a, --attrs <fields...>",
    'Fields to display when "--context" is provided'
  )
  .option(
    "-d, --deflate",
    'Assume the trained data is deflated as ".dat" file',
    false
  )
  .option("-c, --cwd <folder>", "Working directory", process.cwd())
  .option("-r, --rcname <name>", "Name of the Boox configuration file", "boox")
  .action(async (src, query, { context, attrs = [], deflate: deflateFlag, ...options }) => {
    try {
      const paginateResults = await searchDataset(src, query, {
        isDeflate: deflateFlag,
        ...options
      });

      if (typeof context === "string") {
        const { currentPage, totalPages, totalResults, results } = paginateResults;
        const summary = `Page ${currentPage} of ${totalPages}, Showing ${results.length} of ${totalResults} results`;
        console.log(`\n${summary}\n`);
        console.log("=".repeat(30), "\n");

        // "--context field::maxlength" is loop-invariant; parse it once.
        const [field, maxlength = 160] = context.split("::");
        for (const result of results) {
          const { keywords, text } = result.context(field, +maxlength);
          // Fix: skip attributes the result does not carry — the previous
          // .filter(Boolean) ran AFTER formatting, so "attr: undefined"
          // strings were truthy and never filtered out.
          const meta = attrs
            .filter((attr) => result.attributes[attr] !== undefined)
            .map((attr) => attr + ": " + result.attributes[attr]);
          console.log(...meta, Array.from(keywords), "\n");
          console.log(`${text}...`, "\n");
          console.log("=".repeat(30), "\n");
        }

        console.log(`${summary}\n`);
      } else {
        console.log(paginateResults);
      }
    } catch (error) {
      console.error(error);
      process.exit(1);
    }
  });

program.parse();