UNPKG

@summarisation/summarise

Version:

cli for summariser

68 lines (67 loc) 3.42 kB
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.addTikaCommand = exports.tikaAction = void 0;
const summarise_config_1 = require("../summarise.config");
const node_fs_1 = __importDefault(require("node:fs"));
const fileutils_1 = require("@summarisation/fileutils");
const cheerio_1 = __importDefault(require("cheerio"));
const strings_1 = require("@itsmworkbench/utils/dist/src/strings");
// NOTE(review): nothing below reads node_util_1; the require is kept only so the
// module's load-time dependencies are unchanged.
const node_util_1 = require("node:util");

/**
 * Extracts the visible body text from one entry of a Tika JSON document.
 *
 * @param {object} page - One parsed entry; its "X-TIKA:content" field is read as HTML.
 * @returns {string} Text of the HTML `<body>`, or '' when the entry has no content field.
 * @throws {Error} When "X-TIKA:content" is present but is not a string.
 */
function extractPageText(page) {
    const html = page["X-TIKA:content"];
    if (html === undefined) {
        return '';
    }
    if (typeof html !== 'string') {
        throw new Error(`Expected string got ${typeof html}`);
    }
    const $ = cheerio_1.default.load(html);
    return $('body').text();
}

/**
 * Converts the raw text of one Tika JSON file into the list of text files to write.
 *
 * @param {string} content - Raw JSON text of the Tika output file.
 * @param {string} type - 'onePerPage' emits one file per entry; any other value
 *   joins all entries with '\n' into a single file.
 * @param {(index: number) => string} newFilename - Maps an entry index to an output file name.
 * @returns {{file: string, content: string}[]} Outputs whose content is not blank.
 */
function tikaJsonToTextFiles(content, type, newFilename) {
    const parsed = JSON.parse(content);
    // Tolerate a single metadata object as well as an array of them.
    // NOTE(review): the original assumed an array; confirm which shapes the tika step emits.
    const pages = Array.isArray(parsed) ? parsed : [parsed];
    const texts = pages.map(extractPageText);
    // In per-page mode the original entry index is kept in the file name even when
    // earlier entries are blank and get filtered out below.
    const outputs = type === 'onePerPage'
        ? texts.map((text, index) => ({ file: newFilename(index), content: text }))
        : [{ file: newFilename(0), content: texts.join('\n') }];
    return outputs.filter((output) => output.content.trim().length > 0);
}

/**
 * Builds the action for the `tika` command: reads the `.json` files in the tika
 * directory and writes the text extracted from them into the text directory.
 *
 * @param {object} tc - Command context; reads `tc.config.directories.tika`,
 *   `tc.config.directories.text` and `tc.config.transform.type`.
 * @returns {(_: unknown, opts: object) => Promise<void>} Action taking the parsed
 *   options `debug`, `clean`, `onePer` and `dryRun`.
 */
function tikaAction(tc) {
    return async (_, opts) => {
        if (opts.debug) {
            console.log(`tika `, opts);
        }
        const { tika, text } = tc.config.directories;
        await (0, summarise_config_1.abortIfDirectoryDoesNotExist)(tika, `tika directory ${tika} does not exist`);
        // Clean BEFORE creating the directory so the output directory exists when the
        // transform runs; `force` makes a missing directory a no-op. A dry run must
        // not destroy existing output, so cleaning is skipped for it.
        if (opts.clean && !opts.dryRun) {
            await node_fs_1.default.promises.rm(text, { recursive: true, force: true });
        }
        await node_fs_1.default.promises.mkdir(text, { recursive: true });
        const type = `onePer${(0, strings_1.uppercaseFirstLetter)(opts.onePer?.toString() || tc.config.transform.type)}`;
        // Gate on the --debug option; `debug` from node:util is a function and is always truthy.
        if (opts.debug) {
            console.log('type', type);
        }
        // The second argument (a marker supplied by transformFiles) is not needed here.
        const fn = async (content, marker, newFilename) => tikaJsonToTextFiles(content, type, newFilename);
        const config = {
            inputDir: tika,
            outputDir: text,
            fn,
            readFile: (file) => node_fs_1.default.promises.readFile(file, 'utf8'),
            filter: (file) => file.endsWith('.json'),
            newFileNameFn: (0, fileutils_1.changeExtensionAddIndex)('.txt'),
            debug: opts.debug === true,
            dryRun: opts.dryRun === true
        };
        console.log('made text files', await (0, fileutils_1.transformFiles)(config));
    };
}
exports.tikaAction = tikaAction;

/**
 * Describes the `tika` command: its name, help text, options and action.
 *
 * @param {object} tc - Command context; `tc.config.directories` is shown in the
 *   description and `tc.config.transform.type` is the default for `--onePer`.
 * @returns {{cmd: string, description: string, options: object, action: Function}}
 *   Command descriptor whose `action` is `tikaAction(tc)`.
 */
function addTikaCommand(tc) {
    return {
        cmd: 'tika',
        description: `turn tika files to text files ${tc.config.directories.tika} ==> ${tc.config.directories.text}`,
        options: {
            '--clean': { description: 'Delete the output file directory at the start' },
            '--debug': { description: 'Show debug information' },
            '--onePer <type>': {
                description: `One file per page or per file. Legal values 'page' or 'default'. overrides config`,
                default: tc.config.transform.type
            },
            '--dryRun': { description: `Just do a dry run instead of actually making the pipelines` }
        },
        action: tikaAction(tc)
    };
}
exports.addTikaCommand = addTikaCommand;