@summarisation/summarise
Version:
cli for summariser
68 lines (67 loc) • 3.42 kB
JavaScript
;
// Module-interop helper: reuse an already-installed `__importDefault` when the
// enclosing `this` carries one, otherwise fall back to the local definition.
// The helper hands back modules flagged with `__esModule` untouched and wraps
// anything else so the original value is reachable under `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.addTikaCommand = exports.tikaAction = void 0;
const summarise_config_1 = require("../summarise.config");
const node_fs_1 = __importDefault(require("node:fs"));
const fileutils_1 = require("@summarisation/fileutils");
const cheerio_1 = __importDefault(require("cheerio"));
const strings_1 = require("@itsmworkbench/utils/dist/src/strings");
const node_util_1 = require("node:util");
/**
 * Builds the action for the `tika` command: every `.json` file in the tika
 * directory is parsed, the HTML held under each entry's `X-TIKA:content` key is
 * reduced to the text of its `<body>`, and the result is handed to
 * `transformFiles` to be written as `.txt` output in the text directory.
 *
 * @param {object} tc Command context. Only `tc.config.directories.tika`,
 *   `tc.config.directories.text` and `tc.config.transform.type` are read here.
 * @returns {(_: unknown, opts: object) => Promise<void>} Async action. `opts`
 *   may carry `debug`, `clean`, `onePer` and `dryRun`.
 * @throws {Error} From the per-file transform when an entry's `X-TIKA:content`
 *   is present but is not a string.
 */
function tikaAction(tc) {
    return async (_, opts) => {
        if (opts.debug)
            console.log(`html `, opts);
        const { tika, text } = tc.config.directories;
        await (0, summarise_config_1.abortIfDirectoryDoesNotExist)(tika, `tika directory ${tika} does not exist`);
        // Clean first, then (re)create: doing it the other way round deletes the
        // directory that was just created. `force` keeps --clean from failing
        // when the output directory has never been created.
        // NOTE(review): this still runs when opts.dryRun is set — confirm that
        // is intended.
        if (opts.clean)
            await node_fs_1.default.promises.rm(text, { recursive: true, force: true });
        await node_fs_1.default.promises.mkdir(text, { recursive: true });
        const type = `onePer${(0, strings_1.uppercaseFirstLetter)(opts.onePer?.toString() || tc.config.transform.type)}`;
        // Gate on the command's own flag. `debug` as exported by node:util is a
        // function, so testing it is always truthy and logged on every run.
        if (opts.debug)
            console.log('type', type);
        // Turns the JSON text of one tika file into the list of
        // `{ file, content }` outputs to write for it.
        const fn = async (content, marker, newFilename) => {
            const contents = JSON.parse(content).map((page) => {
                const html = page["X-TIKA:content"];
                // Entries without content yield an empty string so that page
                // indexes stay aligned with the source entries.
                if (html === undefined)
                    return '';
                if (typeof html !== 'string')
                    throw new Error(`Expected string got ${typeof html}`);
                const $ = cheerio_1.default.load(html);
                return $('body').text();
            });
            // Only the exact value 'onePerPage' produces one output per entry;
            // any other type merges all entries into a single output.
            const result = type === 'onePerPage'
                ? contents.map((pageText, index) => ({ file: newFilename(index), content: pageText }))
                : [{ file: newFilename(0), content: contents.join('\n') }];
            // Drop outputs that would be empty or whitespace-only.
            return result.filter(({ content: body }) => typeof body === 'string' && body.trim().length > 0);
        };
        const config = {
            inputDir: tika,
            outputDir: text,
            fn,
            // Use the promise API so reading a file does not block the event loop.
            readFile: (file) => node_fs_1.default.promises.readFile(file, 'utf8'),
            filter: (file) => file.endsWith('.json'),
            newFileNameFn: (0, fileutils_1.changeExtensionAddIndex)('.txt'),
            debug: opts.debug === true,
            dryRun: opts.dryRun === true
        };
        console.log('made html files', await (0, fileutils_1.transformFiles)(config));
    };
}
exports.tikaAction = tikaAction;
/**
 * Describes the `tika` sub-command: its name, a description that names the
 * configured tika and text directories, its command-line options, and the
 * action produced by `tikaAction(tc)`.
 *
 * @param {object} tc Command context; `tc.config.directories.tika`,
 *   `tc.config.directories.text` and `tc.config.transform.type` are read here.
 * @returns {{cmd: string, description: string, options: object, action: Function}}
 */
function addTikaCommand(tc) {
    const description = `turn tika files to text files ${tc.config.directories.tika} ==> ${tc.config.directories.text}`;
    const options = {};
    options['--clean'] = { description: 'Delete the output file directory at the start' };
    options['--debug'] = { description: 'Show debug information' };
    // The configured transform type is the fallback when --onePer is not supplied.
    options['--onePer <type>'] = {
        description: `One file per page or per file. Legal values 'page' or 'default'. overrides config`,
        default: tc.config.transform.type
    };
    options['--dryRun'] = { description: `Just do a dry run instead of actually making the pipelines` };
    const action = tikaAction(tc);
    return { cmd: 'tika', description, options, action };
}
exports.addTikaCommand = addTikaCommand;