UNPKG

@nosferatu500/textract

Version:

Extracts text from files of various types, including html, pdf, doc, docx, xls, xlsx, csv, pptx, png, jpg, gif, rtf, text/*, and various OpenOffice formats.

154 lines (143 loc) 5.15 kB
const { exec } = require("child_process"); const fs = require("fs"); const os = require("os"); const path = require("path"); const outDir = path.join(os.tmpdir(), "textract"); const replacements = [ [/[|\u201C\u201D]|“|â€/g, '"'], // fancy double quotes [/[|\u2018\u2019]|’|‘]/g, "'"], // fancy single quotes/apostrophes [/…/g, "…"], // elipses [/–|—/g, "–"], // long hyphen ]; const rLen = replacements.length; // Up front creation of tmp dir if (!fs.existsSync(outDir)) { fs.mkdirSync(outDir); } // replace nasty quotes with simple ones function replaceBadCharacters(text) { let i; let repl; for (i = 0; i < rLen; i++) { repl = replacements[i]; text = text.replace(repl[0], repl[1]); } return text; } function yauzlError(err, cb) { let msg = err.message; if (msg === "end of central directory record signature not found") { msg = `File not correctly recognized as zip file, ${msg}`; } cb(new Error(msg), null); } function createExecOptions(type, options) { let execOptions = {}; if (options[type] && options[type].exec) { execOptions = options[type].exec; } else if (options.exec) { execOptions = options.exec; } return execOptions; } function unzipCheck(type, cb) { exec("unzip", function (error /* , stdout, stderr */) { if (error) { // eslint-disable-next-line no-console console.error( `textract: 'unzip' does not appear to be installed, ` + `so textract will be unable to extract ${type}.` ); } cb(error === null); }); } function getTextFromZipFile(zipfile, entry, cb) { zipfile.openReadStream(entry, function (err, readStream) { let text = ""; let error = ""; if (err) { cb(err, null); return; } readStream.on("data", function (chunk) { text += chunk; }); readStream.on("end", function () { if (error.length > 0) { cb(error, null); } else { cb(null, text); } }); readStream.on("error", function (_err) { error += _err; }); }); } /** * 1) builds an exec command using provided `genCommand` callback * 2) runs that command against an input file path * resulting in an output file * 3) reads 
that output file in * 4) cleans the output file up * 5) executes a callback with the contents of the file * * @param {string} label Name for the extractor, e.g. `Tesseract` * @param {string} filePath path to file to be extractor * @param {object} options extractor options as provided * via user configuration * @param {object} execOptions execution options passed to * `exec` commmand as provided via user configuration * @param {function} genCommand function used to generate * the command to be executed * @param {string} cb callback that is passed error/text * */ function runExecIntoFile(label, filePath, options, execOptions, genCommand, cb) { // escape the file paths const fileTempOutPath = path.join(outDir, path.basename(filePath, path.extname(filePath))); const escapedFilePath = filePath.replace(/\s/g, "\\ "); const escapedFileTempOutPath = fileTempOutPath.replace(/\s/g, "\\ "); const cmd = genCommand(options, escapedFilePath, escapedFileTempOutPath); exec(cmd, execOptions, function (error /* , stdout, stderr */) { if (error !== null) { error = new Error(`Error extracting [[ ${path.basename(filePath)} ]], exec error: ${error.message}`); cb(error, null); return; } fs.exists(`${fileTempOutPath}.txt`, function (exists) { if (exists) { fs.readFile(`${fileTempOutPath}.txt`, "utf8", function (error2, text) { if (error2) { error2 = new Error( `Error reading${label} output at [[ ${fileTempOutPath} ]], error: ${error2.message}` ); cb(error2, null); } else { fs.unlink(`${fileTempOutPath}.txt`, function (error3) { if (error3) { error3 = new Error( `Error, ${label} , cleaning up temp file [[ ${fileTempOutPath} ]], error: ${error3.message}` ); cb(error3, null); } else { cb(null, text.toString()); } }); } }); } else { error = new Error(`Error reading ${label} output at [[ ${fileTempOutPath} ]], file does not exist`); cb(error, null); } }); }); } module.exports = { createExecOptions, unzipCheck, getTextFromZipFile, yauzlError, runExecIntoFile, replaceBadCharacters, };