@nosferatu500/textract
Version:
Extracts text from files of various types, including html, pdf, doc, docx, xls, xlsx, csv, pptx, png, jpg, gif, rtf, text/*, and various OpenOffice formats.
154 lines (143 loc) • 5.15 kB
JavaScript
const { exec } = require("child_process");
const fs = require("fs");
const os = require("os");
const path = require("path");
// Scratch directory (inside the OS temp dir) that extractors write their
// intermediate output files into.
const outDir = path.join(os.tmpdir(), "textract");
// Ordered [pattern, substitute] pairs applied by replaceBadCharacters.
// A `|` inside a character class is a literal pipe, not alternation, so it is
// deliberately kept out of the classes below; otherwise every pipe in the
// extracted text would be rewritten as a quote.
const replacements = [
  [/[\u201C\u201D]|“|â€/g, '"'], // fancy double quotes
  [/[\u2018\u2019]|’|‘/g, "'"], // fancy single quotes/apostrophes
  // NOTE(review): as written this maps the ellipsis character to itself;
  // presumably the pattern was meant to be a mis-decoded byte sequence - confirm.
  [/…/g, "…"], // ellipsis
  [/–|—/g, "–"], // long hyphen
];
const rLen = replacements.length;
// Up-front creation of the tmp dir. The directory is created directly and an
// EEXIST failure is tolerated, instead of checking for existence first, so two
// processes starting at the same time cannot race between check and create.
try {
  fs.mkdirSync(outDir);
} catch (err) {
  if (err.code !== "EEXIST") {
    throw err;
  }
}
/**
 * Swaps typographic characters for simple ones by running the text through
 * the first `rLen` entries of the module-level `replacements` table, in order.
 *
 * @param {string} text text to clean up
 * @returns {string} the text after every replacement has been applied
 */
function replaceBadCharacters(text) {
  return replacements.slice(0, rLen).reduce(function (cleaned, pair) {
    return cleaned.replace(pair[0], pair[1]);
  }, text);
}
/**
 * Reports a zip-reading error through `cb` as a fresh Error. When the
 * message is the "end of central directory" one, it is prefixed with a hint
 * that the file was not recognized as a zip file.
 *
 * @param {Error} err error whose `message` is inspected
 * @param {function} cb callback invoked as `cb(error, null)`
 */
function yauzlError(err, cb) {
  const notAZipMessage = "end of central directory record signature not found";
  const original = err.message;
  const message =
    original === notAZipMessage ? `File not correctly recognized as zip file, ${original}` : original;
  cb(new Error(message), null);
}
/**
 * Picks the `exec` options to use for an extractor type.
 * Preference order: `options[type].exec`, then `options.exec`, then a new
 * empty object.
 *
 * @param {string} type key under which per-extractor options are stored
 * @param {object} options user configuration
 * @returns {object} the selected exec options
 */
function createExecOptions(type, options) {
  const perType = options[type];
  if (perType && perType.exec) {
    return perType.exec;
  }
  if (options.exec) {
    return options.exec;
  }
  return {};
}
/**
 * Runs `unzip` to find out whether it can be executed, logging a warning to
 * stderr when the run fails.
 *
 * @param {string} type name of the file type mentioned in the warning
 * @param {function} cb called with `true` when the run produced no error,
 *   `false` otherwise
 */
function unzipCheck(type, cb) {
  const onFinished = (error /* , stdout, stderr */) => {
    if (error) {
      // eslint-disable-next-line no-console
      console.error(
        `textract: 'unzip' does not appear to be installed, so textract will be unable to extract ${type}.`
      );
    }
    cb(error === null);
  };
  exec("unzip", onFinished);
}
/**
 * Reads one entry of an opened zip file and hands its contents to `cb` as a
 * UTF-8 string.
 *
 * The callback is invoked exactly once: with `(error, null)` if the entry
 * cannot be opened or the stream reports an error, otherwise with
 * `(null, text)` once the stream ends.
 *
 * @param {object} zipfile object exposing `openReadStream(entry, callback)`
 * @param {object} entry entry handed through to `openReadStream`
 * @param {function} cb callback that is passed error/text
 */
function getTextFromZipFile(zipfile, entry, cb) {
  zipfile.openReadStream(entry, function (err, readStream) {
    if (err) {
      cb(err, null);
      return;
    }

    const chunks = [];
    let settled = false;

    // Guarantees a single callback even if both "error" and "end" fire.
    function settle(error, text) {
      if (settled) {
        return;
      }
      settled = true;
      cb(error, text);
    }

    readStream.on("data", function (chunk) {
      // Keep raw bytes: decoding chunk by chunk would corrupt a multi-byte
      // character that straddles a chunk boundary.
      chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(String(chunk)));
    });
    readStream.on("end", function () {
      settle(null, Buffer.concat(chunks).toString("utf8"));
    });
    readStream.on("error", function (streamError) {
      // Report right away; a stream that errored may never emit "end".
      settle(streamError instanceof Error ? streamError : new Error(String(streamError)), null);
    });
  });
}
/**
 * 1) builds an exec command using the provided `genCommand` callback
 * 2) runs that command against an input file path,
 *    resulting in an output file (`<temp dir>/<input basename>.txt`)
 * 3) reads that output file in
 * 4) cleans the output file up
 * 5) executes a callback with the contents of the file
 *
 * @param {string} label Name for the extractor, e.g. `Tesseract`
 * @param {string} filePath path to the file to be extracted
 * @param {object} options extractor options as provided
 *   via user configuration
 * @param {object} execOptions execution options passed to the
 *   `exec` command as provided via user configuration
 * @param {function} genCommand function used to generate
 *   the command to be executed; called as
 *   `genCommand(options, escapedInputPath, escapedOutputPathWithoutExtension)`
 * @param {function} cb callback that is passed error/text
 *
 */
function runExecIntoFile(label, filePath, options, execOptions, genCommand, cb) {
  const fileTempOutPath = path.join(outDir, path.basename(filePath, path.extname(filePath)));
  const outputFile = `${fileTempOutPath}.txt`;

  // escape the file paths
  // NOTE(review): only whitespace is escaped (each whitespace character
  // becomes backslash + space); any other shell metacharacter in the path is
  // passed to the command unchanged.
  const escapedFilePath = filePath.replace(/\s/g, "\\ ");
  const escapedFileTempOutPath = fileTempOutPath.replace(/\s/g, "\\ ");
  const cmd = genCommand(options, escapedFilePath, escapedFileTempOutPath);

  exec(cmd, execOptions, function (execError /* , stdout, stderr */) {
    if (execError !== null) {
      cb(new Error(`Error extracting [[ ${path.basename(filePath)} ]], exec error: ${execError.message}`), null);
      return;
    }

    // Read directly and interpret ENOENT, rather than probing with the
    // deprecated fs.exists first (which leaves a gap between check and read).
    fs.readFile(outputFile, "utf8", function (readError, text) {
      if (readError) {
        const reason = readError.code === "ENOENT" ? "file does not exist" : `error: ${readError.message}`;
        cb(new Error(`Error reading ${label} output at [[ ${fileTempOutPath} ]], ${reason}`), null);
        return;
      }

      fs.unlink(outputFile, function (unlinkError) {
        if (unlinkError) {
          cb(
            new Error(
              `Error, ${label} , cleaning up temp file [[ ${fileTempOutPath} ]], error: ${unlinkError.message}`
            ),
            null
          );
          return;
        }
        cb(null, text);
      });
    });
  });
}
// Public API of this module: the helper functions defined above, exported as
// a single CommonJS object.
module.exports = {
createExecOptions,
unzipCheck,
getTextFromZipFile,
yauzlError,
runExecIntoFile,
replaceBadCharacters,
};