@nosferatu500/textract
Version:
Extracts text from files of various types, including html, pdf, doc, docx, xls, xlsx, csv, pptx, png, jpg, gif, rtf, text/*, and various OpenOffice formats.
60 lines (53 loc) • 2.04 kB
JavaScript
const { exec, execFile } = require("child_process");
const os = require("os");
const path = require("path");
const util = require("../util");
const htmlExtract = require("./html");
// MIME types exported by this module. Declared here and assigned further down,
// where the value is chosen based on os.platform().
let types;
/**
 * Extracts the text of an RTF file by converting it to HTML with `unrtf`
 * and passing that HTML to the html extractor.
 *
 * @param {string} filePath - Path of the RTF file to read.
 * @param {object} options - Extractor options; used here only to build the
 *   child-process options via `util.createExecOptions("rtf", options)`.
 * @param {Function} cb - Called as `cb(err, null)` when `unrtf` fails.
 *   Otherwise it is handed to `htmlExtract.extractFromText`, which is
 *   responsible for invoking it.
 */
function extractText(filePath, options, cb) {
  const execOptions = util.createExecOptions("rtf", options);

  // Going to output html from unrtf because unrtf does a great job of
  // going to html, but does a poor job of going to text. It leaves sections
  // out, strips apostrophes, leaves nasty quotes in for bullets and more.
  //
  // textract can go from html to text on its own, so let unrtf go to html
  // then extract the text from that.
  //
  // Also do not have to worry about stripping comments from unrtf text
  // output since HTML comments are not included in output. Also, the
  // unrtf --quiet option doesn't work.
  //
  // The path is passed as its own argv entry instead of being spliced into
  // a shell command line: no shell parses it, so spaces, quotes, `$`, `;`,
  // newlines and similar characters in a file name are taken literally.
  const unrtfArgs = ["--html", "--nopict", filePath];

  execFile("unrtf", unrtfArgs, execOptions, (error, stdout /* , stderr */) => {
    if (error) {
      const failure = new Error(
        `unrtf read of file named [[ ${path.basename(filePath)} ]] failed: ${error}`
      );
      cb(failure, null);
      return;
    }

    htmlExtract.extractFromText(stdout.trim(), {}, cb);
  });
}
/**
 * Checks whether the `unrtf` binary used by this extractor can be run.
 *
 * @param {object} options - Not used by this check.
 * @param {Function} cb - Called as `cb(true)` when the extractor is
 *   considered usable, or as `cb(false, message)` when `unrtf` appears
 *   to be missing.
 */
function testForBinary(options, cb) {
  // just non-osx extractor
  if (os.platform() === "darwin") {
    cb(true);
    return;
  }

  // Exit status a POSIX shell reports when the command could not be found.
  // Unlike the shell's error text, it does not depend on the locale.
  const COMMAND_NOT_FOUND_STATUS = 127;

  // unrtf is run against this very file only to see whether the command
  // can be started at all; its output is ignored.
  exec(`unrtf ${__filename}`, (error /* , stdout, stderr */) => {
    const saysNotFound =
      error !== null && Boolean(error.message) && error.message.includes("not found");
    const exitedNotFound = error !== null && error.code === COMMAND_NOT_FOUND_STATUS;

    if (saysNotFound || exitedNotFound) {
      const msg =
        "INFO: 'unrtf' does not appear to be installed, so textract will be unable to extract RTFs.";
      cb(false, msg);
      return;
    }

    // Any other outcome (including unrtf failing on this non-RTF file)
    // still means the binary exists.
    cb(true);
  });
}
// rely on native tools on osx
types = os.platform() === "darwin" ? [] : ["application/rtf", "text/rtf"];
module.exports = {
types,
extract: extractText,
test: testForBinary,
};