UNPKG

@nosferatu500/textract

Version:

Extracting text from files of various type including html, pdf, doc, docx, xls, xlsx, csv, pptx, png, jpg, gif, rtf, text/*, and various open office.

57 lines (48 loc) 1.56 kB
const { spawn } = require("child_process"); const { exec } = require("child_process"); const os = require("os"); const path = require("path"); let types; // textutil -convert txt -stdout foo.doc function extractText(filePath, options, cb) { let result = ""; let error = null; const textutil = spawn("textutil", ["-convert", "txt", "-stdout", filePath]); textutil.stdout.on("data", function (buffer) { result += buffer.toString(); }); textutil.stderr.on("error", function (buffer) { if (!error) { error = ""; } error += buffer.toString(); }); textutil.on("close", function (/* code */) { if (error) { error = new Error(`textutil read of file named [[ ${path.basename(filePath)} ]] failed: ${error}`); cb(error, null); return; } cb(null, result.trim()); }); } function testForBinary(options, cb) { // just osx extractor, so don't bother checking on osx if (os.platform() !== "darwin") { cb(true); return; } exec(`textutil ${__filename}`, function (error /* , stdout, stderr */) { let msg; if (error !== null) { msg = "INFO: 'textutil' does not appear to be installed, " + "so textract will be unable to extract DOCs."; } cb(error === null, msg); }); } types = os.platform() === "darwin" ? ["application/msword", "application/rtf", "text/rtf"] : []; module.exports = { types, extract: extractText, test: testForBinary, };