@nosferatu500/textract
Version:
Extracting text from files of various type including html, pdf, doc, docx, xls, xlsx, csv, pptx, png, jpg, gif, rtf, text/*, and various open office.
41 lines (36 loc) • 1.25 kB
JavaScript
const { exec } = require("child_process");
const path = require("path");
const extract = require("pdf-text-extract");
function extractText(filePath, options, cb) {
// See https://github.com/dbashford/textract/issues/75 for description of
// what is happening here
const pdftotextOptions = options.pdftotextOptions || { layout: "raw" };
extract(filePath, pdftotextOptions, function (error, pages) {
let fullText;
if (error) {
error = new Error(
`Error extracting PDF text for file at [[ ${path.basename(filePath)} ]], error: ${error.message}`
);
cb(error, null);
return;
}
fullText = pages.join(" ").trim();
cb(null, fullText);
});
}
function testForBinary(options, cb) {
exec("pdftotext -v", function (error, stdout, stderr) {
let msg;
if (stderr && stderr.includes("pdftotext version")) {
cb(true);
} else {
msg = "INFO: 'pdftotext' does not appear to be installed, " + "so textract will be unable to extract PDFs.";
cb(false, msg);
}
});
}
module.exports = {
types: ["application/pdf"],
extract: extractText,
test: testForBinary,
};