@nosferatu500/textract
Version:
Extract text from files of various types, including html, pdf, doc, docx, xls, xlsx, csv, pptx, png, jpg, gif, rtf, text/*, and various OpenOffice formats.
114 lines (101 loc) • 4.42 kB
JavaScript
const cheerio = require("cheerio");
const yauzl = require("yauzl");
const util = require("../util");
/**
 * Rewrite the raw text of content.xml so that `text:p` and `text:h` elements
 * become `textractTextNode` elements, and drop self-closing (empty) ones.
 *
 * @param {string} xml - Raw text of content.xml.
 * @returns {string} The rewritten, trimmed markup.
 */
function normalizeContentXml(xml) {
  return (
    xml
      // The first two replacements strip "Archive ..." / "inflating: ..." lines;
      // presumably left over from an unzip-CLI based implementation.
      .replace(/^(.Archive).*/, "")
      .replace("inflating: content.xml", "")
      .replace(/text:p/g, "textractTextNode")
      .replace(/text:h/g, "textractTextNode")
      // remove empty nodes
      .replace(/<textractTextNode\/>/g, "")
      // remove empty nodes that have styles
      .replace(/<textractTextNode[^>]*\/>/g, "")
      .trim()
  );
}

/**
 * Rewrite the raw text of styles.xml so that `style:header` / `style:footer`
 * elements become `textractHeader` / `textractFooter` elements, and drop
 * self-closing (empty) ones.
 *
 * @param {string} xml - Raw text of styles.xml.
 * @returns {string} The rewritten, trimmed markup.
 */
function normalizeStylesXml(xml) {
  return (
    xml
      .replace(/^(.Archive).*/, "")
      // NOTE(review): the literal says "style.xml" while the entry is named
      // "styles.xml"; kept as-is, confirm whether this was a typo.
      .replace("inflating: style.xml", "")
      .replace(/style:header/g, "textractHeader")
      .replace(/style:footer/g, "textractFooter")
      .replace(/<textractHeader\/>/g, "")
      .replace(/<textractFooter\/>/g, "")
      .replace(/<textractHeader[^>]*\/>/g, "")
      .replace(/<textractFooter[^>]*\/>/g, "")
      .trim()
  );
}

/**
 * Collect the text of every element matching `selector`.
 *
 * @param {Function} $ - Query function returned by `cheerio.load`.
 * @param {string} selector - Element name to look up.
 * @returns {string[]} One string per matched element, in match order.
 */
function collectTexts($, selector) {
  const matches = $(selector);
  const texts = [];
  for (let i = 0; i < matches.length; i++) {
    texts.push($(matches[i]).text());
  }
  return texts;
}

/**
 * Extract the text of an OpenDocument archive.
 *
 * Reads `content.xml` and joins the text of its paragraph/heading elements
 * with "\n". When `options.allowHeaderAndFooter` is truthy, `styles.xml` is
 * read as well and the header texts, body texts and footer texts are joined
 * with "\n" (empty sections are skipped). If the archive has no `styles.xml`
 * the body text alone is returned.
 *
 * `cb` is invoked exactly once: with `(null, text)` on success, or with an
 * error when the archive cannot be opened, an entry cannot be read, or
 * `content.xml` is missing.
 *
 * @param {string} filePath - Path of the archive to open.
 * @param {Object} [options] - Extraction options.
 * @param {boolean} [options.allowHeaderAndFooter] - Include header/footer text.
 * @param {Function} cb - Node-style callback `(err, text)`.
 * @returns {void}
 */
function extractText(filePath, options, cb) {
  const includeHeaderAndFooter = Boolean(
    options && options.allowHeaderAndFooter
  );
  let settled = false;

  // Guard so that a late "error"/"end" event can never call `cb` twice.
  function settle(err, text) {
    if (settled) {
      return;
    }
    settled = true;
    cb(err, text);
  }

  yauzl.open(filePath, function (err, zipfile) {
    if (err) {
      util.yauzlError(err, settle);
      return;
    }

    const divider = "\n";
    let headerTexts = [];
    let bodyTexts = [];
    let footerTexts = [];
    let contentParsed = false;
    let stylesParsed = false;
    let pendingReads = 0;
    let allEntriesSeen = false;

    function buildOutput() {
      const body = bodyTexts.join(divider);
      if (!includeHeaderAndFooter) {
        return body;
      }
      return [headerTexts.join(divider), body, footerTexts.join(divider)]
        .filter(function (section) {
          return section.length > 0;
        })
        .join(divider);
    }

    // Decide whether enough is known to report a result or a failure.
    function maybeFinish() {
      if (settled) {
        return;
      }
      if (contentParsed && (!includeHeaderAndFooter || stylesParsed)) {
        settle(null, buildOutput());
        return;
      }
      // Entry reads may complete after "end"; wait until nothing is in flight.
      if (!allEntriesSeen || pendingReads > 0) {
        return;
      }
      if (contentParsed) {
        // Header/footer requested but the archive has no styles.xml.
        settle(null, buildOutput());
        return;
      }
      settle(
        new Error(
          "Extraction could not find content.xml in file, " +
            "are you sure it is the mime type it says it is?"
        ),
        null
      );
    }

    zipfile.on("end", function () {
      allEntriesSeen = true;
      maybeFinish();
    });

    zipfile.on("entry", function (entry) {
      const isContent = entry.fileName === "content.xml";
      // styles.xml is only needed when header/footer text was requested.
      const isStyles =
        includeHeaderAndFooter && entry.fileName === "styles.xml";
      if (!isContent && !isStyles) {
        return;
      }

      pendingReads += 1;
      util.getTextFromZipFile(zipfile, entry, function (err2, text) {
        pendingReads -= 1;
        if (err2) {
          settle(err2, null);
          return;
        }

        if (isContent) {
          const $ = cheerio.load(`<body>${normalizeContentXml(text)}</body>`);
          bodyTexts = collectTexts($, "textractTextNode");
          contentParsed = true;
        } else {
          const $ = cheerio.load(`<body>${normalizeStylesXml(text)}</body>`);
          headerTexts = collectTexts($, "textractHeader");
          footerTexts = collectTexts($, "textractFooter");
          stylesParsed = true;
        }
        maybeFinish();
      });
    });

    zipfile.on("error", function (err3) {
      settle(err3);
    });
  });
}
// OpenDocument MIME types listed for this extractor: text, graphics and
// presentation documents, each with its "-template" variant.
const supportedTypes = [
  "application/vnd.oasis.opendocument.text",
  "application/vnd.oasis.opendocument.text-template",
  "application/vnd.oasis.opendocument.graphics",
  "application/vnd.oasis.opendocument.graphics-template",
  "application/vnd.oasis.opendocument.presentation",
  "application/vnd.oasis.opendocument.presentation-template",
];

// Export the MIME type list together with the extraction function.
module.exports = { types: supportedTypes, extract: extractText };