UNPKG

@nosferatu500/textract

Version:

Extracting text from files of various types, including html, pdf, doc, docx, xls, xlsx, csv, pptx, png, jpg, gif, rtf, text/*, and various OpenOffice formats.

114 lines (101 loc) 4.42 kB
const cheerio = require("cheerio"); const yauzl = require("yauzl"); const util = require("../util"); function extractText(filePath, options, cb) { yauzl.open(filePath, function (err, zipfile) { let textOnTheWay = false; let outputContent; let outputStyles; const headerTexts = []; const nodeTexts = []; const footerTexts = []; if (err) { util.yauzlError(err, cb); return; } zipfile.on("end", function () { if (!textOnTheWay) { cb( new Error( "Extraction could not find content.xml in file, " + "are you sure it is the mime type it says it is?" ), null ); } }); zipfile.on("entry", function (entry) { let $; let nodes; let header; let footer; let i; if (entry.fileName === "content.xml" || entry.fileName === "styles.xml") { textOnTheWay = true; util.getTextFromZipFile(zipfile, entry, function (err2, text) { if (entry.fileName === "content.xml") { outputContent = text .replace(/^(.Archive).*/, "") .replace("inflating: content.xml", "") .replace(/text:p/g, "textractTextNode") .replace(/text:h/g, "textractTextNode") // remove empty nodes .replace(/<textractTextNode\/>/g, "") // remove empty nodes that have styles .replace(/<textractTextNode[^>]*\/>/g, "") .trim(); $ = cheerio.load(`<body>${outputContent}</body>`); nodes = $("textractTextNode"); for (i = 0; i < nodes.length; i++) { nodeTexts.push($(nodes[i]).text()); } if (!options.allowHeaderAndFooter) { cb(null, nodeTexts.join("\n")); } } else if (entry.fileName === "styles.xml") { outputStyles = text .replace(/^(.Archive).*/, "") .replace("inflating: style.xml", "") .replace(/style:header/g, "textractHeader") .replace(/style:footer/g, "textractFooter") .replace(/<textractHeader\/>/g, "") .replace(/<textractFooter\/>/g, "") .replace(/<textractHeader[^>]*\/>/g, "") .replace(/<textractFooter[^>]*\/>/g, "") .trim(); $ = cheerio.load(`<body>${outputStyles}</body>`); header = $("textractHeader"); footer = $("textractFooter"); for (i = 0; i < header.length; i++) { headerTexts.push($(header[i]).text()); } for (i = 0; i < 
footer.length; i++) { footerTexts.push($(footer[i]).text()); } } if (outputContent && outputStyles && options.allowHeaderAndFooter) { const divider = "\n"; let t = headerTexts.join(divider); t += divider + nodeTexts.join(divider); t += footerTexts.join(divider); cb(null, t); } }); } }); zipfile.on("error", function (err3) { cb(err3); }); }); } module.exports = { types: [ "application/vnd.oasis.opendocument.text", "application/vnd.oasis.opendocument.text-template", "application/vnd.oasis.opendocument.graphics", "application/vnd.oasis.opendocument.graphics-template", "application/vnd.oasis.opendocument.presentation", "application/vnd.oasis.opendocument.presentation-template", ], extract: extractText, };