UNPKG

@nosferatu500/textract

Version:

Extracting text from files of various type including html, pdf, doc, docx, xls, xlsx, csv, pptx, png, jpg, gif, rtf, text/*, and various open office.

86 lines (76 loc) 2.94 kB
const xpath = require("xpath"); const Dom = require("@xmldom/xmldom").DOMParser; const yauzl = require("yauzl"); const util = require("../util"); const includeRegex = /.xml$/; const excludeRegex = /^(word\/media\/|word\/_rels\/)/; function _calculateExtractedText(inText, preserveLineBreaks) { // Security workaround for xmldom >= v0.8.4 inText = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>` + `<Properties>${inText}</Properties>`; const doc = new Dom().parseFromString(inText); const ps = xpath.select("//*[local-name()='p']", doc); let text = ""; for (let paragraph of ps) { var ts; let localText = ""; paragraph = new Dom().parseFromString(paragraph.toString()); ts = xpath.select("//*[local-name()='t' or local-name()='tab' or local-name()='br']", paragraph); for (const t of ts) { if (t.localName === "t" && t.childNodes.length > 0) { localText += t.childNodes[0].data; } else if (t.localName === "tab") { localText += " "; } else if (t.localName === "br") { localText += preserveLineBreaks === true ? "\n" : " "; } } text += `${localText}\n`; } return text; } function extractText(filePath, options, cb) { let result = ""; yauzl.open(filePath, function (err, zipfile) { let processEnd; let processedEntries = 0; if (err) { util.yauzlError(err, cb); return; } processEnd = function () { let text; if (zipfile.entryCount === ++processedEntries) { if (result.length > 0) { text = _calculateExtractedText(result, options.preserveLineBreaks); cb(null, text); } else { cb( new Error( "Extraction could not find content in file, are you" + " sure it is the mime type it says it is?" ), null ); } } }; zipfile.on("entry", function (entry) { if (includeRegex.test(entry.fileName) && !excludeRegex.test(entry.fileName)) { util.getTextFromZipFile(zipfile, entry, function (err2, text) { // Security workaround for xmldom >= v0.8.4 result += `${text}\n`.replace('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\r\n', ""); processEnd(); }); } else { processEnd(); } }); zipfile.on("error", function (err3) { cb(err3); }); }); } module.exports = { types: ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"], extract: extractText, };