UNPKG

@nosferatu500/textract

Version:

Extracts text from files of various types, including html, pdf, doc, docx, xls, xlsx, csv, pptx, png, jpg, gif, rtf, text/*, and various OpenOffice formats.

github.com/nosferatu500/textract

nosferatu500/textract

98 lines (82 loc) • 2.84 kB

JavaScript

/* eslint-disable max-len */ const fs = require("fs"); const cheerio = require("cheerio"); function getTextWithAlt($, $element) { if (!$element) { return ""; } if ($element.is("img")) { return ` ${$element.attr("alt")} `; } if ($element.is("input")) { return $element.attr("value"); } return $element .contents() .map(function (i, domElement) { let returnText; if (domElement.nodeType === 3) { returnText = domElement.data; } else if (domElement.nodeType === 1) { $element = $(domElement); returnText = $element.is("img, input") || $element.find("img[alt], input[value]").length > 0 ? getTextWithAlt($, $element) : $element.text(); } return returnText; }) .get() .join(""); } function extractFromText(data, options, cb) { let $; let text; text = data .toString() .replace( /< *(br|p|div|section|aside|button|header|footer|li|article|blockquote|cite|code|h1|h2|h3|h4|h5|h6|legend|nav)((.*?)>)/g, "<$1$2|||||" ) .replace(/< *\/(td|a|option) *>/g, " </$1>") // spacing some things out so text doesn't get smashed together .replace(/< *(a|td|option)/g, " <$1") // spacing out links .replace(/< *(br|hr) +\/>/g, "|||||<$1\\>") .replace( /<\/ +?(p|div|section|aside|button|header|footer|li|article|blockquote|cite|code|h1|h2|h3|h4|h5|h6|legend|nav)>/g, "|||||</$1>" ); text = `<textractwrapper>${text}<textractwrapper>`; try { $ = cheerio.load(text); $("script").remove(); $("style").remove(); $("noscript").remove(); const $docElement = $("textractwrapper"); text = options.includeAltText ? 
getTextWithAlt($, $docElement) : $docElement.text(); text = text .replace(/\|{5}/g, "\n") .replace(/(\n\u00A0|\u00A0\n|\n | \n)+/g, "\n") .replace(/(\r\u00A0|\u00A0\r|\r | \r)+/g, "\n") .replace(/(\v\u00A0|\u00A0\v|\v | \v)+/g, "\n") .replace(/(\t\u00A0|\u00A0\t|\t | \t)+/g, "\n") .replace(/[\t\n\v\r]+/g, "\n"); } catch (error) { cb(error, null); return; } cb(null, text); } function extractText(filePath, options, cb) { fs.readFile(filePath, function (error, data) { if (error) { cb(error, null); return; } extractFromText(data, options, cb); }); } module.exports = { types: ["text/html", "text/xml", "application/xml", "application/rss+xml", "application/atom+xml"], extract: extractText, extractFromText, };