UNPKG

@nosferatu500/textract

Version:

Extracts text from files of various types, including html, pdf, doc, docx, xls, xlsx, csv, pptx, png, jpg, gif, rtf, text/*, and various OpenOffice formats.

46 lines (42 loc) 1.6 kB
const EPub = require("epub2/node"); const htmlExtract = require("./html"); function extractText(filePath, options, cb) { const epub = new EPub(filePath); let allText = ""; let hasError = false; let chapterCount = 0; epub.on("end", function () { // Iterate over each chapter... for (const chapter of epub.flow) { // if already error, don't do anything if (!hasError) { // Get the chapter text epub.getChapterRaw(chapter.id, function (rawChaperError, text) { if (rawChaperError) { hasError = true; cb(rawChaperError, null); } else { // Extract the raw text from the chapter text (it's html) htmlExtract.extractFromText(text, options, function (htmlExtractError, outText) { if (htmlExtractError) { hasError = true; cb(htmlExtractError, null); } else { allText += outText; chapterCount++; if (chapterCount === epub.flow.length) { cb(null, allText); } } }); } }); } } }); epub.parse(); } module.exports = { types: ["application/epub+zip"], extract: extractText, };