@nosferatu500/textract
Version:
Extracts text from files of various types, including html, pdf, doc, docx, xls, xlsx, csv, pptx, png, jpg, gif, rtf, text/*, and various OpenOffice formats.
46 lines (42 loc) • 1.6 kB
JavaScript
const EPub = require("epub2/node");
const htmlExtract = require("./html");
/**
 * Extracts the plain text of every chapter in an EPUB file and passes the
 * concatenated result to a Node-style callback.
 *
 * Chapters are read concurrently, but their text is joined in the order in
 * which they appear in `epub.flow`, so the output is stable regardless of
 * which chapter finishes loading first.
 *
 * @param {string} filePath - Path handed to the `EPub` constructor.
 * @param {object} options - Passed through unchanged to
 *   `htmlExtract.extractFromText`.
 * @param {function(?Error, ?string): void} cb - Called exactly once: with
 *   `(error, null)` on the first failure, or `(null, text)` on success.
 *   A book with no chapters yields an empty string.
 */
function extractText(filePath, options, cb) {
  const epub = new EPub(filePath);
  let isSettled = false;

  // Funnel every outcome through one place so the caller's callback can
  // never fire twice (e.g. when several chapters fail independently).
  function settle(error, text) {
    if (isSettled) {
      return;
    }
    isSettled = true;
    cb(error, text);
  }

  // Route parse failures to the callback; this relies on the parser
  // reporting them through an "error" event (see the epub2 documentation).
  epub.on("error", function (parseError) {
    settle(parseError, null);
  });

  epub.on("end", function () {
    const chapters = epub.flow || [];

    // Nothing to read: report an empty document instead of never calling back.
    if (chapters.length === 0) {
      settle(null, "");
      return;
    }

    // One slot per chapter keeps the output in reading order even though
    // the per-chapter callbacks may complete in any order.
    const chapterTexts = new Array(chapters.length);
    let remaining = chapters.length;

    chapters.forEach(function (chapter, index) {
      epub.getChapterRaw(chapter.id, function (rawChapterError, rawHtml) {
        if (isSettled) {
          return;
        }
        if (rawChapterError) {
          settle(rawChapterError, null);
          return;
        }

        // The raw chapter is HTML; delegate text extraction to the HTML extractor.
        htmlExtract.extractFromText(rawHtml, options, function (htmlExtractError, outText) {
          if (isSettled) {
            return;
          }
          if (htmlExtractError) {
            settle(htmlExtractError, null);
            return;
          }

          chapterTexts[index] = outText;
          remaining -= 1;
          if (remaining === 0) {
            settle(null, chapterTexts.join(""));
          }
        });
      });
    });
  });

  epub.parse();
}
// MIME types this extractor is registered for.
const SUPPORTED_MIME_TYPES = ["application/epub+zip"];

// Public extractor descriptor: the MIME types handled and the extraction entry point.
module.exports = { types: SUPPORTED_MIME_TYPES, extract: extractText };