UNPKG

@nosferatu500/textract

Version:

Extracting text from files of various type including html, pdf, doc, docx, xls, xlsx, csv, pptx, png, jpg, gif, rtf, text/*, and various open office.

43 lines (37 loc) 1.18 kB
const path = require("path"); const xlsx = require("xlsx"); function extractText(filePath, options, cb) { const CSVs = {}; let wb; let result; let error; let sheet; try { wb = xlsx.readFile(filePath); for (const key of Object.keys(wb.Sheets)) { sheet = wb.Sheets[key]; CSVs[key] = xlsx.utils.sheet_to_csv(sheet); } } catch (error_) { error = new Error(`Could not extract ${path.basename(filePath)}, ${error_}`); cb(error, null); return; } result = ""; for (const key of Object.keys(CSVs)) { result += CSVs[key]; } cb(null, result); } module.exports = { types: [ "application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "application/vnd.ms-excel.sheet.binary.macroEnabled.12", "application/vnd.ms-excel.sheet.macroEnabled.12", "application/vnd.oasis.opendocument.spreadsheet", "application/vnd.openxmlformats-officedocument.spreadsheetml.template", "application/vnd.oasis.opendocument.spreadsheet-template", ], extract: extractText, };