UNPKG

@nosferatu500/textract

Version:

Extracts text from files of various types, including html, pdf, doc, docx, xls, xlsx, csv, pptx, png, jpg, gif, rtf, text/*, and various OpenOffice formats.

177 lines (158 loc) 5.4 kB
const fs = require("fs"); const os = require("os"); const path = require("path"); const got = require("got"); const mime = require("mime"); const extract = require("./extract"); const tmpDir = os.tmpdir(); function _genRandom() { return Math.floor(Math.random() * 100_000_000_000 + 1); } function _extractWithType(type, filePath, options, cb) { fs.exists(filePath, function (exists) { if (exists) { extract(type, filePath, options, cb); } else { cb(new Error(`File at path [[ ${filePath} ]] does not exist.`), null); } }); } function _returnArgsError(_args) { const args = Array.prototype.slice.call(_args); let callback; for (const parm of args) { if (parm && typeof parm === "function") { callback = parm; } } if (callback) { callback(new Error("Incorrect parameters passed to textract."), null); } else { // eslint-disable-next-line no-console console.error("textract could not find a callback function to execute."); } } function _writeBufferToDisk(buff, cb) { const fullPath = path.join(tmpDir, `textract_file_${_genRandom()}`); fs.open(fullPath, "w", function (err, fd) { if (err) { throw new Error(`error opening temp file: ${err}`); } else { fs.write(fd, buff, 0, buff.length, null, function (err2) { if (err2) { throw new Error(`error writing temp file: ${err2}`); } else { fs.close(fd, function () { cb(fullPath); }); } }); } }); } function fromFileWithMimeAndPath(type, filePath, options, cb) { let called = false; if (typeof type === "string" && typeof filePath === "string") { if (typeof cb === "function" && typeof options === "object") { // (mimeType, filePath, options, callback) _extractWithType(type, filePath, options, cb); called = true; } else if (typeof options === "function" && cb === undefined) { // (mimeType, filePath, callback) _extractWithType(type, filePath, {}, options); called = true; } } if (!called) { _returnArgsError(arguments); } } function fromFileWithPath(filePath, options, cb) { let type; if (typeof filePath === "string" && (typeof options === "function" 
|| typeof cb === "function")) { type = (options && options.typeOverride) || mime.getType(filePath); fromFileWithMimeAndPath(type, filePath, options, cb); } else { _returnArgsError(arguments); } } // eslint-disable-next-line no-unused-vars function fromBufferWithMime(type, bufferContent, options, cb, withPath) { if ( typeof type === "string" && bufferContent && bufferContent instanceof Buffer && (typeof options === "function" || typeof cb === "function") ) { if (typeof options === "function") { cb = options; options = {}; } _writeBufferToDisk(bufferContent, function (newPath) { fromFileWithMimeAndPath(type, newPath, options, function (err, text) { // Remove temporary file regardless of error, ignore error on unlink fs.unlink(newPath, function () {}); if (cb) cb(err, text); }); }); } else { _returnArgsError(arguments); } } function fromBufferWithName(filePath, bufferContent, options, cb) { let type; if (typeof filePath === "string") { type = mime.getType(filePath); fromBufferWithMime(type, bufferContent, options, cb, true); } else { _returnArgsError(arguments); } } function fromUrl(url, options, cb) { let urlNoQueryParams; let extname; let filePath; let fullFilePath; let file; let href; let callbackCalled; // allow url to be either a string or to be a // Node URL Object: https://nodejs.org/api/url.html href = typeof url === "string" ? url : url.href; if (href) { options = options || {}; urlNoQueryParams = href.split("?")[0]; extname = path.extname(urlNoQueryParams); filePath = _genRandom() + extname; fullFilePath = path.join(tmpDir, filePath); file = fs.createWriteStream(fullFilePath); file.on("finish", function () { if (!callbackCalled) { fromFileWithPath(fullFilePath, options, cb); } }); got.stream(url) .on("response", function (response) { // allows for overriding by the developer or automatically // populating based on server response. 
if (!options.typeOverride) { options.typeOverride = response.headers["content-type"].split(/;/)[0]; } }) .on("error", function (error) { const _cb = typeof options === "function" ? options : cb; callbackCalled = true; _cb(error); }) .pipe(file); } else { _returnArgsError(arguments); } } module.exports = { fromFileWithPath, fromFileWithMimeAndPath, fromBufferWithName, fromBufferWithMime, fromUrl, };