textract
Version:
Extract text from files of various types, including html, pdf, doc, docx, xls, xlsx, csv, pptx, png, jpg, gif, rtf, text/*, and various OpenOffice formats.
164 lines (145 loc) • 4.79 kB
JavaScript
var fs = require( 'fs' )
, path = require( 'path' )
, mime = require( 'mime' )
, extract = require( './extract' )
, os = require( 'os' )
, got = require( 'got' )
, tmpDir = os.tmpdir()
;
/**
 * Produce a pseudo-random positive integer used to build temp file names.
 *
 * Based on Math.random(), so the value is not cryptographically secure and
 * uniqueness is probabilistic only.
 *
 * @returns {number} An integer between 1 and 100000000000 inclusive.
 */
function _genRandom() {
  var scaled = Math.random() * 100000000000;
  return Math.floor( scaled + 1 );
}
/**
 * Run the extractor for `type` against `filePath` after confirming that the
 * file is reachable.
 *
 * @param {string} type - MIME type handed to `extract`.
 * @param {string} filePath - Path of the file to extract text from.
 * @param {Object} options - Passed through to `extract` untouched.
 * @param {Function} cb - Node-style callback. When the file cannot be
 *   accessed it receives `( Error, null )`; otherwise it is handed to
 *   `extract`, which decides what it is called with.
 */
function _extractWithType( type, filePath, options, cb ) {
  function reportMissing() {
    cb( new Error( 'File at path [[ ' + filePath + ' ]] does not exist.' ), null );
  }

  try {
    // fs.exists is deprecated; fs.access with its default mode performs the
    // same visibility check.
    fs.access( filePath, function( accessErr ) {
      if ( accessErr ) {
        reportMissing();
        return;
      }
      extract( type, filePath, options, cb );
    });
  } catch ( invalidPathErr ) {
    // fs.access validates its path argument synchronously (for example a
    // path containing a null byte). fs.exists reported such paths as
    // "does not exist", so keep doing that instead of throwing.
    reportMissing();
  }
}
/**
 * Report a bad-arguments error to whichever callback the caller supplied.
 *
 * The argument list is scanned for functions and the last one found is
 * treated as the callback. It is invoked with `( Error, null )`. When the
 * list contains no function at all, a message is written to the console
 * instead.
 *
 * @param {Arguments|Array} _args - The argument list of the public function
 *   that rejected its input.
 */
function _returnArgsError( _args ) {
  var functions = Array.prototype.slice.call( _args ).filter( function( candidate ) {
    return typeof candidate === 'function';
  });
  var handler = functions.pop();

  if ( !handler ) {
    // eslint-disable-next-line no-console
    console.error( 'textract could not find a callback function to execute.' );
    return;
  }

  handler( new Error( 'Incorrect parameters passed to textract.' ), null );
}
/**
 * Persist a buffer to a randomly named file inside the temp directory.
 *
 * fs.writeFile is used instead of a manual open/write/close sequence: it
 * writes the whole buffer and releases the file descriptor itself, including
 * when the write fails.
 *
 * Failures are delivered to `cb` rather than thrown. An exception thrown from
 * inside an fs callback cannot be caught by the caller.
 *
 * @param {Buffer} buff - Content to write.
 * @param {Function} cb - Called as `cb( fullPath )` on success. On failure it
 *   is called as `cb( null, error )`; the path stays in first position so
 *   single-argument callbacks keep working and simply receive `null`.
 */
function _writeBufferToDisk( buff, cb ) {
  var fullPath = path.join( tmpDir, 'textract_file_' + _genRandom() );

  fs.writeFile( fullPath, buff, function( err ) {
    if ( err ) {
      cb( null, new Error( 'error writing temp file: ' + err ) );
      return;
    }
    cb( fullPath );
  });
}
/**
 * Extract text from a file on disk using an explicitly supplied MIME type.
 *
 * Accepted call shapes:
 *   ( mimeType, filePath, options, callback )
 *   ( mimeType, filePath, callback )
 * Any other combination is reported through `_returnArgsError`.
 *
 * @param {string} type - MIME type that selects the extractor.
 * @param {string} filePath - Path of the file to read.
 * @param {Object|Function} options - Extraction options, or the callback in
 *   the three-argument form.
 * @param {Function} [cb] - Callback for the four-argument form.
 */
function fromFileWithMimeAndPath( type, filePath, options, cb ) {
  var stringsOk = typeof type === 'string' && typeof filePath === 'string';

  // (mimeType, filePath, options, callback)
  if ( stringsOk && typeof cb === 'function' && typeof options === 'object' ) {
    _extractWithType( type, filePath, options, cb );
    return;
  }

  // (mimeType, filePath, callback)
  if ( stringsOk && typeof options === 'function' && cb === undefined ) {
    _extractWithType( type, filePath, {}, options );
    return;
  }

  _returnArgsError( arguments );
}
/**
 * Extract text from a file on disk, working out the MIME type automatically.
 *
 * The type is `options.typeOverride` when that is set, otherwise the result
 * of `mime.getType( filePath )`.
 *
 * @param {string} filePath - Path of the file to read.
 * @param {Object|Function} options - Extraction options, or the callback when
 *   called with two arguments.
 * @param {Function} [cb] - Callback for the three-argument form.
 */
function fromFileWithPath( filePath, options, cb ) {
  var hasCallback = typeof options === 'function' || typeof cb === 'function';
  var overrideType;

  if ( typeof filePath !== 'string' || !hasCallback ) {
    _returnArgsError( arguments );
    return;
  }

  overrideType = options && options.typeOverride;
  fromFileWithMimeAndPath(
    overrideType || mime.getType( filePath ),
    filePath,
    options,
    cb
  );
}
/**
 * Extract text from an in-memory buffer whose MIME type is already known.
 *
 * The buffer is written to a temporary file, the file is run through
 * `fromFileWithMimeAndPath`, and the temporary file is removed before the
 * result is handed to the caller. Nothing else visible in this module removes
 * that file, so without the cleanup every call would leave one behind.
 *
 * @param {string} type - MIME type that selects the extractor.
 * @param {Buffer} bufferContent - File content.
 * @param {Object|Function} options - Extraction options, or the callback when
 *   called with three arguments.
 * @param {Function} [cb] - Callback for the four-argument form.
 * @param {*} [withPath] - Accepted for compatibility; not used.
 */
// eslint-disable-next-line no-unused-vars
function fromBufferWithMime( type, bufferContent, options, cb, withPath ) {
  var argsOk = typeof type === 'string' &&
    bufferContent instanceof Buffer &&
    ( typeof options === 'function' || typeof cb === 'function' );

  if ( !argsOk ) {
    _returnArgsError( arguments );
    return;
  }

  _writeBufferToDisk( bufferContent, function( newPath, writeErr ) {
    var userCb;

    // Wrap a caller-supplied callback so the temp file is deleted first.
    // Unlink errors are ignored on purpose: cleanup is best-effort and the
    // file may already be gone.
    function withCleanup( fn ) {
      return function() {
        var results = arguments;
        fs.unlink( newPath, function() {
          fn.apply( null, results );
        });
      };
    }

    // The write helper's callback may carry an error as a second argument;
    // surface it (or the absence of a usable path) instead of attempting
    // extraction on a file that is not there.
    if ( writeErr || typeof newPath !== 'string' ) {
      userCb = ( typeof cb === 'function' ) ? cb : options;
      userCb( writeErr || new Error( 'error writing temp file' ), null );
      return;
    }

    // Keep each argument in its original position so the callee sees the
    // same call shape the caller used.
    fromFileWithMimeAndPath(
      type,
      newPath,
      ( typeof options === 'function' ) ? withCleanup( options ) : options,
      ( typeof cb === 'function' ) ? withCleanup( cb ) : cb
    );
  });
}
/**
 * Extract text from an in-memory buffer, deriving the MIME type from a file
 * name via `mime.getType`.
 *
 * @param {string} filePath - File name (or path) used only for the MIME
 *   lookup.
 * @param {Buffer} bufferContent - File content.
 * @param {Object|Function} options - Extraction options, or the callback when
 *   called with three arguments.
 * @param {Function} [cb] - Callback for the four-argument form.
 */
function fromBufferWithName( filePath, bufferContent, options, cb ) {
  if ( typeof filePath !== 'string' ) {
    _returnArgsError( arguments );
    return;
  }

  fromBufferWithMime( mime.getType( filePath ), bufferContent, options, cb, true );
}
/**
 * Download a URL to a temporary file and extract its text.
 *
 * Accepted call shapes:
 *   ( url, callback )
 *   ( url, options, callback )
 * Anything else is reported through `_returnArgsError` before any request is
 * made.
 *
 * The MIME type handed to extraction is `options.typeOverride` when the
 * caller set it, otherwise the response's `content-type` header (parameters
 * such as `; charset=...` removed). When the server sends no such header the
 * type is left unset, so `fromFileWithPath` falls back to the extension
 * taken from the URL.
 *
 * The caller's `options` object is copied, never modified, so reusing one
 * options object across calls cannot leak a detected type from one URL to
 * the next. The temporary file is removed before the callback runs, on both
 * success and failure.
 *
 * @param {string|Object} url - URL string, or an object with an `href`
 *   string such as a Node URL object. Passed unchanged to `got.stream`.
 * @param {Object|Function} [options] - Extraction options, or the callback
 *   in the two-argument form.
 * @param {Function} [cb] - Callback for the three-argument form. Download
 *   and write failures call it with the error as the only argument.
 */
function fromUrl( url, options, cb ) {
  var href = ( typeof url === 'string' ) ? url : ( url && url.href )
    , settled = false
    , opts = {}
    , callback, fullFilePath, file
    ;

  // Remove the temp file (best effort), then hand `results` to the caller.
  function deliver( results ) {
    fs.unlink( fullFilePath, function() {
      callback.apply( null, results );
    });
  }

  // Single exit for download and write failures. A failed pipe never ends
  // the write stream, so it has to be closed explicitly.
  function fail( error ) {
    if ( settled ) {
      return;
    }
    settled = true;
    file.destroy();
    deliver( [ error ] );
  }

  if ( typeof options === 'function' && cb === undefined ) {
    callback = options;
  } else if ( typeof cb === 'function' &&
      ( options === undefined || typeof options === 'object' ) ) {
    callback = cb;
    Object.keys( options || {} ).forEach( function( key ) {
      opts[key] = options[key];
    });
  }

  if ( typeof href !== 'string' || !href || !callback ) {
    _returnArgsError( arguments );
    return;
  }

  // Drop the query string and fragment so only the path's extension is kept.
  fullFilePath = path.join(
    tmpDir,
    _genRandom() + path.extname( href.split( /[?#]/ )[0] )
  );
  file = fs.createWriteStream( fullFilePath );

  file.on( 'error', fail );
  file.on( 'finish', function() {
    if ( settled ) {
      return;
    }
    settled = true;
    fromFileWithPath( fullFilePath, opts, function() {
      deliver( arguments );
    });
  });

  got.stream( url )
    .on( 'response', function( response ) {
      var contentType = response && response.headers &&
        response.headers['content-type'];

      // allows for overriding by the developer or automatically
      // populating based on server response.
      if ( !opts.typeOverride && typeof contentType === 'string' ) {
        opts.typeOverride = contentType.split( /;/ )[0].trim();
      }
    })
    .on( 'error', fail )
    .pipe( file );
}
// Public API of this module: extraction entry points for files on disk,
// in-memory buffers, and remote URLs.
module.exports = {
fromFileWithPath: fromFileWithPath,
fromFileWithMimeAndPath: fromFileWithMimeAndPath,
fromBufferWithName: fromBufferWithName,
fromBufferWithMime: fromBufferWithMime,
fromUrl: fromUrl
};