UNPKG

textract

Version:

Extracting text from files of various types, including html, pdf, doc, docx, xls, xlsx, csv, pptx, png, jpg, gif, rtf, text/*, and various OpenOffice formats.

github.com/dbashford/textract

dbashford/textract

37 lines (33 loc) • 917 B

JavaScript

var fs = require( 'fs' ) , iconv = require( 'iconv-lite' ) , jschardet = require( 'jschardet' ) , path = require( 'path' ) ; function extractText( filePath, options, cb ) { fs.readFile( filePath, function( error, data ) { var encoding, decoded, detectedEncoding; if ( error ) { cb( error, null ); return; } try { detectedEncoding = jschardet.detect( data ).encoding; if ( !detectedEncoding ) { error = new Error( 'Could not detect encoding for file named [[ ' + path.basename( filePath ) + ' ]]' ); cb( error, null ); return; } encoding = detectedEncoding.toLowerCase(); decoded = iconv.decode( data, encoding ); } catch ( e ) { cb( e ); return; } cb( null, decoded ); }); } module.exports = { types: [/text\//, 'application/csv', 'application/javascript'], extract: extractText };