UNPKG

textract

Version:

Extracts text from files of various types, including html, pdf, doc, docx, xls, xlsx, csv, pptx, png, jpg, gif, rtf, text/*, and various OpenOffice formats.

93 lines (81 loc) 2.4 kB
var xpath = require( 'xpath' ) , Dom = require( 'xmldom' ).DOMParser , yauzl = require( 'yauzl' ) , util = require( '../util' ) , includeRegex = /.xml$/ , excludeRegex = /^(word\/media\/|word\/_rels\/)/ ; function _calculateExtractedText( inText, preserveLineBreaks ) { var doc = new Dom().parseFromString( inText ) , ps = xpath.select( "//*[local-name()='p']", doc ) , text = '' ; ps.forEach( function( paragraph ) { var ts , localText = '' ; paragraph = new Dom().parseFromString( paragraph.toString() ); ts = xpath.select( "//*[local-name()='t' or local-name()='tab' or local-name()='br']", paragraph ); ts.forEach( function( t ) { if ( t.localName === 't' && t.childNodes.length > 0 ) { localText += t.childNodes[0].data; } else { if ( t.localName === 'tab' ) { localText += ' '; } else if ( t.localName === 'br' ) { if ( preserveLineBreaks !== true ) { localText += ' '; } else { localText += '\n'; } } } }); text += localText + '\n'; }); return text; } function extractText( filePath, options, cb ) { var result = ''; yauzl.open( filePath, function( err, zipfile ) { var processEnd , processedEntries = 0 ; if ( err ) { util.yauzlError( err, cb ); return; } processEnd = function() { var text; if ( zipfile.entryCount === ++processedEntries ) { if ( result.length ) { text = _calculateExtractedText( result, options.preserveLineBreaks ); cb( null, text ); } else { cb( new Error( 'Extraction could not find content in file, are you' + ' sure it is the mime type it says it is?' ), null ); } } }; zipfile.on( 'entry', function( entry ) { if ( includeRegex.test( entry.fileName ) && !excludeRegex.test( entry.fileName ) ) { util.getTextFromZipFile( zipfile, entry, function( err2, text ) { result += text + '\n'; processEnd(); }); } else { processEnd(); } }); zipfile.on( 'error', function( err3 ) { cb( err3 ); }); }); } module.exports = { types: ['application/vnd.openxmlformats-officedocument.wordprocessingml.document'], extract: extractText };