textract
Version:
Extracts text from files of various types, including html, pdf, doc, docx, xls, xlsx, csv, pptx, png, jpg, gif, rtf, text/*, and various OpenOffice formats.
95 lines (83 loc) • 2.51 kB
JavaScript
var xpath = require( 'xpath' )
, Dom = require( 'xmldom' ).DOMParser
, yauzl = require( 'yauzl' )
, util = require( '../util' )
, slideMatch = /^ppt\/slides\/slide/
, noteMatch = /^ppt\/notesSlides\/notesSlide/
;
/**
 * Comparator for `Array.prototype.sort` that orders slide records by their
 * `slide` property, ascending.
 *
 * @param {{slide: number}} a - first record
 * @param {{slide: number}} b - second record
 * @returns {number} -1 when `a` sorts first, 1 when `b` sorts first, 0 when
 *   neither `<` nor `>` holds (equal values, or a NaN on either side)
 */
function _compareSlides( a, b ) {
  var left = a.slide
    , right = b.slide
    ;

  return left < right ? -1 : ( left > right ? 1 : 0 );
}
/**
 * Turn slide XML into plain text, one output line per paragraph.
 *
 * The input is parsed as XML and every element whose local name is `p` is
 * treated as a paragraph. Each paragraph is serialised and re-parsed as its
 * own document so the `//` query below only sees that paragraph's nodes.
 *
 * @param {string} slideText - XML markup of one or more slides
 * @returns {string} the text of every paragraph, each followed by '\n'
 */
function _calculateExtractedText( slideText ) {
  var slideDoc = new Dom().parseFromString( slideText )
    , paragraphs = xpath.select( "//*[local-name()='p']", slideDoc )
    ;

  return paragraphs.reduce( function( extracted, paragraph ) {
    var paragraphDoc = new Dom().parseFromString( paragraph.toString() )
      , nodes = xpath.select(
          "//*[local-name()='t' or local-name()='tab' or local-name()='br']",
          paragraphDoc )
      , line
      ;

    line = nodes.reduce( function( soFar, node ) {
      // Only the first child of a `t` element contributes text. `tab` and
      // `br` elements are matched by the query but add nothing to the line.
      if ( node.localName === 't' && node.childNodes.length > 0 ) {
        return soFar + node.childNodes[0].data;
      }
      return soFar;
    }, '' );

    return extracted + line + '\n';
  }, '' );
}
/**
 * Extract the text of a PowerPoint OOXML file (slides and notes slides).
 *
 * The archive is opened with yauzl; every entry whose name matches
 * `slideMatch` or `noteMatch` is read via `util.getTextFromZipFile`. Once the
 * archive has been fully enumerated AND every pending read has completed, the
 * collected XML is ordered (slides by number, then notes slides by number) and
 * converted to plain text with `_calculateExtractedText`.
 *
 * @param {string} filePath - path of the file to read
 * @param {Object} options - extraction options (not used by this extractor)
 * @param {function(Error=, string=)} cb - node-style callback; invoked exactly
 *   once with either an error or the extracted text
 */
function extractText( filePath, options, cb ) {
  var slides = []
    , notes = []
    , pending = 0
    , ended = false
    , finished = false
    ;

  // Guard so the caller's callback can never fire twice (e.g. an 'error'
  // event followed by 'end', or two failing entry reads).
  function finish( err, text ) {
    if ( finished ) {
      return;
    }
    finished = true;
    cb( err, text );
  }

  // Numeric suffix of an entry name such as `.../slide12.xml` -> 12.
  // Names without a numeric suffix sort last.
  function entryNumber( fileName ) {
    var match = /(\d+)\.xml$/.exec( fileName );
    return match ? parseInt( match[1], 10 ) : Infinity;
  }

  // Produce the result only when the zip has been fully listed and no entry
  // read is still in flight; 'end' alone does not imply the reads are done.
  function finishIfComplete() {
    var slidesText, text;

    if ( finished || !ended || pending > 0 ) {
      return;
    }

    if ( !slides.length && !notes.length ) {
      finish(
        new Error( 'Extraction could not find slides in file, are you' +
          ' sure it is the mime type it says it is?' ),
        null );
      return;
    }

    slides.sort( _compareSlides );
    notes.sort( _compareSlides );
    slidesText = slides.concat( notes ).map( function( slide ) {
      return slide.text;
    }).join( '\n' );

    try {
      text = _calculateExtractedText( slidesText );
    } catch ( parseError ) {
      finish( parseError, null );
      return;
    }
    finish( null, text );
  }

  yauzl.open( filePath, function( err, zipfile ) {
    if ( err ) {
      util.yauzlError( err, cb );
      return;
    }

    zipfile.on( 'end', function() {
      ended = true;
      finishIfComplete();
    });

    zipfile.on( 'entry', function( entry ) {
      var isNote = noteMatch.test( entry.fileName );

      if ( !isNote && !slideMatch.test( entry.fileName ) ) {
        return;
      }

      // Assumes util.getTextFromZipFile always invokes its callback once.
      pending += 1;
      util.getTextFromZipFile( zipfile, entry, function( err2, text ) {
        pending -= 1;
        if ( err2 ) {
          // Report the failure instead of silently dropping this slide.
          finish( err2, null );
          return;
        }
        ( isNote ? notes : slides ).push({
          slide: entryNumber( entry.fileName ),
          text: text
        });
        finishIfComplete();
      });
    });

    zipfile.on( 'error', function( err3 ) {
      finish( err3 );
    });
  });
}
// MIME types this extractor is registered for: the OOXML presentation
// and presentation-template types.
var supportedTypes = [
  'application/vnd.openxmlformats-officedocument.presentationml.presentation',
  'application/vnd.openxmlformats-officedocument.presentationml.template'
];

// Public interface of the module: the handled MIME types plus the
// extraction entry point.
module.exports = { types: supportedTypes, extract: extractText };