UNPKG

office-text-extractor-browser

Version:

Fork of office-text-extractor with unreleased changes that include browser support

118 lines (117 loc) 4.53 kB
// source/parsers/ppt.ts // The text extracter for MS PowerPoint files. import { unzip } from 'fflate'; import { parseStringPromise as xmlToJson } from 'xml2js'; import encoding from 'text-encoding'; export class PptExtractor { constructor() { this.decoder = new encoding.TextDecoder(); /** * The type(s) of input acceptable to this method. */ this.mimes = [ 'application/vnd.openxmlformats-officedocument.presentationml.presentation', ]; /** * Extract text from a PPT file if possible. * * @param payload The input and its type. * @returns The text extracted from the input. */ this.apply = async (input) => { const files = await unzipBuffer(input); const slides = []; for (const file of files) { const { buffer } = file.content; const contents = this.decoder.decode(buffer); const slide = await xmlToJson(contents); const lines = await parseSlideSection(slide); slides.push(lines?.join('\n')); } const formattedText = slides.join('\n---\n') + '\n'; return formattedText; }; } } /** * Unzip a PPT file, and return a list of slides. * * @param buffer The buffer containing the file. * @returns The slide files. */ const unzipBuffer = async (input) => { // Convert the buffer to a uint-8 array, and pass it to the unzip function. const zipBuffer = new Uint8Array(input.buffer); const ppt = (await new Promise((resolve, reject) => { unzip(zipBuffer, (error, result) => { if (error) reject(error); else resolve(result); }); })); // Filter out the files that don't contain the text on the slides. const files = Object.keys(ppt) .filter((name) => /ppt\/slides\/slide\d*.xml/.test(name)) .map((name) => { return { name, content: ppt[name] }; }); return files; }; /** * Extracts text from a section of the slide, recursively. * * @param slideSection The section of the slide, converted to JSON from XML. * @param collectedText The lines of text parsed from the slide so far. * * @returns The lines of text on the slide. 
/**
 * Extracts text from a section of the slide, recursively.
 *
 * A string counts as slide text when it is stored under an `a:t` or `_`
 * property, either directly or as an element of an array stored there.
 * Strings inside an array that is passed in directly are collected as well.
 * Empty strings are never collected, and `null`, `undefined` and other
 * non-object values are skipped.
 *
 * @param slideSection The section of the slide, converted to JSON from XML.
 * @param collectedText The lines of text parsed from the slide so far. When
 * given, new lines are appended to this same array.
 *
 * @returns The lines of text on the slide. This is always an array: the one
 * passed as `collectedText`, or a new one.
 */
const parseSlideSection = async (slideSection, collectedText) => {
  // Keep track of the text being collected.
  const beingCollectedText = collectedText ?? [];

  // The `pptx` format stores the actual text inside the `a:t` or `_`
  // properties.
  const holdsText = (property) => property === 'a:t' || property === '_';

  // Walk one section. `ownerHoldsText` tells whether a bare string found at
  // this level belongs to a text property. The walk is synchronous: nothing
  // in it needs to be awaited.
  const collect = (section, ownerHoldsText) => {
    if (typeof section === 'string') {
      if (ownerHoldsText && section !== '') {
        beingCollectedText.push(section);
      }
      return;
    }

    // `typeof null` is 'object', so rule it out explicitly along with every
    // other value that has no properties to look through.
    if (section === null || typeof section !== 'object') {
      return;
    }

    if (Array.isArray(section)) {
      // Visit every element, not just the first one: an array may hold
      // several strings, or strings mixed with nested objects.
      for (const element of section) {
        collect(element, ownerHoldsText);
      }
      return;
    }

    // For an object, each property name decides whether the strings stored
    // under it are text.
    for (const property of Object.keys(section)) {
      collect(section[property], holdsText(property));
    }
  };

  // Strings in an array passed in directly have no owning property; they are
  // treated as text. A bare string passed in directly is not collected.
  collect(slideSection, Array.isArray(slideSection));

  // Finally, return the collected text.
  return beingCollectedText;
};