office-text-extractor-browser

// source/parsers/ppt.ts // The text extracter for MS PowerPoint files. /* eslint-disable @typescript-eslint/no-unsafe-assignment */ import { type Buffer } from 'buffer/' import { unzip } from 'fflate' import { parseStringPromise as xmlToJson } from 'xml2js' import encoding from 'text-encoding' import type { TextExtractionMethod } from '../lib.js' export class PptExtractor implements TextExtractionMethod { decoder = new encoding.TextDecoder() /** * The type(s) of input acceptable to this method. */ mimes = [ 'application/vnd.openxmlformats-officedocument.presentationml.presentation', ] /** * Extract text from a PPT file if possible. * * @param payload The input and its type. * @returns The text extracted from the input. */ apply = async (input: Buffer): Promise<string> => { const files = await unzipBuffer(input) const slides = [] for (const file of files) { const { buffer } = file.content const contents = this.decoder.decode(buffer) const slide = await xmlToJson(contents) const lines = await parseSlideSection(slide) slides.push(lines?.join('\n')) } const formattedText = slides.join('\n---\n') + '\n' return formattedText } } /** * A slide file. */ type SlideFile = { name: string content: Buffer } /** * Unzip a PPT file, and return a list of slides. * * @param buffer The buffer containing the file. * @returns The slide files. */ const unzipBuffer = async (input: Buffer): Promise<SlideFile[]> => { // Convert the buffer to a uint-8 array, and pass it to the unzip function. const zipBuffer = new Uint8Array(input.buffer) const ppt = (await new Promise((resolve, reject) => { unzip(zipBuffer, (error, result) => { if (error) reject(error) else resolve(result) }) })) as any // Filter out the files that don't contain the text on the slides. const files = Object.keys(ppt) .filter((name) => /ppt\/slides\/slide\d*.xml/.test(name)) .map((name) => { return { name, content: ppt[name] } }) return files } /** * Extracts text from a section of the slide, recursively. * * @param slideSection The section of the slide, converted to JSON from XML. * @param collectedText The lines of text parsed from the slide so far. * * @returns The lines of text on the slide. */ const parseSlideSection = async ( slideSection: any, collectedText?: string[], ): Promise<string[] | undefined> => { // Keep track of the text being collected. const beingCollectedText = collectedText ?? [] // Parse the section according to what type it is. if (Array.isArray(slideSection)) { // If it is, loop through the elements of the array. for (const element of slideSection) { // Collect all the pieces of text from the array. if (typeof element === 'string' && element !== '') { beingCollectedText.push(element) } else { // However, if it is an object or another array, call this function // again to parse that. await parseSlideSection(element, beingCollectedText) } } // Finally, return the collected text. return beingCollectedText } // If the section is an object, loop through its properties. if (typeof slideSection === 'object') { for (const property of Object.keys(slideSection)) { // Get the value of the property. const value = slideSection[property] // The `pptx` format stores the actual text inside the `a:t` or `_` // properties, so extract text from those properties. // Check if it is a string or array that contains a string. If it is // either, then collect the text content. if (typeof value === 'string') { if ((property === 'a:t' || property === '_') && value !== '') { beingCollectedText.push(value) } } else if (typeof value[0] === 'string') { if ((property === 'a:t' || property === '_') && value[0] !== '') { beingCollectedText.push(value[0]) } } else { // However, if it is an object or another array, call this function // again to parse that. await parseSlideSection(value, beingCollectedText) } } // Finally, return the collected text. return beingCollectedText } }