office-text-extractor-browser
Version:
Fork of office-text-extractor with unreleased changes that include browser support
118 lines (117 loc) • 4.53 kB
JavaScript
// source/parsers/ppt.ts
// The text extractor for MS PowerPoint files.
import { unzip } from 'fflate';
import { parseStringPromise as xmlToJson } from 'xml2js';
import encoding from 'text-encoding';
/**
 * Text extractor for PowerPoint (`.pptx`) presentations.
 *
 * Members are assigned in the constructor, so `decoder`, `mimes` and `apply`
 * are own properties of every instance and `apply` stays bound to it.
 */
export class PptExtractor {
  constructor() {
    /**
     * Decoder used to turn the raw bytes of each slide part into a string.
     */
    this.decoder = new encoding.TextDecoder();
    /**
     * The type(s) of input acceptable to this extractor.
     */
    this.mimes = [
      'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    ];
    /**
     * Extract text from a PPTX file if possible.
     *
     * @param input The binary contents of the presentation.
     * @returns The text of every slide, one line per text run, with slides
     * separated by a `---` line and a trailing newline at the end.
     */
    this.apply = async (input) => {
      const files = await unzipBuffer(input);
      const slides = [];
      for (const file of files) {
        // Decode the byte view itself rather than `file.content.buffer`: the
        // backing ArrayBuffer may be larger than the view, and decoding all
        // of it would leak unrelated bytes into the XML.
        const contents = this.decoder.decode(file.content);
        const slide = await xmlToJson(contents);
        const lines = await parseSlideSection(slide);
        // A slide without any parsable section contributes an empty entry.
        slides.push((lines ?? []).join('\n'));
      }
      return slides.join('\n---\n') + '\n';
    };
  }
}
/**
 * Unzip a PPTX file, and return a list of its slide parts.
 *
 * @param input The bytes of the file: a typed array / Node `Buffer`, an
 * `ArrayBuffer`, or any object exposing the bytes through a `buffer` property.
 * @returns One `{ name, content }` entry per `ppt/slides/slide<N>.xml` part,
 * ordered by the slide number `N` in the file name.
 */
const unzipBuffer = async (input) => {
  // Build a uint-8 view over exactly the bytes of the input. A typed array or
  // Node `Buffer` can be a window onto a larger (pooled) ArrayBuffer, so its
  // offset and length must be honoured instead of viewing the whole buffer.
  let zipBuffer;
  if (ArrayBuffer.isView(input)) {
    zipBuffer = new Uint8Array(input.buffer, input.byteOffset, input.byteLength);
  } else if (input instanceof ArrayBuffer) {
    zipBuffer = new Uint8Array(input);
  } else {
    // Anything else is expected to carry its bytes in a `buffer` property.
    zipBuffer = new Uint8Array(input.buffer);
  }

  // `unzip` is callback based, so wrap it in a promise.
  const ppt = await new Promise((resolve, reject) => {
    unzip(zipBuffer, (error, result) => {
      if (error) reject(error);
      else resolve(result);
    });
  });

  // Keep only the slide parts. The pattern is anchored and the dot escaped so
  // that look-alike entries are not mistaken for slides.
  const slidePattern = /^ppt\/slides\/slide(\d*)\.xml$/;
  const files = [];
  for (const name of Object.keys(ppt)) {
    const match = slidePattern.exec(name);
    if (match !== null) {
      files.push({ name, content: ppt[name], order: Number(match[1]) });
    }
  }

  // Archive entry order is arbitrary (and a plain name sort would put
  // `slide10` before `slide2`), so order the slides by their number.
  files.sort((a, b) => a.order - b.order);
  return files.map(({ name, content }) => ({ name, content }));
};
/**
 * Extracts text from a section of the slide, recursively.
 *
 * Arrays contribute every non-empty string they contain; objects contribute
 * the string stored under their `a:t` or `_` properties. Everything else is
 * walked recursively. `null`, `undefined` and other primitives are ignored.
 *
 * @param slideSection The section of the slide, converted to JSON from XML.
 * @param collectedText The lines of text parsed from the slide so far. It is
 * appended to in place; a new array is created when it is omitted.
 *
 * @returns The lines of text on the slide (the same array as `collectedText`
 * when one was passed in).
 */
const parseSlideSection = async (slideSection, collectedText) => {
  // Keep track of the text being collected.
  const beingCollectedText = collectedText ?? [];

  if (Array.isArray(slideSection)) {
    for (const element of slideSection) {
      if (typeof element === 'string') {
        // Collect all the non-empty pieces of text from the array.
        if (element !== '') {
          beingCollectedText.push(element);
        }
      } else {
        // An object or another array: parse it recursively.
        await parseSlideSection(element, beingCollectedText);
      }
    }
    return beingCollectedText;
  }

  // `typeof null` is also 'object', so rule it out before reading any keys.
  if (slideSection !== null && typeof slideSection === 'object') {
    for (const [property, value] of Object.entries(slideSection)) {
      // The `pptx` format stores the actual text inside the `a:t` or `_`
      // properties, so only those properties contribute text.
      const isTextProperty = property === 'a:t' || property === '_';
      if (typeof value === 'string') {
        if (isTextProperty && value !== '') {
          beingCollectedText.push(value);
        }
      } else if (typeof value?.[0] === 'string') {
        // An array whose first entry is a string: that entry is the text.
        if (isTextProperty && value[0] !== '') {
          beingCollectedText.push(value[0]);
        }
      } else {
        // An object, another array, or an empty value: parse it recursively
        // (empty values are ignored by the recursive call).
        await parseSlideSection(value, beingCollectedText);
      }
    }
  }

  // Always hand back the accumulator, even when nothing could be parsed.
  return beingCollectedText;
};