pptx-content-extractor
Version:
A Node.js library for extracting slides, notes, and media from PowerPoint (.pptx) files.
193 lines (192 loc) • 8.68 kB
JavaScript
/**
 * Runs a generator function as a promise-returning coroutine: every yielded value is
 * awaited and its settlement is fed back into the generator.
 * An already-defined `this.__awaiter` is reused when one is present.
 * @param thisArg `this` value the generator is invoked with
 * @param _arguments arguments passed to the generator (an empty list when falsy)
 * @param P promise constructor to use; falls back to the global Promise when falsy
 * @param generator generator function to drive
 * @returns a P that resolves with the generator's return value, or rejects with any
 *          error thrown while stepping the generator
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Pass instances of P through untouched; wrap anything else in a P resolved with the value.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with the fulfilled value; an exception raised while resuming rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Throw the rejection reason into the generator so its own try/catch can handle it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Finished: resolve with the return value. Otherwise wait on the yielded value and continue.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// Instantiate the generator (rebinding `generator` to the iterator) and run it to its first yield.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
/**
 * Module interop helper. A module flagged with `__esModule` is returned unchanged;
 * any other value is wrapped so that it is reachable through `.default`.
 * An already-defined `this.__importDefault` is reused when one is present.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
// Flag this CommonJS module with `__esModule`, the marker that interop helpers such as
// __importDefault check before wrapping a module in `{ default }`.
Object.defineProperty(exports, "__esModule", { value: true });
// Public API: the combined extractor plus one extractor per part type.
exports.extractPptx = extractPptx;
exports.extractPptxSlides = extractPptxSlides;
exports.extractPptxMedia = extractPptxMedia;
exports.extractPptxNotes = extractPptxNotes;
const fs_1 = __importDefault(require("fs"));
const jszip_1 = __importDefault(require("jszip"));
const xml2js_1 = require("xml2js");
/**
 * Read a .pptx file and return its parsed slides, media and notes.
 * @param filePath path to the .pptx file on disk
 * @returns Promise resolving to an object with `slides`, `media` and `notes` arrays
 */
function extractPptx(filePath) {
    return __awaiter(this, void 0, void 0, function* () {
        const archiveEntries = yield loadFilesOfFile(filePath);
        const rawParts = extractParts(archiveEntries);
        // The three groups are parsed one after another: slides, then media, then notes.
        const slides = yield parsePart(rawParts.slides, parseSlideContent);
        const media = yield parsePart(rawParts.media, parseMediaContent);
        const notes = yield parsePart(rawParts.notes, parseNotesContent);
        return { slides, media, notes };
    });
}
/**
 * Read a .pptx file and return only its parsed slide entries.
 * @param filePath path to the .pptx file on disk
 * @returns Promise resolving to the entries selected by getSlides, each run through parseSlideContent
 */
function extractPptxSlides(filePath) {
    return __awaiter(this, void 0, void 0, function* () {
        const archiveEntries = yield loadFilesOfFile(filePath);
        const slideEntries = getSlides(archiveEntries);
        const parsedSlides = yield parsePart(slideEntries, parseSlideContent);
        return parsedSlides;
    });
}
/**
 * Read a .pptx file and return only its parsed media entries.
 * @param filePath path to the .pptx file on disk
 * @returns Promise resolving to the entries selected by getMedia, each run through parseMediaContent
 */
function extractPptxMedia(filePath) {
    return __awaiter(this, void 0, void 0, function* () {
        const archiveEntries = yield loadFilesOfFile(filePath);
        const mediaEntries = getMedia(archiveEntries);
        const parsedMedia = yield parsePart(mediaEntries, parseMediaContent);
        return parsedMedia;
    });
}
/**
 * Read a .pptx file and return only its parsed notes entries.
 * @param filePath path to the .pptx file on disk
 * @returns Promise resolving to the entries selected by getNotes, each run through parseNotesContent
 */
function extractPptxNotes(filePath) {
    return __awaiter(this, void 0, void 0, function* () {
        const archiveEntries = yield loadFilesOfFile(filePath);
        const noteEntries = getNotes(archiveEntries);
        const parsedNotes = yield parsePart(noteEntries, parseNotesContent);
        return parsedNotes;
    });
}
/**
 * Read the file at `filePath`, open it with loadPpt and return the archive's `files` map.
 * @param filePath path to the .pptx file on disk
 * @returns Promise resolving to the `files` property of the loaded archive
 */
function loadFilesOfFile(filePath) {
    return __awaiter(this, void 0, void 0, function* () {
        const archive = yield loadPpt(readFileAsBuffer(filePath));
        return archive.files;
    });
}
/**
 * Synchronously read a file from disk.
 * @param filePath path of the file to read
 * @returns the value returned by `fs.readFileSync(filePath)`
 * @throws Error("Failed to read file") if that value is falsy; errors raised by
 *         `readFileSync` itself propagate unchanged
 */
function readFileAsBuffer(filePath) {
    const contents = fs_1.default.readFileSync(filePath);
    if (contents) {
        return contents;
    }
    throw new Error("Failed to read file");
}
/**
 * Open a .pptx archive from its raw bytes using JSZip.
 * @param fileBuffer contents of the .pptx file
 * @returns Promise resolving to the value produced by `JSZip.loadAsync`
 * @throws Error("Failed to load .pptx file") when JSZip cannot load the data; the
 *         underlying failure is logged and attached as the error's `cause`
 */
function loadPpt(fileBuffer) {
    return __awaiter(this, void 0, void 0, function* () {
        try {
            return yield jszip_1.default.loadAsync(fileBuffer);
        }
        catch (e) {
            console.error(e);
            // Keep the original failure reachable for callers. Runtimes without support
            // for the options argument simply ignore it.
            throw new Error("Failed to load .pptx file", { cause: e });
        }
    });
}
/**
 * Apply `parser` to every element of `toParse` and wait for all results.
 * @param toParse array of entries to parse
 * @param parser function called with one entry; may return a value or a promise
 * @returns Promise resolving to the parsed results in the same order as `toParse`;
 *          rejects if any parser call throws or rejects
 */
function parsePart(toParse, parser) {
    return __awaiter(this, void 0, void 0, function* () {
        // Start every parse up front so they all run before the combined wait below.
        const pending = toParse.map((part) => __awaiter(this, void 0, void 0, function* () {
            const parsed = yield parser(part);
            return parsed;
        }));
        const results = yield Promise.all(pending);
        return results;
    });
}
/**
 * Read a notes entry as text.
 * @param note archive entry exposing `name` and `async(type)`
 * @returns Promise resolving to `{ name, content }`, where `content` is the entry's
 *          text exactly as returned by `note.async('string')`
 */
function parseNotesContent(note) {
    return __awaiter(this, void 0, void 0, function* () {
        const rawText = yield note.async('string');
        // TODO
        return { name: note.name, content: rawText };
    });
}
/**
 * Lower-cased file extension -> MIME type for media commonly embedded in .pptx files.
 * Extensions not listed here fall back to `image/<extension>`.
 */
const MEDIA_MIME_TYPES = new Map([
    ['jpg', 'image/jpeg'],
    ['jpeg', 'image/jpeg'],
    ['png', 'image/png'],
    ['gif', 'image/gif'],
    ['bmp', 'image/bmp'],
    ['tif', 'image/tiff'],
    ['tiff', 'image/tiff'],
    ['svg', 'image/svg+xml'],
    ['webp', 'image/webp'],
    ['emf', 'image/emf'],
    ['wmf', 'image/wmf'],
    ['mp4', 'video/mp4'],
    ['mov', 'video/quicktime'],
    ['avi', 'video/x-msvideo'],
    ['wmv', 'video/x-ms-wmv'],
    ['mp3', 'audio/mpeg'],
    ['wav', 'audio/wav'],
    ['m4a', 'audio/mp4'],
    ['wma', 'audio/x-ms-wma'],
]);
/**
 * Read a media entry and encode it as a base64 data URL.
 *
 * The MIME type is looked up from the file extension (case-insensitively). An
 * unlisted extension yields `image/<extension>`, and a name with no extension
 * yields `image/unknown`.
 * @param media archive entry exposing `name` and `async(type)`
 * @returns Promise resolving to `{ name, content }`, where `content` is
 *          `data:<mime>;base64,<data>`
 */
function parseMediaContent(media) {
    return __awaiter(this, void 0, void 0, function* () {
        const binaries = yield media.async('base64');
        const fileName = media.name.split('/').pop() || media.name;
        // Only the text after the last dot counts as the extension; a name without a
        // dot has none (splitting on '.' would hand back the whole file name instead).
        const dotIndex = fileName.lastIndexOf('.');
        const extension = dotIndex === -1 ? '' : fileName.slice(dotIndex + 1);
        const mimeType = MEDIA_MIME_TYPES.get(extension.toLowerCase()) || `image/${extension || 'unknown'}`;
        return {
            name: media.name,
            content: `data:${mimeType};base64,${binaries}`,
        };
    });
}
/**
 * Find every position at which `search` occurs in `parsedSlide`.
 * Overlapping occurrences are reported, because the scan resumes one character
 * after each hit.
 * @param parsedSlide text to scan
 * @param search substring to look for (default 'media/')
 * @returns ascending list of start indexes; empty when there is no match or when
 *          `search` is the empty string
 */
function getMediaIndexesInSlide(parsedSlide, search = 'media/') {
    const indexes = [];
    // An empty needle "matches" everywhere and indexOf clamps to the string length,
    // so the scan below would never reach -1. Treat it as no match.
    if (search === '') {
        return indexes;
    }
    for (let index = parsedSlide.indexOf(search); index !== -1; index = parsedSlide.indexOf(search, index + 1)) {
        indexes.push(index);
    }
    return indexes;
}
/**
 * Cut the media file names out of `parsedSlide` at the given match positions.
 * Each name starts `startOffset` characters after its index and runs up to the next
 * double quote, or to the end of the text when no quote follows.
 * @param parsedSlide text the indexes refer to
 * @param mediaIndex start indexes of the search marker within `parsedSlide`
 * @param startOffset characters to skip past the marker (default 6, the length of 'media/')
 * @returns one extracted name per index, in the same order
 */
function getMediaReferencesInSlide(parsedSlide, mediaIndex, startOffset = 6) {
    return mediaIndex.map((markerIndex) => {
        const startIndex = markerIndex + startOffset;
        const quoteIndex = parsedSlide.indexOf('"', startIndex);
        // indexOf yields -1 when no quote follows; passing that to slice would count
        // from the end and drop the last character of the name.
        const endIndex = quoteIndex === -1 ? parsedSlide.length : quoteIndex;
        return parsedSlide.slice(startIndex, endIndex);
    });
}
/**
 * Follow `keys` step by step from `root`.
 * Returns undefined as soon as an intermediate value is null or undefined.
 */
function readXmlPath(root, keys) {
    let node = root;
    for (const key of keys) {
        if (node === null || node === undefined) {
            return undefined;
        }
        node = node[key];
    }
    return node;
}
/**
 * Parse one slide entry into its text shapes and referenced media names.
 *
 * Only `p:sp` shapes directly under `p:sld > p:cSld > p:spTree` are inspected, and a
 * shape is reported only if it has a `p:cNvPr` attribute set and at least one
 * non-empty paragraph. The runs of a paragraph are joined with a single space.
 * Media names are found by scanning the JSON form of the parsed XML for 'media/'.
 * An entry whose XML parses to nothing (for example an empty file) yields empty
 * `content` and `mediaNames` instead of throwing.
 * @param slide archive entry exposing `name` and `async(type)`
 * @returns Promise resolving to `{ name, content, mediaNames }`, where `content` is a
 *          list of `{ id, type, text }` and `type` is 'unknown' when the shape has no
 *          placeholder type
 */
function parseSlideContent(slide) {
    return __awaiter(this, void 0, void 0, function* () {
        const xml = yield slide.async('string');
        const parsed = yield (0, xml2js_1.parseStringPromise)(xml);
        const parsedStringified = JSON.stringify(parsed);
        const mediaNames = getMediaReferencesInSlide(parsedStringified, getMediaIndexesInSlide(parsedStringified));
        const results = [];
        // The path reader tolerates a null/undefined parse result, which would
        // otherwise throw on the first property access.
        const shapes = readXmlPath(parsed, ['p:sld', 'p:cSld', 0, 'p:spTree', 0, 'p:sp']);
        if (shapes) {
            for (const shape of shapes) {
                const cNvPr = readXmlPath(shape, ['p:nvSpPr', 0, 'p:cNvPr', 0, '$']);
                const phType = readXmlPath(shape, ['p:nvSpPr', 0, 'p:nvPr', 0, 'p:ph', 0, '$', 'type']) || 'unknown';
                const paragraphs = readXmlPath(shape, ['p:txBody', 0, 'a:p']);
                let texts = [];
                if (paragraphs !== null && paragraphs !== undefined) {
                    texts = paragraphs
                        .map((paragraph) => {
                            const runs = paragraph['a:r'];
                            if (runs === null || runs === undefined) {
                                return '';
                            }
                            return runs.map((run) => readXmlPath(run, ['a:t', 0])).join(' ') || '';
                        })
                        // Drop paragraphs that produced no text.
                        .filter((text) => text);
                }
                if (cNvPr && texts.length > 0) {
                    results.push({
                        id: cNvPr.id,
                        type: phType,
                        text: texts,
                    });
                }
            }
        }
        return {
            name: slide.name,
            content: results,
            mediaNames,
        };
    });
}
/**
 * Pull the number captured by the first group of `pattern` out of `fileName`.
 * @param fileName name to inspect
 * @param pattern regular expression whose first capture group holds the digits
 * @returns the captured text parsed as a base-10 integer, or
 *          Number.MAX_SAFE_INTEGER when the pattern does not match
 */
function extractNumberFromName(fileName, pattern) {
    const found = fileName.match(pattern);
    if (!found) {
        return Number.MAX_SAFE_INTEGER;
    }
    return parseInt(found[1], 10);
}
/**
 * Select the archive entries stored under `basePath`, ordered by the number embedded
 * in their names.
 *
 * Folder entries (those whose `dir` flag is set) are skipped: they have no content
 * of their own, so handing them to a parser yields empty or bogus results.
 * Entries whose name does not match `pattern` sort after all numbered ones.
 * @param files map of entry name -> entry object (each exposing `name`, optionally `dir`)
 * @param basePath name prefix an entry must start with
 * @param pattern regular expression whose first capture group holds the ordering number
 * @returns the matching file entries in ascending numeric order
 */
function getPartByBasePathAndPattern(files, basePath, pattern) {
    const partObjects = Object.keys(files)
        .filter((fileName) => fileName.startsWith(basePath))
        .map((fileName) => files[fileName])
        .filter((entry) => !entry.dir);
    partObjects.sort((a, b) => {
        const aNum = extractNumberFromName(a.name, pattern);
        const bNum = extractNumberFromName(b.name, pattern);
        return aNum - bNum;
    });
    return partObjects;
}
/**
 * Select the entries under "ppt/slides/", ordered by slide number.
 * The ordering pattern accepts both `slideN.xml` and `slideN.xml.rels` names.
 * @param files map of entry name -> entry object
 * @returns whatever getPartByBasePathAndPattern returns for that prefix and pattern
 */
function getSlides(files) {
    return getPartByBasePathAndPattern(files, "ppt/slides/", /slide(\d+)\.xml(\.rels)?$/);
}
/**
 * Select the entries under "ppt/media/", ordered by the number in their file name.
 * The ordering pattern recognises names ending in jpg, jpeg, png or gif.
 * @param files map of entry name -> entry object
 * @returns whatever getPartByBasePathAndPattern returns for that prefix and pattern
 */
function getMedia(files) {
    return getPartByBasePathAndPattern(files, "ppt/media/", /(\d+)\.(jpg|jpeg|png|gif)$/);
}
/**
 * Select the entries under "ppt/notesSlides/", ordered by notes-slide number.
 * The ordering pattern accepts both `notesSlideN.xml` and `notesSlideN.xml.rels` names.
 * @param files map of entry name -> entry object
 * @returns whatever getPartByBasePathAndPattern returns for that prefix and pattern
 */
function getNotes(files) {
    return getPartByBasePathAndPattern(files, "ppt/notesSlides/", /notesSlide(\d+)\.xml(\.rels)?$/);
}
/**
 * Split an archive's entries into the three groups the extractor works with.
 * @param files map of entry name -> entry object
 * @returns `{ slides, media, notes }` as selected by getSlides, getMedia and getNotes
 */
function extractParts(files) {
    const slides = getSlides(files);
    const media = getMedia(files);
    const notes = getNotes(files);
    return { slides, media, notes };
}
;