word-extractor
Version:
Node.js package to read Word .doc files
225 lines (181 loc) • 6.6 kB
JavaScript
/**
* @module ole-compound-doc
*/
// Copyright (c) 2012 Chris Geiersbach
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
// This component was adapted from node-ole-doc, available at:
// https://github.com/atariman486/node-ole-doc.
//
// Modified extensively by Stuart Watt <stuart@morungos.com> to keep the
// principal logic, but replacing callbacks and some weird stream usages
// with promises.
const Header = require('./ole-header');
const AllocationTable = require('./ole-allocation-table');
const DirectoryTree = require('./ole-directory-tree');
const Storage = require('./ole-storage');
/**
 * Implements the main interface used to read from an OLE compound file.
 *
 * All I/O goes through the `reader` given to the constructor, which is
 * called as `reader.read(buffer, bufferOffset, length, filePosition)` and
 * must return a promise. Call `read()` once; after it resolves, `storage()`
 * and `stream()` give access to the document contents.
 */
class OleCompoundDoc {

  /**
   * @param {object} reader - object exposing
   *   `read(buffer, bufferOffset, length, filePosition)` returning a promise.
   */
  constructor(reader) {
    this._reader = reader;
    // Number of bytes preceding the compound document header in the file.
    this._skipBytes = 0;
  }

  /**
   * Reads the header, the allocation tables and the directory tree, in
   * that order, and finally the custom header when `_skipBytes` is non-zero.
   *
   * @returns {Promise<OleCompoundDoc>} resolves to this instance.
   */
  read() {
    return Promise.resolve()
      .then(() => this._readHeader())
      .then(() => this._readMSAT())
      .then(() => this._readSAT())
      .then(() => this._readSSAT())
      .then(() => this._readDirectoryTree())
      .then(() => {
        if (this._skipBytes !== 0) {
          return this._readCustomHeader();
        }
        return undefined;
      })
      .then(() => this);
  }

  /**
   * Reads the first `_skipBytes` bytes of the file and hands them to
   * `_customHeaderCallback` when one has been registered.
   *
   * Nothing in this class assigns `_customHeaderCallback`, so it is treated
   * as optional: a missing callback is skipped instead of failing the whole
   * read with a TypeError. The callback's return value is ignored.
   *
   * @returns {Promise<void>}
   */
  _readCustomHeader() {
    const buffer = Buffer.alloc(this._skipBytes);
    return this._reader.read(buffer, 0, this._skipBytes, 0)
      .then((headerBuffer) => {
        if (typeof this._customHeaderCallback === 'function') {
          this._customHeaderCallback(headerBuffer);
        }
      });
  }

  /**
   * Reads the 512-byte header located at `_skipBytes` and stores the parsed
   * result in `_header`.
   *
   * @returns {Promise<void>} rejects with "Not a valid compound document"
   *   when `Header#load` reports failure.
   */
  _readHeader() {
    const buffer = Buffer.alloc(512);
    return this._reader.read(buffer, 0, 512, this._skipBytes)
      .then((headerBuffer) => {
        const header = new Header();
        this._header = header;
        if (!header.load(headerBuffer)) {
          throw new Error("Not a valid compound document");
        }
      });
  }

  /**
   * Builds `_MSAT`, the list of sector ids handed to the sector allocation
   * table loader.
   *
   * The first entries are copied from `header.partialMSAT`. When the table
   * needs more than 109 entries, the remainder is read from a chain of
   * sectors starting at `header.MSATSecId`; the last 4 bytes of each such
   * sector hold the id of the next sector in the chain.
   *
   * @returns {Promise<void>}
   */
  _readMSAT() {
    // Number of MSAT entries taken from the header before chaining starts.
    const HEADER_MSAT_ENTRIES = 109;
    const header = this._header;
    // Byte offset, within an MSAT sector, of the pointer to the next sector.
    const nextPointerOffset = header.secSize - 4;

    this._MSAT = header.partialMSAT.slice(0);
    this._MSAT.length = header.SATSize;

    if (header.SATSize <= HEADER_MSAT_ENTRIES || header.MSATSize === 0) {
      return Promise.resolve();
    }

    // Sectors are read strictly one after another because each one carries
    // the id of its successor.
    const readOneMSAT = (i, currMSATIndex, secId) => {
      if (i >= header.MSATSize) {
        return Promise.resolve();
      }
      return this._readSector(secId)
        .then((sectorBuffer) => {
          let nextIndex = currMSATIndex;
          for (let s = 0; s < nextPointerOffset && nextIndex < header.SATSize; s += 4) {
            this._MSAT[nextIndex] = sectorBuffer.readInt32LE(s);
            nextIndex++;
          }
          const nextSecId = sectorBuffer.readInt32LE(nextPointerOffset);
          return readOneMSAT(i + 1, nextIndex, nextSecId);
        });
    };

    return readOneMSAT(0, HEADER_MSAT_ENTRIES, header.MSATSecId);
  }

  /**
   * Reads a single sector.
   *
   * @param {number} secId - sector id.
   * @returns {Promise<Buffer>} buffer of `header.secSize` bytes.
   */
  _readSector(secId) {
    return this._readSectors([ secId ]);
  }

  /**
   * Reads the given sectors, in order, into one contiguous buffer.
   *
   * @param {number[]} secIds - sector ids.
   * @returns {Promise<Buffer>} buffer of `secIds.length * header.secSize` bytes.
   */
  _readSectors(secIds) {
    return this._readChunks(
      secIds,
      this._header.secSize,
      (secId) => this._getFileOffsetForSec(secId)
    );
  }

  /**
   * Reads a single short sector.
   *
   * @param {number} secId - short sector id.
   * @returns {Promise<Buffer>} buffer of `header.shortSecSize` bytes.
   */
  _readShortSector(secId) {
    return this._readShortSectors([ secId ]);
  }

  /**
   * Reads the given short sectors, in order, into one contiguous buffer.
   *
   * @param {number[]} secIds - short sector ids.
   * @returns {Promise<Buffer>} buffer of
   *   `secIds.length * header.shortSecSize` bytes.
   */
  _readShortSectors(secIds) {
    return this._readChunks(
      secIds,
      this._header.shortSecSize,
      (secId) => this._getFileOffsetForShortSec(secId)
    );
  }

  /**
   * Shared implementation of `_readSectors` and `_readShortSectors`: reads
   * one fixed-size chunk per id, sequentially, into a single buffer.
   *
   * @param {number[]} secIds - ids to read, in output order.
   * @param {number} chunkSize - bytes to read per id.
   * @param {function(number): number} getFileOffset - maps an id to the
   *   file position of its chunk.
   * @returns {Promise<Buffer>} buffer of `secIds.length * chunkSize` bytes.
   */
  _readChunks(secIds, chunkSize, getFileOffset) {
    const buffer = Buffer.alloc(secIds.length * chunkSize);

    const readOne = (i) => {
      if (i >= secIds.length) {
        return Promise.resolve(buffer);
      }
      return this._reader
        .read(buffer, i * chunkSize, chunkSize, getFileOffset(secIds[i]))
        .then(() => readOne(i + 1));
    };

    return readOne(0);
  }

  /**
   * Creates `_SAT` and loads it from the sectors listed in `_MSAT`.
   *
   * @returns {*} the result of `AllocationTable#load` (used as a promise
   *   by `read()`).
   */
  _readSAT() {
    this._SAT = new AllocationTable(this);
    return this._SAT.load(this._MSAT);
  }

  /**
   * Creates `_SSAT` and loads it from the sector chain that starts at
   * `header.SSATSecId`.
   *
   * @returns {*} the result of `AllocationTable#load`, or a promise rejected
   *   with "Invalid Short Sector Allocation Table" when the chain length
   *   differs from `header.SSATSize`.
   */
  _readSSAT() {
    const header = this._header;
    const secIds = this._SAT.getSecIdChain(header.SSATSecId);

    if (secIds.length !== header.SSATSize) {
      return Promise.reject(new Error("Invalid Short Sector Allocation Table"));
    }

    this._SSAT = new AllocationTable(this);
    return this._SSAT.load(secIds);
  }

  /**
   * Loads the directory tree from the sector chain starting at
   * `header.dirSecId`, then sets up `_rootStorage` and the sector chain of
   * the root entry (`_shortStreamSecIds`), which is what short sector
   * offsets are resolved against.
   *
   * @returns {Promise<void>}
   */
  _readDirectoryTree() {
    const header = this._header;

    this._directoryTree = new DirectoryTree(this);

    const secIds = this._SAT.getSecIdChain(header.dirSecId);
    return this._directoryTree.load(secIds)
      .then(() => {
        const rootEntry = this._directoryTree.root;
        this._rootStorage = new Storage(this, rootEntry);
        this._shortStreamSecIds = this._SAT.getSecIdChain(rootEntry.secId);
      });
  }

  /**
   * @param {number} secId - sector id.
   * @returns {number} file position of the first byte of that sector.
   */
  _getFileOffsetForSec(secId) {
    const secSize = this._header.secSize;
    return this._skipBytes + (secId + 1) * secSize;  // Skip past the header sector
  }

  /**
   * Translates a short sector id into a file position by locating it inside
   * the sectors listed in `_shortStreamSecIds`.
   *
   * @param {number} shortSecId - short sector id.
   * @returns {number} file position of the first byte of that short sector.
   */
  _getFileOffsetForShortSec(shortSecId) {
    const { secSize, shortSecSize } = this._header;

    // Position of the short sector within the short stream as a whole.
    const shortStreamOffset = shortSecId * shortSecSize;

    // Which sector of the short stream holds it, and where inside that sector.
    const secIdIndex = Math.floor(shortStreamOffset / secSize);
    const secOffset = shortStreamOffset % secSize;
    const secId = this._shortStreamSecIds[secIdIndex];

    return this._getFileOffsetForSec(secId) + secOffset;
  }

  /**
   * Looks up a storage in the root storage. Requires `read()` to have
   * completed, since that is what creates the root storage.
   *
   * @param {string} storageName
   * @returns {*} whatever the root storage's `storage()` returns.
   */
  storage(storageName) {
    return this._rootStorage.storage(storageName);
  }

  /**
   * Looks up a stream in the root storage. Requires `read()` to have
   * completed, since that is what creates the root storage.
   *
   * @param {string} streamName
   * @returns {*} whatever the root storage's `stream()` returns.
   */
  stream(streamName) {
    return this._rootStorage.stream(streamName);
  }

}
// The class itself is the module's export.
module.exports = OleCompoundDoc;