// word-extractor
// Version:
// Node.js package to read Word .doc files
// 118 lines (107 loc) • 4.1 kB
// JavaScript
/**
* @module filters
*
* @description
* Exports several functions that implement various methods for translating
* characters into Unicode, and cleaning up some of the remaining residues from
* Word's odd internal marker character usage.
*/
/**
 * Replacement table indexed by character code. Each populated slot holds the
 * string that replaces the Word control character with that code: either
 * NUL ('\x00'), which marks the character for deletion, or another character
 * that is more acceptable in a Unicode world, such as a tab or a newline.
 */
const replaceTable = [];
[
  [0x0002, '\x00'],
  [0x0005, '\x00'],
  [0x0007, "\t"],
  [0x0008, '\x00'],
  [0x000A, "\n"],
  [0x000B, "\n"],
  [0x000C, "\n"],
  [0x000D, "\n"],
  [0x001E, "\u2011"],
].forEach(([code, replacement]) => {
  replaceTable[code] = replacement;
});
/**
 * @constant
 * Sparse lookup table indexed by Windows character code. The populated slots,
 * all between 0x80 and 0x9f, hold the official Unicode code point (as a
 * one-character string) for that code. This smooths over the differences
 * between UCS-2 and 8-bit code runs in Word, so that later stages can work
 * entirely within Unicode.
 *
 * Codes 0x80, 0x81, 0x8d, 0x8f, 0x90 and 0x9d have no entry.
 */
const binaryToUnicodeTable = [];
for (const [code, unicodeChar] of [
  [0x0082, "\u201a"],
  [0x0083, "\u0192"],
  [0x0084, "\u201e"],
  [0x0085, "\u2026"],
  [0x0086, "\u2020"],
  [0x0087, "\u2021"],
  [0x0088, "\u02C6"],
  [0x0089, "\u2030"],
  [0x008a, "\u0160"],
  [0x008b, "\u2039"],
  [0x008c, "\u0152"],
  [0x008e, "\u017D"],
  [0x0091, "\u2018"],
  [0x0092, "\u2019"],
  [0x0093, "\u201C"],
  [0x0094, "\u201D"],
  [0x0095, "\u2022"],
  [0x0096, "\u2013"],
  [0x0097, "\u2014"],
  [0x0098, "\u02DC"],
  [0x0099, "\u2122"],
  [0x009a, "\u0161"],
  [0x009b, "\u203A"],
  [0x009c, "\u0153"],
  [0x009e, "\u017E"],
  [0x009f, "\u0178"],
]) {
  binaryToUnicodeTable[code] = unicodeChar;
}
/**
 * Converts character codes from 0x80 to 0x9f to their Unicode equivalents
 * within a string, using `binaryToUnicodeTable`. Codes in that range that
 * have no entry in the table are left unchanged.
 * @param {string} string - the input string
 * @returns {string} a converted string
 */
module.exports.binaryToUnicode = (string) => {
  return string.replace(/[\x80-\x9f]/g, (match) => {
    const mapped = binaryToUnicodeTable[match.charCodeAt(0)];
    // The table is sparse. A replacer that returns undefined makes
    // String.prototype.replace insert the literal text "undefined", so
    // unmapped codes must fall back to the original character.
    return mapped === undefined ? match : mapped;
  });
};
/**
 * The main function for cleaning OLE-based text. It runs a few standard
 * replacements on characters that are reserved for special purposes, removes
 * fields (keeping the text after the field separator, if any), and finally
 * strips any remaining characters in the range 0x00-0x07.
 *
 * @param {string} string - an input string
 * @returns {string} a cleaned up string
 */
module.exports.clean = (string) => {
  // Translate the reserved control characters through replaceTable. Both
  // \x1e (which the table maps to U+2011) and \x1f are matched; a matched
  // character with no table entry is dropped, because a replacer returning
  // undefined would otherwise insert the literal text "undefined".
  // NOTE(review): \x1f is presumably Word's optional hyphen - confirm that
  // dropping it is the desired output.
  let text = string.replace(/[\x02\x05\x07\x08\x0a\x0b\x0c\x0d\x1e\x1f]/g, (match) => {
    const replacement = replaceTable[match.charCodeAt(0)];
    return replacement === undefined ? '' : replacement;
  });

  // Fields (\x13 ... \x14 ... \x15) can be nested, which makes this awkward.
  // We use a strict non-nesting model and repeat until a pass makes no
  // substitutions, because a later match might start before an earlier one
  // once the inner field has been replaced. Every substitution removes at
  // least the \x13 and \x15 markers, so an unchanged string means no match.
  let previous;
  do {
    previous = text;
    text = text.replace(/\x13[^\x13\x14\x15]*\x14?([^\x13\x14\x15]*)\x15/g, '$1');
  } while (text !== previous);

  // Remove the NUL deletion markers and any other leftover low control codes.
  return text.replace(/[\x00-\x07]/g, '');
};
/**
 * Sparse lookup table indexed by Unicode code point. Each populated slot
 * holds the plain replacement used by `filter`: a space for U+2002/U+2003,
 * a hyphen for U+2012-U+2014, an apostrophe for U+2018/U+2019 and a double
 * quote for U+201C/U+201D.
 */
const filterTable = [];
for (const [replacement, codes] of [
  [" ", [0x2002, 0x2003]],
  ["-", [0x2012, 0x2013, 0x2014]],
  ["'", [0x2018, 0x2019]],
  ["\"", [0x201c, 0x201d]],
]) {
  for (const code of codes) {
    filterTable[code] = replacement;
  }
}
/**
 * Filters a string, applying a few common Unicode replacements for standard
 * punctuation: en and em spaces, dashes, and left and right curly quotes are
 * swapped for the plain equivalents held in `filterTable`.
 * @param {string} string - the input string
 * @returns a filtered string
 */
module.exports.filter = (string) => {
  const fancyPunctuation = /[\u2002\u2003\u2012\u2013\u2014\u2018\u2019\u201c\u201d]/g;
  const toPlain = (match) => {
    const codePoint = match.charCodeAt(0);
    return filterTable[codePoint];
  };
  return string.replace(fancyPunctuation, toPlain);
};