pdf3json
Version:
A PDF file parser that converts PDF binaries to text based JSON, powered by porting a fork of PDF.JS to Node.js
328 lines (282 loc) • 12 kB
JavaScript
var nodeUtil = require("util"),
_ = require("underscore"),
PDFUnit = require('./pdfunit.js');
var PDFFont = (function PFPFontClosure() {
'use strict';
// private static: state and lookup tables shared by every PDFFont instance
var _nextId = 1;          // id given to the next instance the constructor creates
var _name = 'PDFFont';    // prefix of the names built from instance ids

// name suffixes (the part after "-") that the constructor treats as bold
var _boldSubNames = [
    "bd",
    "bold",
    "demi",
    "black"
];

// substrings searched for in the lower-cased font name; the trailing spaces on
// some entries are part of the search string and are reproduced exactly
var _stdFonts = [
    "arial",
    "helvetica",
    "sans-serif ",
    "courier ",
    "monospace ",
    "ocr "
];

// comma-separated font family lists; the array position is the face index
var _kFontFaces = [
    ["quicktype", "arial", "helvetica", "sans-serif"].join(","),                                   // 00 - QuickType - sans-serif variable font
    ["quicktype condensed", "arial narrow", "arial", "helvetica", "sans-serif"].join(","),         // 01 - QuickType Condensed - thin sans-serif variable font
    ["quicktypepi", "quicktypeiipi"].join(","),                                                    // 02 - QuickType Pi
    ["quicktype mono", "courier new", "courier", "monospace"].join(","),                           // 03 - QuickType Mono - sans-serif fixed font
    ["ocr-a", "courier new", "courier", "monospace"].join(","),                                    // 04 - OCR-A - OCR readable sans-serif fixed font
    ["ocr b mt", "courier new", "courier", "monospace"].join(",")                                  // 05 - OCR-B MT - OCR readable sans-serif fixed font
];
// Dictionary of predefined text styles. Each row is
// [face index, font size, bold (1/0), italic (1/0)], where the face index is a
// position in _kFontFaces. _getFontStyleIndex returns the position of the
// first row that equals the current text style (or -1 when none does), and
// processText emits that position as "S". Reordering or removing rows would
// therefore change the ids written to the output; the trailing //NN comments
// record each row's id.
var _kFontStyles = [
// Face Size Bold Italic StyleID(Comment)
// ----- ---- ---- ----- -----------------
[0, 6, 0, 0], //00
[0, 8, 0, 0], //01
[0, 10, 0, 0], //02
[0, 12, 0, 0], //03
[0, 14, 0, 0], //04
[0, 18, 0, 0], //05
[0, 6, 1, 0], //06
[0, 8, 1, 0], //07
[0, 10, 1, 0], //08
[0, 12, 1, 0], //09
[0, 14, 1, 0], //10
[0, 18, 1, 0], //11
[0, 6, 0, 1], //12
[0, 8, 0, 1], //13
[0, 10, 0, 1], //14
[0, 12, 0, 1], //15
[0, 14, 0, 1], //16
[0, 18, 0, 1], //17
[0, 6, 1, 1], //18
[0, 8, 1, 1], //19
[0, 10, 1, 1], //20
[0, 12, 1, 1], //21
[0, 14, 1, 1], //22
[0, 18, 1, 1], //23
[1, 6, 0, 0], //24
[1, 8, 0, 0], //25
[1, 10, 0, 0], //26
[1, 12, 0, 0], //27
[1, 14, 0, 0], //28
[1, 18, 0, 0], //29
[1, 6, 1, 0], //30
[1, 8, 1, 0], //31
[1, 10, 1, 0], //32
[1, 12, 1, 0], //33
[1, 14, 1, 0], //34
[1, 18, 1, 0], //35
[1, 6, 0, 1], //36
[1, 8, 0, 1], //37
[1, 10, 0, 1], //38
[1, 12, 0, 1], //39
[1, 14, 0, 1], //40
[1, 18, 0, 1], //41
[2, 8, 0, 0], //42
[2, 10, 0, 0], //43
[2, 12, 0, 0], //44
[2, 14, 0, 0], //45
[2, 18, 0, 0], //46
[3, 8, 0, 0], //47
[3, 10, 0, 0], //48
[3, 12, 0, 0], //49
[4, 12, 0, 0], //50
[0, 9, 0, 0], //51
[0, 9, 1, 0], //52
[0, 9, 0, 1], //53
[0, 9, 1, 1], //54
[1, 9, 0, 0], //55
[1, 9, 1, 0], //56
[1, 9, 1, 1], //57
[4, 10, 0, 0], //58
[5, 10, 0, 0], //59
[5, 12, 0, 0] //60
];
// constructor
// Wraps a parsed font object and derives the naming information the text
// processing methods rely on.
//
// fontObj: font descriptor; reads `name`, `fallbackName` and `isSymbolicFont`,
//          and may overwrite `isSymbolicFont` (see below). Must not be null.
//
// Sets on the instance: fontObj, typeName (lower-cased font name), subType
// (family part of a "prefix+family-suffix" name, otherwise the whole name,
// always a string), isSymbol, bold (detected from the name suffix), and the
// defaults fontSize = 1, faceIdx = 0, italic = false, fontStyleId = -1.
// Note that _setFaceIndex recomputes `bold` from fontObj and typeName each time
// a style index is looked up, so the suffix-based value is what callers see
// until then.
var cls = function (fontObj) {
    // private
    var _id = _nextId++;

    // public (every instance will have their own copy of these methods, needs to be lightweight)
    this.get_id = function() { return _id; };
    this.get_name = function() { return _name + _id; };

    this.fontObj = fontObj;

    // Defaults are assigned before the name analysis: assigning them afterwards
    // would reset the bold flag detected from the name suffix.
    this.fontSize = 1;
    this.faceIdx = 0;
    this.bold = false;
    this.italic = false;
    this.fontStyleId = -1;

    var typeName = (fontObj.name || fontObj.fallbackName);
    if (!typeName) {
        typeName = _kFontFaces[0]; //default font family name
    }
    typeName = typeName.toLowerCase();
    this.typeName = typeName;

    // For names shaped like "prefix+family-suffix", keep "family" as the sub type
    // and test "suffix" against the known bold suffixes.
    var subType = typeName;
    var nameArray = typeName.split('+');
    if (nameArray.length > 1) {
        var subParts = nameArray[1].split("-");
        if (subParts.length > 1) {
            this.bold = _boldSubNames.indexOf(subParts[1]) >= 0;
        }
        // always store a string, also when the name carries no "-suffix"
        subType = subParts[0];
    }
    this.subType = subType;

    // NOTE(review): "> 0" ignores a name that *starts* with "symbol"; kept as-is
    // because changing it would alter which fonts get their glyphs remapped.
    this.isSymbol = typeName.indexOf("symbol") > 0 || _kFontFaces[2].indexOf(this.subType) >= 0;

    if (this.fontObj.isSymbolicFont) {
        var isStdFont = _stdFonts.some(function (oneName) {
            return typeName.indexOf(oneName) >= 0;
        });
        if (isStdFont) {
            this.fontObj.isSymbolicFont = false; //lots of Arial-based font is detected as symbol in VA forms (301, 76-c, etc.) reset the flag for now
            nodeUtil.p2jinfo("Reset: isSymbolicFont (false) for " + this.fontObj.name);
        }
    }
    else if (this.isSymbol) {
        this.fontObj.isSymbolicFont = true; //text pdf: va_ind_760c
        nodeUtil.p2jinfo("Reset: isSymbolicFont (true) for " + this.fontObj.name);
    }
};
// public static
// Returns the class name prefix followed by the current value of the id
// counter, i.e. the name the next constructed instance will report.
cls.get_nextId = function () {
    var upcomingName = _name.concat(_nextId);
    return upcomingName;
};
// private
// Derives bold, italic and faceIdx for the instance it is called on (invoke
// with .call(font)). Reads this.fontObj, this.typeName and this.subType.
//
// bold:    fontObj.bold, or else the lower-cased name containing "bold"/"black".
// italic:  taken from fontObj.italic. Without this the flag kept its
//          constructor default forever, so the italic rows of the style table
//          could never match. NOTE(review): assumes the font object exposes an
//          `italic` flag next to `bold` — confirm against the font loader.
// faceIdx: position in _kFontFaces chosen from the serif / monospace / symbolic
//          flags and substring matches against the face lists; left untouched
//          when no rule applies.
var _setFaceIndex = function() {
    var fontObj = this.fontObj;

    this.bold = fontObj.bold;
    if (!this.bold) {
        this.bold = this.typeName.indexOf("bold") >= 0 || this.typeName.indexOf("black") >= 0;
    }
    this.italic = !!fontObj.italic;

    var typeName = this.subType;
    if (fontObj.isSerifFont) {
        if (_kFontFaces[1].indexOf(typeName) >= 0) {
            this.faceIdx = 1;
        }
    }
    else if (_kFontFaces[2].indexOf(this.subType) >= 0) {
        this.faceIdx = 2;
    }
    else if (fontObj.isMonospace) {
        this.faceIdx = 3;
        if (_kFontFaces[4].indexOf(typeName) >= 0) {
            this.faceIdx = 4;
        }
        else if (_kFontFaces[5].indexOf(typeName) >= 0) {
            this.faceIdx = 5;
        }
    }
    else if (fontObj.isSymbolicFont) {
        this.faceIdx = 2;
    }

    // fall back to the condensed face for "... narrow" fonts still on face 0
    if (this.faceIdx == 0) {
        if (this.typeName.indexOf("narrow") > 0) {
            this.faceIdx = 1;
        }
    }
};
// Looks up the predefined style matching the current font (invoke with
// .call(font)). Refreshes the face/bold information via _setFaceIndex, stores
// the (possibly adjusted) size in this.fontSize and returns the position of the
// first _kFontStyles row equal to [faceIdx, fontSize, bold, italic], or -1 when
// no row matches.
var _getFontStyleIndex = function(fontSize) {
    _setFaceIndex.call(this);

    //MQZ Feb.28.2013. Adjust bold text fontsize to work around word spacing issue
    var effectiveSize = fontSize;
    if (this.bold && (fontSize > 12)) {
        effectiveSize = fontSize + 1;
    }
    this.fontSize = effectiveSize;

    var wantedFace = this.faceIdx;
    var wantedBold = this.bold ? 1 : 0;
    var wantedItalic = this.italic ? 1 : 0;

    for (var idx = 0; idx < _kFontStyles.length; idx++) {
        var row = _kFontStyles[idx];
        var sameStyle = row[0] === wantedFace && row[1] === effectiveSize &&
                        row[2] === wantedBold && row[3] === wantedItalic;
        if (sameStyle) {
            return idx;   // first match wins
        }
    }
    return -1;
};
// Maps a single character drawn with a symbol font to the Unicode glyph used
// in the output (invoke with .call(font)).
//
// - Input that is empty or not exactly one character long is returned as-is.
// - When the font is not flagged symbolic on both fontObj.isSymbolicFont and
//   this.isSymbol, "C" and "G" are padded with spaces and everything else is
//   returned unchanged.
// - Otherwise the character code is translated; codes without a mapping are
//   logged and returned unchanged.
var _processSymbolicFont = function(str) {
    if (!str || str.length !== 1) {
        return str;
    }

    var font = this.fontObj;
    if (!(font.isSymbolicFont && this.isSymbol)) {
        //prevent symbolic encoding from the client; sample: va_ind_760c
        return (str == "C" || str == "G") ? " " + str + " " : str;
    }

    var code = str.charCodeAt(0);

    if (code === 70) {  //exclamation in triangle OR right curly bracket
        return (font.type === "CIDFontType0") ? '\u26A0' : '\u007D';
    }
    if (code === 99) {  //up triangle. set to Bullet Dot for VA SchSCR
        return this.isSymbol ? '\u2022' : '\u25b2';
    }

    var glyphByCode = {
        20: '\u2713',   //check mark
        71: '\u25b6',   //right triangle
        97: '\u25b6',   //right triangle
        100: '\u25bc',  //down triangle
        103: '\u27A8',  //right arrow. sample: va_ind_760pff and pmt
        106: '',        //VA 301: string j character by the checkbox, hide it for now
        114: '\u2022',  //Bullet dot
        115: '\u25b2',  //up triangle
        116: '\u2022',  //Bullet dot
        118: '\u2022'   //Bullet dot
    };
    if (Object.prototype.hasOwnProperty.call(glyphByCode, code)) {
        return glyphByCode[code];
    }

    nodeUtil.p2jinfo(font.type + " - SymbolicFont - (" + font.name + ") : " +
        str.charCodeAt(0) + "::" + str.charCodeAt(1) + " => " + str);
    return str;
};
// Estimates the rotation, in degrees, described by a 2x2 matrix given as
// matrix2D[row][col]. Returns 0 when no rotation is recognised.
//
// - Both diagonal entries zero: returns 90 when both off-diagonal entries are
//   non-zero and m01 / m10 is at most -1 (within tolerance). This branch is
//   unchanged; NOTE(review): it also accepts ratios far below -1 — confirm
//   whether that is intended for non-uniformly scaled text.
// - Both diagonal entries non-zero: the angle is derived once from each row
//   (r1 and r2). They must agree within the tolerance in either direction; the
//   previous one-sided test (r1 - r2 < tolerance) reported any matrix with r1
//   well below r2, such as a symmetric shear, as rotated.
var _textRotationAngle = function (matrix2D) {
    var TOLERANCE = 0.0001;
    var retVal = 0;

    if (matrix2D[0][0] === 0 && matrix2D[1][1] === 0) {
        if (matrix2D[0][1] != 0 && matrix2D[1][0] != 0) {
            if ((matrix2D[0][1] / matrix2D[1][0]) + 1 < TOLERANCE) {
                retVal = 90;
            }
        }
    }
    else if (matrix2D[0][0] !== 0 && matrix2D[1][1] !== 0) {
        var r1 = Math.atan(-matrix2D[0][1] / matrix2D[0][0]);
        var r2 = Math.atan(matrix2D[1][0] / matrix2D[1][1]);
        if (Math.abs(r1) > TOLERANCE && Math.abs(r1 - r2) < TOLERANCE) {
            retVal = r1 * 180 / Math.PI;
        }
    }
    return retVal;
};
// public (every instance will share the same method, but has no access to private fields defined in constructor)
// Converts one text run into an output record and appends it to
// targetData.Texts.
//
// p:          position object; p.x / p.y are converted with PDFUnit.toFormX/Y
// str:        raw text; passed through the symbol-font mapping first, and
//             nothing is emitted when the mapped text is empty
// maxWidth:   stored as "w"
// color:      looked up with PDFUnit.findColorIndex; when the index is
//             negative the original color is kept under "oc"
// fontSize:   size used for the style lookup
// targetData: object whose Texts array receives the record
// matrix2D:   2x2 matrix used to detect rotation ("RA" on the run when non-zero)
cls.prototype.processText = function (p, str, maxWidth, color, fontSize, targetData, matrix2D) {
    var text = _processSymbolicFont.call(this, str);
    if (!text) {
        return;
    }

    this.fontStyleId = _getFontStyleIndex.call(this, fontSize);

    // when this.fontStyleId === -1, it means the text style doesn't match any entry in the dictionary
    // the TS tuple describes the style regardless: [fontFaceId, fontSize, 1/0 for bold, 1/0 for italic]
    var styleTuple = [this.faceIdx, this.fontSize, this.bold ? 1 : 0, this.italic ? 1 : 0];

    var colorIndex = PDFUnit.findColorIndex(color);
    var formX = PDFUnit.toFormX(p.x) - 0.25;
    var formY = PDFUnit.toFormY(p.y) - 0.75;
    var run = {
        T: this.flash_encode(text),
        S: this.fontStyleId,
        TS: styleTuple
    };

    //MQZ.07/29/2013: when color is not in color dictionary, set the original color (oc)
    // "oc" is created first so it stays the leading key of the record
    var record = (colorIndex < 0) ? {oc: color} : {};
    record.x = formX;
    record.y = formY;
    record.w = maxWidth;
    record.clr = colorIndex;
    record.A = "left";
    record.R = [run];

    var angle = _textRotationAngle.call(this, matrix2D);
    if (angle != 0) {
        nodeUtil.p2jinfo(str + ": rotated " + angle + " degree.");
        run.RA = angle;
    }

    targetData.Texts.push(record);
};
// URI-encodes `str` and then rewrites the encoded forms of a few C1 control
// characters (U+0082..U+009B) to the punctuation they stand for:
// U+0096 -> "-", U+0091/0092/0082 -> "'" (%27), U+0093/0094/0084 -> '"' (%22),
// U+008B -> "«" (%C2%AB), U+009B -> "»" (%C2%BB).
//
// Every occurrence is rewritten. String.prototype.replace with a string
// pattern only touches the first match, so text with a repeated quote or dash
// previously kept the raw control characters after the first one.
//
// str: text to encode. Returns the encoded string. Throws URIError (from
// encodeURIComponent) when str contains a lone surrogate.
cls.prototype.flash_encode = function(str) {
    var substitutes = {
        "96": "-",
        "91": "%27",
        "92": "%27",
        "82": "%27",
        "93": "%22",
        "94": "%22",
        "84": "%22",
        "8B": "%C2%AB",
        "9B": "%C2%BB"
    };
    // encodeURIComponent emits upper-case hex, so the pattern is case-sensitive
    return encodeURIComponent(str).replace(/%C2%(96|91|92|82|93|94|84|8B|9B)/g, function (match, lowByte) {
        return substitutes[lowByte];
    });
};
// Releases the wrapped font object: the reference is cleared and the property
// is then removed from the instance.
cls.prototype.clean = function() {
    var font = this;
    font.fontObj = null;
    delete font.fontObj;
};
return cls;
})();
module.exports = PDFFont;