pdf2json
Version:
PDF file parser that converts PDF binaries to JSON and text, powered by porting a fork of PDF.JS to Node.js
383 lines (336 loc) • 11.4 kB
JavaScript
import nodeUtil from 'util';
import PDFUnit from './pdfunit.js';
import { kFontFaces, kFontStyles } from './pdfconst.js';
// Style suffixes that mark a subset font as bold. PDFFont compares these with
// the text that follows '-' in the part of the font name after '+'.
const _boldSubNames = ['bd', 'bold', 'demi', 'black', 'medi'];
// Name fragments of ordinary text fonts. When a font flagged as symbolic has a
// type name containing one of these fragments, PDFFont clears the flag.
// NOTE(review): several entries end with a space, so they only match type
// names that contain that space after the fragment — confirm this is intended.
const _stdFonts = [
'arial',
'helvetica',
'sans-serif ',
'courier ',
'monospace ',
'ocr ',
];
// Tolerance applied when comparing the x/y coordinates of two text blocks.
const DISTANCE_DELTA = 0.1;
/**
 * Wraps a font object handed over by the PDF renderer and converts text drawn
 * with that font into text blocks pushed onto a page's `Texts` array.
 *
 * Public instance fields: `fontObj`, `typeName`, `subType`, `bold`,
 * `isSymbol`, `spaceWidth`, `fontSize`, `faceIdx`, `italic`, `fontStyleId`.
 */
export default class PDFFont {
  /**
   * Percent-encoded sequences rewritten by `flashEncode`, as
   * `[encoded, replacement]` pairs. Each key is the UTF-8 encoding of a
   * character in the U+0082..U+009B range.
   * NOTE(review): these look like Windows-1252 punctuation bytes (dash and
   * quote characters) that reached us as control characters — confirm.
   */
  static #flashReplacements = [
    ['%C2%96', '-'],
    ['%C2%91', '%27'],
    ['%C2%92', '%27'],
    ['%C2%82', '%27'],
    ['%C2%93', '%22'],
    ['%C2%94', '%22'],
    ['%C2%84', '%22'],
    ['%C2%8B', '%C2%AB'],
    ['%C2%9B', '%C2%BB'],
  ];

  /**
   * Character code -> replacement glyph for single-character strings drawn
   * with a symbolic font. Code 70 is handled separately because its
   * replacement depends on the font type.
   */
  static #symbolGlyphs = new Map([
    [20, '\u2713'], // check mark
    [71, '\u25b6'], // right triangle
    [97, '\u25b6'], // right triangle
    [99, '\u2022'], // bullet dot (VA SchSCR)
    [100, '\u25bc'], // down triangle
    [103, '\u27A8'], // right arrow. sample: va_ind_760pff and pmt
    [106, ''], // VA 301: stray "j" next to the checkbox, hidden for now
    [114, '\u2022'], // bullet dot
    [115, '\u25b2'], // up triangle
    [116, '\u2022'], // bullet dot
    [118, '\u2022'], // bullet dot
  ]);

  /**
   * @param {object} fontObj - Font description. This class reads `name`,
   *   `fallbackName`, `isSymbolicFont`, `spaceWidth`, `toFontChar`, `widths`,
   *   `bold`, `italic`, `isSerifFont`, `isMonospace` and `type` from it, and
   *   may overwrite `isSymbolicFont`.
   */
  constructor(fontObj) {
    this.fontObj = fontObj;
    this.typeName = this.#initTypeName();

    const { subType, bold } = this.#initSubType();
    this.subType = subType;
    this.bold = bold;

    this.isSymbol = this.#initSymbol();
    this.spaceWidth = this.#initSpaceWidth();

    this.fontSize = 1;
    this.faceIdx = 0;
    this.italic = false;
    this.fontStyleId = -1;
  }

  /**
   * Comparator that orders text blocks by `y`, then by `x`. Coordinates that
   * differ by no more than DISTANCE_DELTA are treated as equal.
   *
   * @param {{x: number, y: number}} t1
   * @param {{x: number, y: number}} t2
   * @returns {number} -1, 0 or 1.
   */
  static compareBlockPos(t1, t2) {
    if (t1.y < t2.y - DISTANCE_DELTA) {
      return -1;
    }
    if (Math.abs(t1.y - t2.y) <= DISTANCE_DELTA) {
      if (t1.x < t2.x - DISTANCE_DELTA) {
        return -1;
      }
      if (Math.abs(t1.x - t2.x) <= DISTANCE_DELTA) {
        return 0;
      }
    }
    return 1;
  }

  /**
   * Tells whether the first text runs of two blocks share a style and neither
   * is rotated.
   *
   * @param {object} t1 - Text block; only `R[0]` is inspected.
   * @param {object} t2 - Text block; only `R[0]` is inspected.
   * @returns {boolean}
   */
  static haveSameStyle(t1, t2) {
    const run1 = t1.R[0];
    const run2 = t2.R[0];
    if (run1.S !== run2.S) {
      return false;
    }
    // A negative style id means "not in the style dictionary": fall back to
    // comparing the explicit style tuples element by element.
    if (run1.S < 0 && !run1.TS.every((value, i) => value === run2.TS[i])) {
      return false;
    }
    // A block carrying a rotation angle never counts as "same style".
    return typeof run1.RA === 'undefined' && typeof run2.RA === 'undefined';
  }

  /**
   * Largest horizontal gap after `t1` that is still narrower than a space:
   * the block's space width scaled by its font size relative to 12.
   *
   * @param {object} t1 - Text block with `sw` and `R[0]`.
   * @returns {number}
   */
  static getSpaceThreshHold(t1) {
    return (PDFFont.getFontSize(t1) / 12) * t1.sw;
  }

  /**
   * Tells whether `t2` sits on the same line as `t1` and starts less than a
   * space width after the end of `t1`.
   *
   * @param {object} t1
   * @param {object} t2
   * @returns {boolean}
   */
  static areAdjacentBlocks(t1, t2) {
    const onSameLine = Math.abs(t1.y - t2.y) <= DISTANCE_DELTA;
    const gap = t2.x - t1.x - t1.w;
    return onSameLine && gap < PDFFont.getSpaceThreshHold(t1);
  }

  /**
   * Font size of a block's first text run: taken from the explicit style
   * tuple when the style id is negative, otherwise from kFontStyles.
   *
   * @param {object} textBlock
   * @returns {number}
   */
  static getFontSize(textBlock) {
    const run = textBlock.R[0];
    return run.S < 0 ? run.TS[1] : kFontStyles[run.S][1];
  }

  /**
   * Tells whether two blocks have identical position, identical encoded text
   * and the same style.
   *
   * @param {object} t1
   * @param {object} t2
   * @returns {boolean}
   */
  static areDuplicateBlocks(t1, t2) {
    return (
      t1.x === t2.x &&
      t1.y === t2.y &&
      t1.R[0].T === t2.R[0].T &&
      PDFFont.haveSameStyle(t1, t2)
    );
  }

  /**
   * Converts one string drawn with this font into a text block and appends it
   * to `targetData.Texts`. Nothing is appended when the string is empty or is
   * mapped to an empty glyph.
   *
   * @param {{x: number, y: number}} p - Position, converted with
   *   PDFUnit.toFormX / PDFUnit.toFormY.
   * @param {string} str - Text to emit.
   * @param {number} maxWidth - Width stored (via PDFUnit.toFixedFloat) as `w`.
   * @param {*} color - Looked up with PDFUnit.findColorIndex; stored verbatim
   *   as `oc` when it is not in the color dictionary.
   * @param {number} fontSize - Requested font size.
   * @param {{Texts: Array}} targetData - Receives the new text block.
   * @param {Array<Array<number>>} matrix2D - 2x2 transform used to detect
   *   rotated text.
   * @returns {void}
   */
  processText(p, str, maxWidth, color, fontSize, targetData, matrix2D) {
    const text = this.#processSymbolicFont(str);
    if (!text) {
      return;
    }

    this.fontStyleId = this.#getFontStyleIndex(fontSize);

    // fontStyleId === -1 means the style is not in the dictionary, so TS is
    // always attached to describe the style explicitly:
    // [fontFaceId, fontSize, 1/0 for bold, 1/0 for italic]
    const textRun = {
      T: this.flashEncode(text),
      S: this.fontStyleId,
      TS: this.#styleTuple(),
    };

    const rAngle = this.#textRotationAngle(matrix2D);
    if (rAngle !== 0) {
      nodeUtil.p2jinfo(`${str}: rotated ${rAngle} degree.`);
      textRun.RA = rAngle;
    }

    const clrId = PDFUnit.findColorIndex(color);
    const isKnownColor = clrId >= 0 && clrId < PDFUnit.colorCount();
    // When the color is not in the color dictionary, keep the original (oc).
    const colorObj = isKnownColor ? { clr: clrId } : { oc: color };

    targetData.Texts.push({
      x: PDFUnit.toFormX(p.x) - 0.25,
      y: PDFUnit.toFormY(p.y) - 0.75,
      w: PDFUnit.toFixedFloat(maxWidth),
      ...colorObj,
      sw: this.spaceWidth, // font space width, used to merge adjacent blocks
      A: 'left',
      R: [textRun],
    });
  }

  /**
   * URI-encodes `str` and rewrites a fixed set of encoded characters (dash
   * and quote variants) to plain equivalents. Every occurrence is rewritten,
   * not just the first one.
   *
   * @param {string} str
   * @returns {string}
   */
  flashEncode(str) {
    return PDFFont.#flashReplacements.reduce(
      (encoded, [from, to]) => encoded.replaceAll(from, to),
      encodeURIComponent(str)
    );
  }

  /** Drops the reference to the wrapped font object. */
  clean() {
    delete this.fontObj;
  }

  /**
   * Lower-cased font name: `name`, else `fallbackName`, else the default
   * font family from kFontFaces.
   *
   * @returns {string}
   */
  #initTypeName() {
    const { name, fallbackName } = this.fontObj;
    const typeName = name || fallbackName || kFontFaces[0];
    return typeName.toLowerCase();
  }

  /**
   * Derives the sub type and an initial bold flag from the type name. For a
   * name shaped like `prefix+family-style`, the sub type is `family` and bold
   * is set when `style` is one of _boldSubNames. Without a `+`, the whole
   * type name is the sub type.
   *
   * @returns {{subType: string, bold: boolean}}
   */
  #initSubType() {
    const afterPlus = this.typeName.split('+')[1];
    if (afterPlus === undefined) {
      return { subType: this.typeName, bold: false };
    }
    // Always hand back a string, including when there is no '-' suffix.
    const [family, style] = afterPlus.split('-');
    return {
      subType: family,
      bold: style !== undefined && _boldSubNames.includes(style),
    };
  }

  /**
   * Decides whether the font should be treated as a symbol font and
   * reconciles `fontObj.isSymbolicFont` with that decision.
   *
   * @returns {boolean}
   */
  #initSymbol() {
    const { fontObj, typeName } = this;
    // NOTE(review): "> 0" skips names that START with "symbol"; kept as-is
    // because changing it would alter glyph mapping — confirm intent.
    const isSymbol =
      typeName.indexOf('symbol') > 0 || kFontFaces[2].includes(this.subType);

    if (fontObj.isSymbolicFont) {
      // Lots of Arial-based fonts are detected as symbolic in VA forms
      // (301, 76-c, etc.); reset the flag for those.
      if (_stdFonts.some((stdName) => typeName.includes(stdName))) {
        fontObj.isSymbolicFont = false;
        nodeUtil.p2jinfo(`Reset: isSymbolicFont (false) for ${fontObj.name}`);
      }
    } else if (isSymbol) {
      fontObj.isSymbolicFont = true; // text pdf: va_ind_760c
      nodeUtil.p2jinfo(`Reset: isSymbolicFont (true) for ${fontObj.name}`);
    }
    return isSymbol;
  }

  /**
   * Width of a space for this font: `fontObj.spaceWidth` when set, else the
   * width recorded for character code 32, else 250; then converted with
   * PDFUnit.toFormX and divided by 32.
   *
   * @returns {number}
   */
  #initSpaceWidth() {
    const { toFontChar, widths } = this.fontObj;
    let { spaceWidth } = this.fontObj;
    if (!spaceWidth) {
      const spaceId = Array.isArray(toFontChar) ? toFontChar.indexOf(32) : -1;
      spaceWidth = spaceId >= 0 && Array.isArray(widths) ? widths[spaceId] : 250;
    }
    return PDFUnit.toFormX(spaceWidth) / 32;
  }

  /**
   * Recomputes `bold`, `italic` and `faceIdx` from the font object and the
   * type name. This replaces the bold flag the constructor derived from the
   * sub type.
   */
  #setFaceIndex() {
    const { fontObj, typeName, subType } = this;

    this.bold =
      fontObj.bold || typeName.includes('bold') || typeName.includes('black');
    // fix https://github.com/modesty/pdf2json/issues/42 (and its extension)
    this.italic =
      fontObj.italic ||
      typeName.includes('italic') ||
      typeName.includes('oblique');

    // Hybrid bold+italic fonts
    if ((!this.bold || !this.italic) && typeName.includes('boldobl')) {
      this.bold = true;
      this.italic = true;
    }

    if (fontObj.isSerifFont) {
      if (kFontFaces[1].includes(subType)) {
        this.faceIdx = 1;
      }
    } else if (kFontFaces[2].includes(subType)) {
      this.faceIdx = 2;
    } else if (fontObj.isMonospace) {
      this.faceIdx = 3;
      if (kFontFaces[4].includes(subType)) {
        this.faceIdx = 4;
      } else if (kFontFaces[5].includes(subType)) {
        this.faceIdx = 5;
      }
    } else if (fontObj.isSymbolicFont) {
      this.faceIdx = 2;
    }

    // NOTE(review): "> 0" ignores a type name that starts with "narrow".
    if (this.faceIdx === 0 && typeName.indexOf('narrow') > 0) {
      this.faceIdx = 1;
    }
  }

  /**
   * Current style as [faceIdx, fontSize, bold 1/0, italic 1/0].
   *
   * @returns {number[]}
   */
  #styleTuple() {
    return [
      this.faceIdx,
      this.fontSize,
      this.bold ? 1 : 0,
      this.italic ? 1 : 0,
    ];
  }

  /**
   * Updates face/bold/italic/fontSize and looks the resulting style up in
   * kFontStyles.
   *
   * @param {number} fontSize
   * @returns {number} Index of the first matching entry, or -1.
   */
  #getFontStyleIndex(fontSize) {
    this.#setFaceIndex();
    // MQZ Feb.28.2013. Bump bold text above size 12 by one to work around a
    // word spacing issue.
    this.fontSize = this.bold && fontSize > 12 ? fontSize + 1 : fontSize;

    const wanted = this.#styleTuple();
    return kFontStyles.findIndex((style) =>
      wanted.every((value, i) => style[i] === value)
    );
  }

  /**
   * Maps a single character drawn with a symbol font to a Unicode glyph.
   * Strings that are empty or longer than one character are returned as-is.
   *
   * @param {string} str
   * @returns {string} Replacement text; '' means "emit nothing".
   */
  #processSymbolicFont(str) {
    if (!str || str.length !== 1) {
      return str;
    }

    if (!this.fontObj.isSymbolicFont || !this.isSymbol) {
      // Prevent symbolic encoding from the client (sample: va_ind_760c).
      return str === 'C' || str === 'G' ? ` ${str} ` : str;
    }

    const code = str.charCodeAt(0);
    if (code === 70) {
      // exclamation in triangle OR right curly bracket
      return this.fontObj.type === 'CIDFontType0' ? '\u26A0' : '\u007D';
    }

    const glyph = PDFFont.#symbolGlyphs.get(code);
    if (glyph !== undefined) {
      return glyph;
    }

    nodeUtil.p2jinfo(
      `${this.fontObj.type} - SymbolicFont - (${this.fontObj.name}) : ${code}::${str.charCodeAt(1)} => ${str}`
    );
    return str;
  }

  /**
   * Derives a rotation angle in degrees from a 2x2 matrix; 0 means "not
   * rotated".
   * NOTE(review): both tolerance checks compare a signed difference (no
   * Math.abs), so they also pass for large negative differences — confirm
   * whether that is intended before tightening.
   *
   * @param {Array<Array<number>>} matrix2D
   * @returns {number}
   */
  #textRotationAngle(matrix2D) {
    const a = matrix2D[0][0];
    const b = matrix2D[0][1];
    const c = matrix2D[1][0];
    const d = matrix2D[1][1];

    if (a === 0 && d === 0) {
      return b !== 0 && c !== 0 && b / c + 1 < 0.0001 ? 90 : 0;
    }
    if (a !== 0 && d !== 0) {
      const r1 = Math.atan(-b / a);
      const r2 = Math.atan(c / d);
      if (Math.abs(r1) > 0.0001 && r1 - r2 < 0.0001) {
        return (r1 * 180) / Math.PI;
      }
    }
    return 0;
  }
}