pdf2json
Version:
PDF file parser that converts PDF binaries to JSON and text, powered by porting a fork of PDF.JS to Node.js
111 lines (98 loc) • 3.6 kB
JavaScript
/* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set shiftwidth=2 tabstop=2 autoindent cindent expandtab: */
/* globals Document, error, PDFJS */
/* Copyright 2012 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
;
var Metadata = (PDFJS.Metadata = (function MetadataClosure() {
/**
 * Repairs text nodes that were written as a PDF-style escaped UTF-16BE
 * string instead of plain XML text (the caller notes that Ghostscript
 * produces such metadata).
 *
 * A run is recognised by the literal characters `\376\377` (the octal
 * spelling of the 0xFE 0xFF byte-order mark) directly after a `>`. Everything
 * up to the next `<` is decoded: each `\ddd` octal escape becomes one byte,
 * any other character counts as one byte as-is, and the bytes are then read
 * as big-endian 16-bit code units.
 *
 * @param {string} meta - Raw metadata text.
 * @returns {string} The text with every such run replaced by XML-safe
 *   characters; input without such runs is returned unchanged.
 */
function fixMetadata(meta) {
  return meta.replace(/>\\376\\377([^<]+)/g, function (all, codes) {
    // Collapse every literal "\ddd" octal escape into the byte it denotes.
    var bytes = codes.replace(
      /\\([0-3])([0-7])([0-7])/g,
      function (code, d1, d2, d3) {
        return String.fromCharCode(d1 * 64 + d2 * 8 + d3 * 1);
      }
    );
    var chars = '';
    // Consume complete high/low byte pairs only. A dangling last byte has no
    // low half; reading it would yield NaN and emit the malformed reference
    // "&#xaN;", so it is dropped instead.
    for (var i = 0; i + 1 < bytes.length; i += 2) {
      var code = bytes.charCodeAt(i) * 256 + bytes.charCodeAt(i + 1);
      var isPlainAscii =
        code >= 32 && // space
        code < 127 && // DEL
        code !== 60 && // '<'
        code !== 62 && // '>'
        code !== 38; // '&'
      if (isPlainAscii) {
        chars += String.fromCharCode(code);
      } else {
        // Four-digit hex character reference; adding 0x10000 and dropping the
        // leading "1" zero-pads the value.
        chars += '&#x' + (0x10000 + code).toString(16).substring(1) + ';';
      }
    }
    return '>' + chars;
  });
}
/**
 * Flat, lower-cased name → text lookup built from an XMP metadata packet.
 *
 * Only string input is processed: it is repaired with `fixMetadata`, trimmed,
 * and, when it starts with `<` and ends with `>`, parsed with the global
 * `DOMParser`; the resulting document is kept on `this.metaDocument` and
 * scanned by `parse()`. Any other input leaves `this.metadata` empty.
 *
 * @param {*} meta - Raw metadata text; non-string values are ignored.
 */
function Metadata(meta) {
  this.metadata = {};
  if (typeof meta !== 'string') {
    return;
  }
  // Ghostscript produces invalid metadata
  var xml = fixMetadata(meta).trim();
  if (!xml.startsWith('<') || !xml.endsWith('>')) {
    return;
  }
  var parser = new DOMParser();
  this.metaDocument = parser.parseFromString(xml, 'application/xml');
  this.parse();
}
Metadata.prototype = {
  /**
   * Fills `this.metadata` from `this.metaDocument`. Looks for an `rdf:RDF`
   * element (the document element itself, or one of its direct children),
   * then records every non-text child of each `rdf:Description` under its
   * lower-cased node name with its trimmed text content. Does nothing when
   * no document or no `rdf:RDF` element is available.
   */
  parse: function Metadata_parse() {
    var doc = this.metaDocument;
    // Guard against a missing document so parse() is a no-op, not a TypeError.
    var rdf = doc ? doc.documentElement : null;
    if (rdf && rdf.nodeName.toLowerCase() !== 'rdf:rdf') {
      // Wrapped in <xmpmeta>
      rdf = rdf.firstChild;
      while (rdf && rdf.nodeName.toLowerCase() !== 'rdf:rdf') {
        rdf = rdf.nextSibling;
      }
    }
    // At this point rdf is either falsy or the rdf:RDF element.
    if (!rdf || !rdf.hasChildNodes()) return;
    var descriptions = rdf.childNodes;
    for (var i = 0, length = descriptions.length; i < length; i++) {
      var desc = descriptions[i];
      if (desc.nodeName.toLowerCase() !== 'rdf:description') continue;
      var entries = desc.childNodes;
      for (var ii = 0, iLength = entries.length; ii < iLength; ii++) {
        var entry = entries[ii];
        var name = entry.nodeName.toLowerCase();
        // Skip the text nodes that sit between the entry elements.
        if (name === '#text') continue;
        this.metadata[name] = entry.textContent.trim();
      }
    }
  },
  /**
   * @param {string} name - Lower-cased entry name, e.g. 'dc:title'.
   * @returns {?string} The stored value, or null when the entry is absent
   *   or its value is empty.
   */
  get: function Metadata_get(name) {
    // Own-property check keeps inherited members such as `constructor`
    // from being reported as metadata.
    if (!Object.prototype.hasOwnProperty.call(this.metadata, name)) {
      return null;
    }
    return this.metadata[name] || null;
  },
  /**
   * @param {string} name - Lower-cased entry name.
   * @returns {boolean} True when an entry with that name was recorded.
   */
  has: function Metadata_has(name) {
    return (
      Object.prototype.hasOwnProperty.call(this.metadata, name) &&
      typeof this.metadata[name] !== 'undefined'
    );
  },
};
return Metadata;
})());