@aeolun/muhammara
Version:
Create, read and modify PDF files and streams. A drop in replacement for hummusjs PDF library
108 lines (101 loc) • 3.06 kB
JavaScript
const DOMParser = require("@xmldom/xmldom").DOMParser;
exports.htmlToTextObjects = function (htmlCodes) {
htmlCodes = htmlCodes.replace(/<br\/?>/g, "<p>[@@DONOT_RENDER_THIS@@]</p>");
const nodes = new DOMParser().parseFromString(
`<html>${htmlCodes}</html>`,
"text/html"
);
const textObjects = parseNode(nodes).childs[0].childs;
return textObjects;
};
function getFontSizeRatio(tagName = "") {
const fontSizeRatio = {
p: 1, // 14px
h1: 2.57, // 36px
h2: 2.14, // 30px
h3: 1.71, // 24px
small: 0.7,
// h4: 1.12,
// h5: 0.83,
// h6: 0.75
};
const matched = fontSizeRatio[tagName.toLowerCase()];
return matched ? matched : 1;
}
function needsLineBreaker(tagName = "") {
const lineBreakers = ["p", "li", "h1", "h2", "h3"];
return lineBreakers.includes(tagName);
}
function isBoldTag(tagName = "") {
const boldTags = ["b", "strong"];
return boldTags.includes(tagName);
}
function isItalicTag(tagName = "") {
const italicTags = ["i", "em"];
return italicTags.includes(tagName);
}
function parseNode(node) {
const attributes = [];
const styles = {};
for (let i in node.attributes) {
if (!isNaN(i)) {
attributes.push({
name: node.attributes[i].nodeName,
value: node.attributes[i].nodeValue,
});
if (node.attributes[i].nodeName == "style") {
const styleValues = node.attributes[i].nodeValue.split(";");
styleValues.forEach((element) => {
if (element && element != "") {
element = element.split(":");
const key = element[0];
let value = element[1].replace(/ /g, "");
if (key == "color") {
if (value.search("rgb") > -1) {
value = value
.replace(/rgba?\(/, "")
.replace(/\)/, "")
.split(",")
.map((item) => parseFloat(item));
if (value.length > 3) {
styles["opacity"] = value.pop();
}
}
}
styles[key] = value;
}
});
}
}
}
let value = node.data ? node.data.replace(/^\s*/gm, "") : null;
if (value && value.charCodeAt(0) == 8203) {
// zero width space
value = value.substring(1);
}
const parsedData = {
value,
tag: node.tagName,
isBold: isBoldTag(node.tagName),
isItalic: isItalicTag(node.tagName),
underline: node.tagName == "u",
strikeOut: node.tagName == "del",
attributes,
styles,
needsLineBreaker: needsLineBreaker(node.tagName),
sizeRatio: getFontSizeRatio(node.tagName),
link: node.tagName == "a" ? node.attributes[0].value : null,
childs: [],
};
for (let num in node.childNodes) {
parsedData.childs.push(parseNode(node.childNodes[num]));
}
const ignoreValue = ["\n", "\n\n"];
parsedData.childs = parsedData.childs.filter((item) => {
return (
item.tag ||
(item.value && !ignoreValue.includes(item.value.replace(/ /g, "")))
);
});
return parsedData;
}