// cleanview
// Version:
// Clean the content of html articles
// 197 lines (196 loc) • 6.29 kB
// JavaScript
;
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.clean = clean;
const valid_tags_1 = __importDefault(require("../defaults/valid-tags"));
const forbidden_classes_1 = __importDefault(require("../defaults/forbidden-classes"));
const helpers_1 = require("./helpers");
// Tag whitelist used by filterTags when `options.secondTry` is truthy:
// the default valid tags plus "header".
const VALID_TAGS_SECOND_TRY = [...valid_tags_1.default, "header"];
// Attribute whitelist per element type. The keys are the type strings
// returned by getElementType; cleanAttributes looks up the list for a node
// and hands it to keepAttributes, which drops every attribute not listed.
const ATTRIBUTES_TO_KEEP = {
IMAGE: ["src", "title", "alt", "data-src", "srcset", "data-srcset"],
LINK: ["href", "title"],
SOURCE: ["srcset"],
YOUTUBE: ["src", "width", "height", "allowfullscreen", "frameborder"],
// Empty lists: every attribute is removed from these elements.
OTHER: [],
INVALID: [],
};
/**
 * Public entry point of the module.
 *
 * Runs the node list through three passes, in order: flagging (addFlags),
 * the top-down filter/clean pass (cleanOuterToInner) and the bottom-up
 * pruning pass (cleanInnerToOuter).
 *
 * @param {Array} json - list of nodes to clean (presumably a parsed HTML tree).
 * @param {object} [options] - cleaning options; any falsy value becomes `{}`.
 * @returns {Array} the list returned by the final pass.
 */
function clean(json, options) {
  const settings = options || {};
  const flagged = addFlags(json, settings);
  const filtered = cleanOuterToInner(flagged, settings);
  return cleanInnerToOuter(filtered, settings);
}
/**
 * Annotates the nodes with helper flags before any filtering happens.
 * Currently the only flag is `insidePre`, set by addFlagForPre.
 *
 * @param {Array} json - node list.
 * @param {object} options - cleaning options, forwarded unchanged.
 * @returns {Array} whatever addFlagForPre returns.
 */
function addFlags(json, options) {
  return addFlagForPre(json, options);
}
/**
 * Sets `insidePre = true` on every descendant of a `pre` element.
 *
 * A child is flagged when its parent is a `pre` tag or is itself already
 * flagged, so the flag travels down the whole subtree. The elements of
 * `json` themselves are never flagged, as they have no parent here.
 *
 * @param {Array} json - node list; nodes are mutated in place.
 * @param {object} options - forwarded to iterateChildren.
 * @returns {Array} a new array holding the same (mutated) root nodes.
 */
function addFlagForPre(json, options) {
  const markInsidePre = (child, _options, parent) => {
    const parentIsPre = parent.tagName === "pre";
    if (parentIsPre || parent.insidePre) {
      child.insidePre = true;
    }
    return child;
  };
  return json.map((root) => iterateChildren(root, options, markInsidePre));
}
/**
 * Depth-first walk that applies `func` to every descendant of `element`.
 *
 * For each child, `func(child, options, parent)` is called first and its
 * return value replaces the child in `parent.children`; the walk then
 * continues into that returned node. Descending into the returned node
 * (rather than the node that was passed in) ensures that a callback which
 * returns a fresh object still gets its descendants visited and kept.
 *
 * @param {*} element - node to walk; returned untouched when it is falsy,
 *   is not a node according to helpers.isNode, or has no children.
 * @param {object} options - passed through to `func`.
 * @param {Function} func - callback `(child, options, parent) => node`.
 * @returns {*} `element`, with its `children` array replaced.
 */
function iterateChildren(element, options, func) {
  const { isNode } = helpers_1;
  if (!element || !isNode(element)) {
    return element;
  }
  if (!element.children || !element.children.length) {
    return element;
  }
  element.children = element.children.map((child) => {
    const replacement = func(child, options, element);
    // iterateChildren always returns its first argument, so this yields
    // `replacement` after its own subtree has been processed.
    return iterateChildren(replacement, options, func);
  });
  return element;
}
/**
 * Top-down cleaning pass.
 *
 * At the current level it drops comments, blank text, disallowed tags and
 * nodes with forbidden classes (in that order), strips attributes from the
 * survivors, and only then recurses into their children via passToChildren.
 *
 * @param {Array} json - sibling nodes at the current depth.
 * @param {object} options - cleaning options, forwarded to every step.
 * @returns {Array} the surviving, cleaned nodes.
 */
function cleanOuterToInner(json, options) {
  const withoutComments = json.filter((node) => filterComments(node, options));
  const withoutBlanks = withoutComments.filter((node) => filterSpaces(node, options));
  const withValidTags = withoutBlanks.filter((node) => filterTags(node, options));
  const withValidClasses = withValidTags.filter((node) => filterClasses(node, options));
  const stripped = withValidClasses.map((node) => cleanAttributes(node, options));
  return stripped.map((node) => passToChildren(node, options, cleanOuterToInner));
}
/**
 * Bottom-up pruning pass.
 *
 * Children are processed before their parent is tested, so an element whose
 * descendants were all removed is itself seen as empty by filterEmptyNodes.
 *
 * @param {Array} json - sibling nodes at the current depth.
 * @param {object} options - cleaning options, forwarded to every step.
 * @returns {Array} the nodes that filterEmptyNodes keeps.
 */
function cleanInnerToOuter(json, options) {
  const withPrunedChildren = json.map((node) => passToChildren(node, options, cleanInnerToOuter));
  return withPrunedChildren.filter((node) => filterEmptyNodes(node, options));
}
/**
 * Predicate for the bottom-up pass: should node `e` be kept?
 *
 * Comments are dropped and text nodes are kept. `img`, `iframe`, `br` and
 * `hr` elements are kept regardless of children. Any other element is kept
 * when it has no `children` property or at least one child.
 *
 * NOTE(review): `source` has its own entry in ATTRIBUTES_TO_KEEP but is not
 * exempted here, so a childless `source` element is removed. Confirm whether
 * that is intended.
 *
 * @param {object} e - node to test.
 * @param {object} _options - unused.
 * @returns {boolean} true to keep the node.
 */
function filterEmptyNodes(e, _options) {
  const { isComment, isText } = helpers_1;
  if (isComment(e)) {
    return false;
  }
  if (isText(e)) {
    return true;
  }
  const keptWithoutChildren = ["img", "iframe", "br", "hr"];
  if (keptWithoutChildren.includes(e.tagName)) {
    return true;
  }
  return !e.children || e.children.length > 0;
}
/**
 * Predicate that rejects comment nodes (as decided by helpers.isComment).
 *
 * @param {object} e - node to test.
 * @param {object} _options - unused.
 * @returns {boolean} false for comments, true for everything else.
 */
function filterComments(e, _options) {
  const { isComment } = helpers_1;
  return isComment(e) ? false : true;
}
/**
 * Predicate that rejects whitespace-only text nodes.
 *
 * Nodes flagged `insidePre` are always kept, because whitespace inside a
 * <pre> tag is significant.
 *
 * @param {object} e - node to test.
 * @param {object} _options - unused.
 * @returns {boolean} false only for a text node whose trimmed content is "".
 */
function filterSpaces(e, _options) {
  if (e.insidePre) {
    return true;
  }
  const { isText } = helpers_1;
  if (!isText(e)) {
    return true;
  }
  return e.content.trim() !== "";
}
/**
 * Predicate that keeps text nodes and elements with a whitelisted tag.
 *
 * The whitelist is VALID_TAGS_SECOND_TRY when `options.secondTry` is truthy,
 * the default valid tags otherwise, extended by `options.includeTags`. The
 * node's tag name is lower-cased before the lookup; whitelist entries are
 * compared as given.
 *
 * @param {object} e - node to test.
 * @param {object} options - reads `secondTry` and `includeTags`.
 * @returns {boolean} true to keep the node; comments are always rejected.
 */
function filterTags(e, options) {
  const { isText, isComment } = helpers_1;
  if (isText(e)) {
    return true;
  }
  if (isComment(e)) {
    return false;
  }
  const baseTags = options.secondTry ? VALID_TAGS_SECOND_TRY : valid_tags_1.default;
  const extraTags = options.includeTags || [];
  const allowed = new Set([...baseTags, ...extraTags]);
  const tagName = (e.tagName || "").toLowerCase();
  return allowed.has(tagName);
}
/**
 * Predicate that rejects nodes whose class attribute contains a forbidden
 * fragment.
 *
 * Matching is by substring against the lower-cased class string, so a
 * forbidden entry also matches when it is only part of a longer class name.
 * The check is skipped entirely when `options.includeClasses` is truthy.
 *
 * @param {object} e - node to test.
 * @param {object} options - reads `includeClasses` and `forbiddenClasses`.
 * @returns {boolean} true to keep the node.
 */
function filterClasses(e, options) {
  if (options.includeClasses) {
    return true;
  }
  const extraForbidden = options.forbiddenClasses || [];
  const blacklist = [...forbidden_classes_1.default, ...extraForbidden];
  const className = getClass(e);
  const isForbidden = blacklist.some((fragment) => className.indexOf(fragment) > -1);
  return !isForbidden;
}
/**
 * Returns the node's `class` attribute value in lower case.
 *
 * @param {object} e - node to inspect.
 * @returns {string} lower-cased class string; "" when getProp finds none.
 */
function getClass(e) {
  const rawClass = getProp(e, "class");
  return rawClass.toLowerCase();
}
/**
 * Looks up an attribute value on a node.
 *
 * @param {object} e - node to inspect.
 * @param {string} prop - attribute key; compared exactly (case-sensitive).
 * @returns {string} the first matching attribute's value converted with
 *   String(); "" when `e` is not a node, has no attributes, or has no match.
 */
function getProp(e, prop) {
  const { isNode } = helpers_1;
  if (!isNode(e) || !e.attributes) {
    return "";
  }
  for (const attribute of e.attributes) {
    if (attribute.key === prop) {
      return String(attribute.value);
    }
  }
  return "";
}
/**
 * Applies a list-level function to the children of a node.
 *
 * When `e` is a node with children (per helpers.isNode and
 * helpers.isNodeWithChildren), `e.children` is replaced by
 * `func(e.children, options, func)`. Anything else is returned untouched.
 *
 * @param {object} e - node whose children should be processed.
 * @param {object} options - forwarded to `func`.
 * @param {Function} func - receives `(children, options, func)` and returns
 *   the new children array.
 * @returns {object} `e` itself.
 */
function passToChildren(e, options, func) {
  const { isNode, isNodeWithChildren } = helpers_1;
  if (isNode(e) && isNodeWithChildren(e)) {
    e.children = func(e.children, options, func);
  }
  return e;
}
/**
 * Strips a node down to the attributes allowed for its element type and
 * applies the type-specific fix-ups.
 *
 * - INVALID: the element becomes an empty `div`, so it never renders as the
 *   original tag.
 * - LINK: a `target="_blank"` attribute is appended.
 *   NOTE(review): no `rel` attribute is added alongside it; consider whether
 *   `rel="noopener"` is wanted for links opened in a new tab.
 * - IMAGE: `data-src` / `data-srcset` are mirrored into `src` / `srcset`
 *   when the latter are missing (see mirrorAttribute).
 *
 * @param {object} e - node to clean; mutated in place.
 * @param {object} _options - unused.
 * @returns {object} `e` itself; non-nodes are returned untouched.
 */
function cleanAttributes(e, _options) {
  const { isNode } = helpers_1;
  if (!isNode(e)) {
    return e;
  }
  const type = getElementType(e);
  keepAttributes(e, ATTRIBUTES_TO_KEEP[type]);
  switch (type) {
    case "INVALID":
      // make sure invalid elements don't get rendered to html
      e.tagName = "div";
      e.children = [];
      break;
    case "LINK":
      e.attributes.push({ key: "target", value: "_blank" });
      break;
    case "IMAGE":
      mirrorAttribute(e, "data-src", "src");
      mirrorAttribute(e, "data-srcset", "srcset");
      break;
    default:
      break;
  }
  return e;
}
/**
 * Copies the value of attribute `source` into a new attribute `target`, but
 * only when `source` has a non-empty value and `target` does not.
 *
 * @param {object} e - node whose `attributes` array may get a new entry.
 * @param {string} source - key of the attribute to copy from.
 * @param {string} target - key of the attribute to create.
 * @returns {void}
 */
function mirrorAttribute(e, source, target) {
  const fromValue = getProp(e, source);
  const toValue = getProp(e, target);
  const nothingToCopy = !fromValue;
  if (nothingToCopy || toValue) {
    return;
  }
  e.attributes.push({ key: target, value: fromValue });
}
/**
 * Classifies an element so cleanAttributes can pick the attribute whitelist.
 *
 * @param {object} e - element node; reads `tagName` and, for iframes, the
 *   `src` attribute.
 * @returns {string} "IMAGE" for `img`, "LINK" for `a`, "SOURCE" for
 *   `source`, "YOUTUBE" for an iframe pointing at YouTube, "INVALID" for any
 *   other iframe, and "OTHER" for everything else.
 */
function getElementType(e) {
  switch (e.tagName) {
    case "img":
      return "IMAGE";
    case "a":
      return "LINK";
    case "source":
      return "SOURCE";
    case "iframe":
      // TODO: add support to other platforms
      // if is not a youtube video, but is still an iframe, return invalid
      return isYoutubeUrl(getProp(e, "src")) ? "YOUTUBE" : "INVALID";
    default:
      return "OTHER";
  }
}
/**
 * Tells whether `src` is an http(s) URL hosted on youtube.com, youtu.be or
 * one of their subdomains.
 *
 * The decision is made on the parsed scheme and hostname rather than on a
 * substring of the raw value, so URLs that merely mention "youtube.com" in
 * their path, query or fragment (or use a `javascript:` / `data:` scheme)
 * are rejected. Protocol-relative values such as "//www.youtube.com/embed/x"
 * are accepted.
 *
 * @param {string} src - raw `src` attribute value.
 * @returns {boolean} true when the URL points at a YouTube host.
 */
function isYoutubeUrl(src) {
  let url;
  try {
    // The base only serves to resolve protocol-relative and relative values;
    // its ".invalid" host can never match the whitelist below.
    url = new URL(src, "https://relative.invalid");
  }
  catch (_error) {
    return false;
  }
  if (url.protocol !== "https:" && url.protocol !== "http:") {
    return false;
  }
  const host = url.hostname.toLowerCase();
  return ["youtube.com", "youtu.be"].some((domain) => host === domain || host.endsWith("." + domain));
}
/**
 * Replaces `e.attributes` with only the whitelisted, non-empty attributes.
 *
 * Attribute keys are lower-cased before being compared with `list`.
 * Attributes whose value is falsy (empty string, null, undefined, ...) are
 * dropped even when whitelisted. A node without an `attributes` array is
 * treated as having none and ends up with `attributes: []`, instead of
 * throwing; getProp tolerates such nodes in the same way.
 *
 * @param {object} e - node to update; mutated in place.
 * @param {string[]} list - lower-case attribute keys to keep.
 * @returns {object} `e` itself.
 */
function keepAttributes(e, list) {
  const current = e.attributes || [];
  e.attributes = current
    .map((attr) => ({ key: attr.key.toLowerCase(), value: attr.value }))
    .filter((attr) => Boolean(attr.value) && list.includes(attr.key));
  return e;
}