cleanview
Version:
Clean the content of html articles
144 lines (143 loc) • 5.33 kB
JavaScript
;
// Compiler-emitted interop helper (appears to be the standard TypeScript
// `__createBinding`). Reuses an implementation already present on `this`,
// otherwise defines one here.
//
// Exposes property `k` of module `m` on object `o` under the name `k2`
// (`k2` defaults to `k`).
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// Replace the descriptor with a live getter when `m` has no own descriptor
// for `k`, when it is an accessor on a module not flagged `__esModule`, or
// when it is a data property that is writable or configurable. In every
// other case the existing descriptor is reused unchanged.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
// Fallback used when `Object.create` is falsy: copy the current value once.
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
// Interop helper: stores `v` as the "default" member of `o`.
// An implementation already present on `this` takes precedence; otherwise one
// of two strategies is picked once, depending on whether `Object.create` exists.
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
  // Preferred strategy: an enumerable property defined through a descriptor.
  function defineDefault(target, value) {
    Object.defineProperty(target, "default", { enumerable: true, value: value });
  }

  // Fallback strategy: plain assignment.
  function assignDefault(target, value) {
    target["default"] = value;
  }

  return Object.create ? defineDefault : assignDefault;
})();
// Compiler-emitted interop helper (appears to be the standard TypeScript
// `__importStar`). Reuses an implementation already present on `this`,
// otherwise builds one inside the IIFE below.
var __importStar = (this && this.__importStar) || (function () {
// `ownKeys` replaces itself on its first call: it becomes
// `Object.getOwnPropertyNames` when that exists, otherwise a for-in loop
// filtered by `hasOwnProperty`. It then delegates to the replacement.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
// Modules flagged `__esModule` are returned untouched.
if (mod && mod.__esModule) return mod;
var result = {};
// Bind every own key of `mod` except "default" onto the result object.
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
// The original module itself becomes the result's "default" member.
__setModuleDefault(result, mod);
return result;
};
})();
// Interop helper: gives every required module a `default` member.
// An implementation already present on `this` takes precedence.
var __importDefault = (this && this.__importDefault) || function (mod) {
  // Modules flagged `__esModule` already carry their own default export.
  if (mod && mod.__esModule) {
    return mod;
  }
  // Anything else is wrapped so that the value itself is the default.
  return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const himalaya = __importStar(require("himalaya"));
const filters = __importStar(require("./filters"));
const modifiers = __importStar(require("./modifiers"));
const urlParser = __importStar(require("./url-parser"));
const query_1 = __importDefault(require("./query"));
// Fallback used by getContentElement when `options.minRatio` is not set: the
// candidate element's paragraph count divided by the total paragraph count is
// compared against this value.
const MIN_DEFAULT_RATIO = 0.75;
/**
 * Extracts the main content of an HTML article and returns it as HTML.
 *
 * The document is parsed once with the given options. If that yields no
 * paragraphs, it is parsed a second time with `secondTry` and
 * `includeClasses` enabled. If there are still no paragraphs, or no content
 * element can be determined, a fallback link to `options.url` is returned.
 *
 * @param {string} html - Raw HTML of the page.
 * @param {object} [options] - Optional settings. `url` is used for the
 *   fallback link and for resolving URLs; `minRatio` is read by
 *   getContentElement. The object passed in is never modified.
 * @returns {string} The cleaned content markup, or the fallback link.
 */
function parse(html, options) {
  // Work on a shallow copy: the retry flags set below must not leak into the
  // caller's object, otherwise a reused options object would run every later
  // call with `includeClasses` already enabled.
  const settings = Object.assign({}, options);
  settings.url = settings.url || "";

  let { allElements, allParagraphs } = parseJSON(html, settings);

  // If there are no paragraphs from the search, try again without filtering classes.
  if (!allParagraphs.length) {
    settings.secondTry = true;
    settings.includeClasses = true;
    const retry = parseJSON(html, settings);
    allParagraphs = retry.allParagraphs;
    allElements = retry.allElements;
  }

  // Still nothing after the second attempt: give up and link to the source.
  if (!allParagraphs.length) {
    return nothing(settings);
  }

  const contentElement = getContentElement(allElements, allParagraphs, settings);

  // A missing element cannot be serialized; fall back to the link as well.
  if (!contentElement) {
    return nothing(settings);
  }

  return stringify([contentElement]);
}
/**
 * Turns raw HTML into the structures the content search works on.
 *
 * @param {string} html - Raw HTML of the page.
 * @param {object} options - Settings forwarded to the cleaning filters;
 *   `options.url` (empty string when unset) is used as the base URL.
 * @returns {{allParagraphs: *, clearedJSON: *, allElements: *}} The result of
 *   querying "p" on the cleaned tree, the cleaned tree itself, and the value
 *   returned by `modifiers.addIds`.
 */
function parseJSON(html, options) {
  const baseUrl = options.url || "";

  // Parse, then strip unwanted elements from the tree.
  const tree = filters.clean(himalaya.parse(html), options);

  // Give every element an id.
  const allElements = modifiers.addIds(tree);

  // Resolve relative URLs against the base URL.
  urlParser.addBaseUrl(allElements, baseUrl);

  const findAll = query_1.default;
  const allParagraphs = findAll("p", tree);

  return { allParagraphs, clearedJSON: tree, allElements };
}
/**
 * Picks the element that most likely wraps the article content.
 *
 * Starts at the element that is the direct parent of the most paragraphs and
 * climbs towards its ancestors until the candidate contains at least
 * `minRatio` of all paragraphs, the climb limit is reached, or there is no
 * further parent to climb to.
 *
 * @param {object} allElements - Elements indexed by id.
 * @param {Array} allParagraphs - Every paragraph found in the document.
 * @param {object} options - `options.minRatio` overrides MIN_DEFAULT_RATIO.
 * @returns {object|undefined} The chosen element; `undefined` only when the
 *   starting element itself cannot be found in `allElements`.
 */
function getContentElement(allElements, allParagraphs, options) {
  // Upper bound on how many candidates are inspected; prevents endless climbing.
  const MAX_STEPS = 4;
  const minRatio = options.minRatio || MIN_DEFAULT_RATIO;
  const totalParagraphs = allParagraphs.length;
  const query = query_1.default;

  // The element with the most paragraphs as direct children is the starting point.
  const parents = countParents(allParagraphs);
  let contentParent = allElements[getMaxId(parents)];

  for (let step = 0; contentParent && step < MAX_STEPS; step++) {
    const ratio = query("p", contentParent).length / totalParagraphs;

    // Good enough: the candidate holds the required share of paragraphs.
    // Written as a negation so a NaN ratio also stops the climb.
    if (!(ratio < minRatio)) {
      break;
    }

    // Elements without a parent id fall back to id 0.
    const parentId = contentParent.parentId != null ? contentParent.parentId : 0;
    const parent = allElements[parentId];

    // No further ancestor is known (or the element is its own parent): keep
    // the best candidate found so far instead of returning undefined.
    if (!parent || parent === contentParent) {
      break;
    }

    contentParent = parent;
  }

  return contentParent;
}
/**
 * Escapes the characters that are significant in HTML text and in
 * double- or single-quoted attribute values.
 *
 * @param {*} value - Value to escape; converted to a string first.
 * @returns {string} The escaped string.
 */
function escapeHtml(value) {
  return String(value)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}

/**
 * Builds the fallback markup used when no content could be extracted: a
 * single paragraph linking to the original page.
 *
 * @param {object} o - Options object; only `o.url` is read.
 * @returns {string} A paragraph containing a link to `o.url`.
 */
function nothing(o) {
  // The URL comes from the caller, so it is escaped before being placed into
  // the attribute and the link text; otherwise a quote or angle bracket in
  // the URL could break out of the markup.
  const safeUrl = escapeHtml(o.url);
  return `<p><a href="${safeUrl}" target="_blank">${safeUrl}</a></p>`;
}
/**
 * Tallies how many paragraphs share each parent.
 *
 * @param {Array<object>} allParagraphs - Paragraph elements; only `parentId` is read.
 * @returns {object} Map from parent id to paragraph count. Paragraphs whose
 *   `parentId` is falsy are tallied under the empty-string key.
 */
function countParents(allParagraphs) {
  return allParagraphs.reduce((tally, paragraph) => {
    const key = paragraph.parentId || "";
    tally[key] = (tally[key] || 0) + 1;
    return tally;
  }, {});
}
/**
 * Finds the id with the highest count.
 *
 * @param {object} obj - Map from id (as an object key) to a numeric count.
 * @returns {number} The numeric id with the largest count, or -1 when `obj`
 *   has no usable entries. On ties the key iterated first wins.
 */
function getMaxId(obj) {
  let max = -1;
  let maxId = -1;
  for (const key of Object.keys(obj)) {
    const numericId = Number(key);
    // Keys that are not numbers cannot be used as element ids.
    if (Number.isNaN(numericId)) {
      continue;
    }
    // Look the count up by the original key. The empty-string key converts
    // to 0, and indexing with the converted number would read `obj["0"]`
    // instead, silently dropping that entry.
    const value = obj[key];
    if (value > max) {
      max = value;
      maxId = numericId;
    }
  }
  return maxId;
}
/**
 * Serializes a node tree to HTML, drops attribute-less wrapper tags and
 * spaces out links.
 *
 * @param {Array} json - Node list handed to `himalaya.stringify`.
 * @returns {string} The resulting HTML string.
 */
function stringify(json) {
  // Bare opening tags first, then bare closing tags; applied one after
  // another in exactly this order. Tags that carry attributes do not match.
  const wrapperTags = [
    /<html>/g,
    /<body>/g,
    /<div>/g,
    /<span>/g,
    /<\/html>/g,
    /<\/body>/g,
    /<\/div>/g,
    /<\/span>/g,
  ];

  let markup = himalaya.stringify(json);
  for (const tag of wrapperTags) {
    markup = markup.replace(tag, "");
  }

  return addSomeSpaces(markup);
}
function addSomeSpaces(str) {
// this will add a space before each anchor tag, except for those
// preceded by: ( " [ { - – — _ ~ @
// the reason to do this is to prevent space collapsing before links
return str.replace(/([^("[{\-–—_~@])<a/gi, "$1 <a");
}
// The module's default export is the `parse` function.
exports.default = parse;