@mozilla/readability
Version:
A standalone version of the readability library used for Firefox Reader View.
1,529 lines (1,376 loc) • 90 kB
JavaScript
/*
* Copyright (c) 2010 Arc90 Inc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This code is heavily based on Arc90's readability.js (1.7.1) script
* available at: http://code.google.com/p/arc90labs-readability
*/
/**
* Public constructor.
* @param {HTMLDocument} doc The document to parse.
* @param {Object} options The options object.
*/
function Readability(doc, options) {
// In some older versions, people passed a URI as the first argument. Cope:
if (options && options.documentElement) {
doc = options;
options = arguments[2];
} else if (!doc || !doc.documentElement) {
throw new Error(
"First argument to Readability constructor should be a document object."
);
}
options = options || {};
this._doc = doc;
this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__;
this._articleTitle = null;
this._articleByline = null;
this._articleDir = null;
this._articleSiteName = null;
this._attempts = [];
this._metadata = {};
// Configurable options
this._debug = !!options.debug;
this._maxElemsToParse =
options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
this._nbTopCandidates =
options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(
options.classesToPreserve || []
);
this._keepClasses = !!options.keepClasses;
this._serializer =
options.serializer ||
function (el) {
return el.innerHTML;
};
this._disableJSONLD = !!options.disableJSONLD;
this._allowedVideoRegex = options.allowedVideoRegex || this.REGEXPS.videos;
this._linkDensityModifier = options.linkDensityModifier || 0;
// Start with all flags set
this._flags =
this.FLAG_STRIP_UNLIKELYS |
this.FLAG_WEIGHT_CLASSES |
this.FLAG_CLEAN_CONDITIONALLY;
// Control whether log messages are sent to the console
if (this._debug) {
let logNode = function (node) {
if (node.nodeType == node.TEXT_NODE) {
return `${node.nodeName} ("${node.textContent}")`;
}
let attrPairs = Array.from(node.attributes || [], function (attr) {
return `${attr.name}="${attr.value}"`;
}).join(" ");
return `<${node.localName} ${attrPairs}>`;
};
this.log = function () {
if (typeof console !== "undefined") {
let args = Array.from(arguments, arg => {
if (arg && arg.nodeType == this.ELEMENT_NODE) {
return logNode(arg);
}
return arg;
});
args.unshift("Reader: (Readability)");
// eslint-disable-next-line no-console
console.log(...args);
} else if (typeof dump !== "undefined") {
/* global dump */
var msg = Array.prototype.map
.call(arguments, function (x) {
return x && x.nodeName ? logNode(x) : x;
})
.join(" ");
dump("Reader: (Readability) " + msg + "\n");
}
};
} else {
this.log = function () {};
}
}
Readability.prototype = {
FLAG_STRIP_UNLIKELYS: 0x1,
FLAG_WEIGHT_CLASSES: 0x2,
FLAG_CLEAN_CONDITIONALLY: 0x4,
// https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
ELEMENT_NODE: 1,
TEXT_NODE: 3,
// Max number of nodes supported by this parser. Default: 0 (no limit)
DEFAULT_MAX_ELEMS_TO_PARSE: 0,
// The number of top candidates to consider when analysing how
// tight the competition is among candidates.
DEFAULT_N_TOP_CANDIDATES: 5,
// Element tags to score by default.
DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre"
.toUpperCase()
.split(","),
// The default number of chars an article must have in order to return a result
DEFAULT_CHAR_THRESHOLD: 500,
// All of the regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops.
REGEXPS: {
// NOTE: These two regular expressions are duplicated in
// Readability-readerable.js. Please keep both copies in sync.
unlikelyCandidates:
/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,
positive:
/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
negative:
/-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|footer|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|widget/i,
extraneous:
/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
byline: /byline|author|dateline|writtenby|p-author/i,
replaceFonts: /<(\/?)font[^>]*>/gi,
normalize: /\s{2,}/g,
videos:
/\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,
shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i,
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
prevLink: /(prev|earl|old|new|<|«)/i,
tokenize: /\W+/g,
whitespace: /^\s*$/,
hasContent: /\S$/,
hashUrl: /^#.+/,
srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g,
b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i,
// Commas as used in Latin, Sindhi, Chinese and various other scripts.
// see: https://en.wikipedia.org/wiki/Comma#Comma_variants
commas: /\u002C|\u060C|\uFE50|\uFE10|\uFE11|\u2E41|\u2E34|\u2E32|\uFF0C/g,
// See: https://schema.org/Article
jsonLdArticleTypes:
/^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/,
// used to see if a node's content matches words commonly used for ad blocks or loading indicators
adWords:
/^(ad(vertising|vertisement)?|pub(licité)?|werb(ung)?|广告|Реклама|Anuncio)$/iu,
loadingWords:
/^((loading|正在加载|Загрузка|chargement|cargando)(…|\.\.\.)?)$/iu,
},
UNLIKELY_ROLES: [
"menu",
"menubar",
"complementary",
"navigation",
"alert",
"alertdialog",
"dialog",
],
DIV_TO_P_ELEMS: new Set([
"BLOCKQUOTE",
"DL",
"DIV",
"IMG",
"OL",
"P",
"PRE",
"TABLE",
"UL",
]),
ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P", "OL", "UL"],
PRESENTATIONAL_ATTRIBUTES: [
"align",
"background",
"bgcolor",
"border",
"cellpadding",
"cellspacing",
"frame",
"hspace",
"rules",
"style",
"valign",
"vspace",
],
DEPRECATED_SIZE_ATTRIBUTE_ELEMS: ["TABLE", "TH", "TD", "HR", "PRE"],
// The commented out elements qualify as phrasing content but tend to be
// removed by readability when put into paragraphs, so we ignore them here.
PHRASING_ELEMS: [
// "CANVAS", "IFRAME", "SVG", "VIDEO",
"ABBR",
"AUDIO",
"B",
"BDO",
"BR",
"BUTTON",
"CITE",
"CODE",
"DATA",
"DATALIST",
"DFN",
"EM",
"EMBED",
"I",
"IMG",
"INPUT",
"KBD",
"LABEL",
"MARK",
"MATH",
"METER",
"NOSCRIPT",
"OBJECT",
"OUTPUT",
"PROGRESS",
"Q",
"RUBY",
"SAMP",
"SCRIPT",
"SELECT",
"SMALL",
"SPAN",
"STRONG",
"SUB",
"SUP",
"TEXTAREA",
"TIME",
"VAR",
"WBR",
],
// These are the classes that readability sets itself.
CLASSES_TO_PRESERVE: ["page"],
// These are the list of HTML entities that need to be escaped.
HTML_ESCAPE_MAP: {
lt: "<",
gt: ">",
amp: "&",
quot: '"',
apos: "'",
},
/**
* Run any post-process modifications to article content as necessary.
*
* @param Element
* @return void
**/
_postProcessContent(articleContent) {
// Readability cannot open relative uris so we convert them to absolute uris.
this._fixRelativeUris(articleContent);
this._simplifyNestedElements(articleContent);
if (!this._keepClasses) {
// Remove classes.
this._cleanClasses(articleContent);
}
},
/**
* Iterates over a NodeList, calls `filterFn` for each node and removes node
* if function returned `true`.
*
* If function is not passed, removes all the nodes in node list.
*
* @param NodeList nodeList The nodes to operate on
* @param Function filterFn the function to use as a filter
* @return void
*/
_removeNodes(nodeList, filterFn) {
// Avoid ever operating on live node lists.
if (this._docJSDOMParser && nodeList._isLiveNodeList) {
throw new Error("Do not pass live node lists to _removeNodes");
}
for (var i = nodeList.length - 1; i >= 0; i--) {
var node = nodeList[i];
var parentNode = node.parentNode;
if (parentNode) {
if (!filterFn || filterFn.call(this, node, i, nodeList)) {
parentNode.removeChild(node);
}
}
}
},
/**
* Iterates over a NodeList, and calls _setNodeTag for each node.
*
* @param NodeList nodeList The nodes to operate on
* @param String newTagName the new tag name to use
* @return void
*/
_replaceNodeTags(nodeList, newTagName) {
// Avoid ever operating on live node lists.
if (this._docJSDOMParser && nodeList._isLiveNodeList) {
throw new Error("Do not pass live node lists to _replaceNodeTags");
}
for (const node of nodeList) {
this._setNodeTag(node, newTagName);
}
},
/**
* Iterate over a NodeList, which doesn't natively fully implement the Array
* interface.
*
* For convenience, the current object context is applied to the provided
* iterate function.
*
* @param NodeList nodeList The NodeList.
* @param Function fn The iterate function.
* @return void
*/
_forEachNode(nodeList, fn) {
Array.prototype.forEach.call(nodeList, fn, this);
},
/**
* Iterate over a NodeList, and return the first node that passes
* the supplied test function
*
* For convenience, the current object context is applied to the provided
* test function.
*
* @param NodeList nodeList The NodeList.
* @param Function fn The test function.
* @return void
*/
_findNode(nodeList, fn) {
return Array.prototype.find.call(nodeList, fn, this);
},
/**
* Iterate over a NodeList, return true if any of the provided iterate
* function calls returns true, false otherwise.
*
* For convenience, the current object context is applied to the
* provided iterate function.
*
* @param NodeList nodeList The NodeList.
* @param Function fn The iterate function.
* @return Boolean
*/
_someNode(nodeList, fn) {
return Array.prototype.some.call(nodeList, fn, this);
},
/**
* Iterate over a NodeList, return true if all of the provided iterate
* function calls return true, false otherwise.
*
* For convenience, the current object context is applied to the
* provided iterate function.
*
* @param NodeList nodeList The NodeList.
* @param Function fn The iterate function.
* @return Boolean
*/
_everyNode(nodeList, fn) {
return Array.prototype.every.call(nodeList, fn, this);
},
_getAllNodesWithTag(node, tagNames) {
if (node.querySelectorAll) {
return node.querySelectorAll(tagNames.join(","));
}
return [].concat.apply(
[],
tagNames.map(function (tag) {
var collection = node.getElementsByTagName(tag);
return Array.isArray(collection) ? collection : Array.from(collection);
})
);
},
/**
* Removes the class="" attribute from every element in the given
* subtree, except those that match CLASSES_TO_PRESERVE and
* the classesToPreserve array from the options object.
*
* @param Element
* @return void
*/
_cleanClasses(node) {
var classesToPreserve = this._classesToPreserve;
var className = (node.getAttribute("class") || "")
.split(/\s+/)
.filter(cls => classesToPreserve.includes(cls))
.join(" ");
if (className) {
node.setAttribute("class", className);
} else {
node.removeAttribute("class");
}
for (node = node.firstElementChild; node; node = node.nextElementSibling) {
this._cleanClasses(node);
}
},
/**
* Tests whether a string is a URL or not.
*
* @param {string} str The string to test
* @return {boolean} true if str is a URL, false if not
*/
_isUrl(str) {
try {
new URL(str);
return true;
} catch {
return false;
}
},
/**
* Converts each <a> and <img> uri in the given element to an absolute URI,
* ignoring #ref URIs.
*
* @param Element
* @return void
*/
_fixRelativeUris(articleContent) {
var baseURI = this._doc.baseURI;
var documentURI = this._doc.documentURI;
function toAbsoluteURI(uri) {
// Leave hash links alone if the base URI matches the document URI:
if (baseURI == documentURI && uri.charAt(0) == "#") {
return uri;
}
// Otherwise, resolve against base URI:
try {
return new URL(uri, baseURI).href;
} catch (ex) {
// Something went wrong, just return the original:
}
return uri;
}
var links = this._getAllNodesWithTag(articleContent, ["a"]);
this._forEachNode(links, function (link) {
var href = link.getAttribute("href");
if (href) {
// Remove links with javascript: URIs, since
// they won't work after scripts have been removed from the page.
if (href.indexOf("javascript:") === 0) {
// if the link only contains simple text content, it can be converted to a text node
if (
link.childNodes.length === 1 &&
link.childNodes[0].nodeType === this.TEXT_NODE
) {
var text = this._doc.createTextNode(link.textContent);
link.parentNode.replaceChild(text, link);
} else {
// if the link has multiple children, they should all be preserved
var container = this._doc.createElement("span");
while (link.firstChild) {
container.appendChild(link.firstChild);
}
link.parentNode.replaceChild(container, link);
}
} else {
link.setAttribute("href", toAbsoluteURI(href));
}
}
});
var medias = this._getAllNodesWithTag(articleContent, [
"img",
"picture",
"figure",
"video",
"audio",
"source",
]);
this._forEachNode(medias, function (media) {
var src = media.getAttribute("src");
var poster = media.getAttribute("poster");
var srcset = media.getAttribute("srcset");
if (src) {
media.setAttribute("src", toAbsoluteURI(src));
}
if (poster) {
media.setAttribute("poster", toAbsoluteURI(poster));
}
if (srcset) {
var newSrcset = srcset.replace(
this.REGEXPS.srcsetUrl,
function (_, p1, p2, p3) {
return toAbsoluteURI(p1) + (p2 || "") + p3;
}
);
media.setAttribute("srcset", newSrcset);
}
});
},
_simplifyNestedElements(articleContent) {
var node = articleContent;
while (node) {
if (
node.parentNode &&
["DIV", "SECTION"].includes(node.tagName) &&
!(node.id && node.id.startsWith("readability"))
) {
if (this._isElementWithoutContent(node)) {
node = this._removeAndGetNext(node);
continue;
} else if (
this._hasSingleTagInsideElement(node, "DIV") ||
this._hasSingleTagInsideElement(node, "SECTION")
) {
var child = node.children[0];
for (var i = 0; i < node.attributes.length; i++) {
child.setAttributeNode(node.attributes[i].cloneNode());
}
node.parentNode.replaceChild(child, node);
node = child;
continue;
}
}
node = this._getNextNode(node);
}
},
/**
* Get the article title as an H1.
*
* @return string
**/
_getArticleTitle() {
var doc = this._doc;
var curTitle = "";
var origTitle = "";
try {
curTitle = origTitle = doc.title.trim();
// If they had an element with id "title" in their HTML
if (typeof curTitle !== "string") {
curTitle = origTitle = this._getInnerText(
doc.getElementsByTagName("title")[0]
);
}
} catch (e) {
/* ignore exceptions setting the title. */
}
var titleHadHierarchicalSeparators = false;
function wordCount(str) {
return str.split(/\s+/).length;
}
// If there's a separator in the title, first remove the final part
if (/ [\|\-\\\/>»] /.test(curTitle)) {
titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle);
let allSeparators = Array.from(origTitle.matchAll(/ [\|\-\\\/>»] /gi));
curTitle = origTitle.substring(0, allSeparators.pop().index);
// If the resulting title is too short, remove the first part instead:
if (wordCount(curTitle) < 3) {
curTitle = origTitle.replace(/^[^\|\-\\\/>»]*[\|\-\\\/>»]/gi, "");
}
} else if (curTitle.includes(": ")) {
// Check if we have an heading containing this exact string, so we
// could assume it's the full title.
var headings = this._getAllNodesWithTag(doc, ["h1", "h2"]);
var trimmedTitle = curTitle.trim();
var match = this._someNode(headings, function (heading) {
return heading.textContent.trim() === trimmedTitle;
});
// If we don't, let's extract the title out of the original title string.
if (!match) {
curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1);
// If the title is now too short, try the first colon instead:
if (wordCount(curTitle) < 3) {
curTitle = origTitle.substring(origTitle.indexOf(":") + 1);
// But if we have too many words before the colon there's something weird
// with the titles and the H tags so let's just use the original title instead
} else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) {
curTitle = origTitle;
}
}
} else if (curTitle.length > 150 || curTitle.length < 15) {
var hOnes = doc.getElementsByTagName("h1");
if (hOnes.length === 1) {
curTitle = this._getInnerText(hOnes[0]);
}
}
curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " ");
// If we now have 4 words or fewer as our title, and either no
// 'hierarchical' separators (\, /, > or ») were found in the original
// title or we decreased the number of words by more than 1 word, use
// the original title.
var curTitleWordCount = wordCount(curTitle);
if (
curTitleWordCount <= 4 &&
(!titleHadHierarchicalSeparators ||
curTitleWordCount !=
wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)
) {
curTitle = origTitle;
}
return curTitle;
},
/**
* Prepare the HTML document for readability to scrape it.
* This includes things like stripping javascript, CSS, and handling terrible markup.
*
* @return void
**/
_prepDocument() {
var doc = this._doc;
// Remove all style tags in head
this._removeNodes(this._getAllNodesWithTag(doc, ["style"]));
if (doc.body) {
this._replaceBrs(doc.body);
}
this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN");
},
/**
* Finds the next node, starting from the given node, and ignoring
* whitespace in between. If the given node is an element, the same node is
* returned.
*/
_nextNode(node) {
var next = node;
while (
next &&
next.nodeType != this.ELEMENT_NODE &&
this.REGEXPS.whitespace.test(next.textContent)
) {
next = next.nextSibling;
}
return next;
},
/**
* Replaces 2 or more successive <br> elements with a single <p>.
* Whitespace between <br> elements are ignored. For example:
* <div>foo<br>bar<br> <br><br>abc</div>
* will become:
* <div>foo<br>bar<p>abc</p></div>
*/
_replaceBrs(elem) {
this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function (br) {
var next = br.nextSibling;
// Whether 2 or more <br> elements have been found and replaced with a
// <p> block.
var replaced = false;
// If we find a <br> chain, remove the <br>s until we hit another node
// or non-whitespace. This leaves behind the first <br> in the chain
// (which will be replaced with a <p> later).
while ((next = this._nextNode(next)) && next.tagName == "BR") {
replaced = true;
var brSibling = next.nextSibling;
next.remove();
next = brSibling;
}
// If we removed a <br> chain, replace the remaining <br> with a <p>. Add
// all sibling nodes as children of the <p> until we hit another <br>
// chain.
if (replaced) {
var p = this._doc.createElement("p");
br.parentNode.replaceChild(p, br);
next = p.nextSibling;
while (next) {
// If we've hit another <br><br>, we're done adding children to this <p>.
if (next.tagName == "BR") {
var nextElem = this._nextNode(next.nextSibling);
if (nextElem && nextElem.tagName == "BR") {
break;
}
}
if (!this._isPhrasingContent(next)) {
break;
}
// Otherwise, make this node a child of the new <p>.
var sibling = next.nextSibling;
p.appendChild(next);
next = sibling;
}
while (p.lastChild && this._isWhitespace(p.lastChild)) {
p.lastChild.remove();
}
if (p.parentNode.tagName === "P") {
this._setNodeTag(p.parentNode, "DIV");
}
}
});
},
_setNodeTag(node, tag) {
this.log("_setNodeTag", node, tag);
if (this._docJSDOMParser) {
node.localName = tag.toLowerCase();
node.tagName = tag.toUpperCase();
return node;
}
var replacement = node.ownerDocument.createElement(tag);
while (node.firstChild) {
replacement.appendChild(node.firstChild);
}
node.parentNode.replaceChild(replacement, node);
if (node.readability) {
replacement.readability = node.readability;
}
for (var i = 0; i < node.attributes.length; i++) {
replacement.setAttributeNode(node.attributes[i].cloneNode());
}
return replacement;
},
/**
* Prepare the article node for display. Clean out any inline styles,
* iframes, forms, strip extraneous <p> tags, etc.
*
* @param Element
* @return void
**/
_prepArticle(articleContent) {
this._cleanStyles(articleContent);
// Check for data tables before we continue, to avoid removing items in
// those tables, which will often be isolated even though they're
// visually linked to other content-ful elements (text, images, etc.).
this._markDataTables(articleContent);
this._fixLazyImages(articleContent);
// Clean out junk from the article content
this._cleanConditionally(articleContent, "form");
this._cleanConditionally(articleContent, "fieldset");
this._clean(articleContent, "object");
this._clean(articleContent, "embed");
this._clean(articleContent, "footer");
this._clean(articleContent, "link");
this._clean(articleContent, "aside");
// Clean out elements with little content that have "share" in their id/class combinations from final top candidates,
// which means we don't remove the top candidates even they have "share".
var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD;
this._forEachNode(articleContent.children, function (topCandidate) {
this._cleanMatchedNodes(topCandidate, function (node, matchString) {
return (
this.REGEXPS.shareElements.test(matchString) &&
node.textContent.length < shareElementThreshold
);
});
});
this._clean(articleContent, "iframe");
this._clean(articleContent, "input");
this._clean(articleContent, "textarea");
this._clean(articleContent, "select");
this._clean(articleContent, "button");
this._cleanHeaders(articleContent);
// Do these last as the previous stuff may have removed junk
// that will affect these
this._cleanConditionally(articleContent, "table");
this._cleanConditionally(articleContent, "ul");
this._cleanConditionally(articleContent, "div");
// replace H1 with H2 as H1 should be only title that is displayed separately
this._replaceNodeTags(
this._getAllNodesWithTag(articleContent, ["h1"]),
"h2"
);
// Remove extra paragraphs
this._removeNodes(
this._getAllNodesWithTag(articleContent, ["p"]),
function (paragraph) {
// At this point, nasty iframes have been removed; only embedded video
// ones remain.
var contentElementCount = this._getAllNodesWithTag(paragraph, [
"img",
"embed",
"object",
"iframe",
]).length;
return (
contentElementCount === 0 && !this._getInnerText(paragraph, false)
);
}
);
this._forEachNode(
this._getAllNodesWithTag(articleContent, ["br"]),
function (br) {
var next = this._nextNode(br.nextSibling);
if (next && next.tagName == "P") {
br.remove();
}
}
);
// Remove single-cell tables
this._forEachNode(
this._getAllNodesWithTag(articleContent, ["table"]),
function (table) {
var tbody = this._hasSingleTagInsideElement(table, "TBODY")
? table.firstElementChild
: table;
if (this._hasSingleTagInsideElement(tbody, "TR")) {
var row = tbody.firstElementChild;
if (this._hasSingleTagInsideElement(row, "TD")) {
var cell = row.firstElementChild;
cell = this._setNodeTag(
cell,
this._everyNode(cell.childNodes, this._isPhrasingContent)
? "P"
: "DIV"
);
table.parentNode.replaceChild(cell, table);
}
}
}
);
},
/**
* Initialize a node with the readability object. Also checks the
* className/id for special names to add to its score.
*
* @param Element
* @return void
**/
_initializeNode(node) {
node.readability = { contentScore: 0 };
switch (node.tagName) {
case "DIV":
node.readability.contentScore += 5;
break;
case "PRE":
case "TD":
case "BLOCKQUOTE":
node.readability.contentScore += 3;
break;
case "ADDRESS":
case "OL":
case "UL":
case "DL":
case "DD":
case "DT":
case "LI":
case "FORM":
node.readability.contentScore -= 3;
break;
case "H1":
case "H2":
case "H3":
case "H4":
case "H5":
case "H6":
case "TH":
node.readability.contentScore -= 5;
break;
}
node.readability.contentScore += this._getClassWeight(node);
},
_removeAndGetNext(node) {
var nextNode = this._getNextNode(node, true);
node.remove();
return nextNode;
},
/**
* Traverse the DOM from node to node, starting at the node passed in.
* Pass true for the second parameter to indicate this node itself
* (and its kids) are going away, and we want the next node over.
*
* Calling this in a loop will traverse the DOM depth-first.
*
* @param {Element} node
* @param {boolean} ignoreSelfAndKids
* @return {Element}
*/
_getNextNode(node, ignoreSelfAndKids) {
// First check for kids if those aren't being ignored
if (!ignoreSelfAndKids && node.firstElementChild) {
return node.firstElementChild;
}
// Then for siblings...
if (node.nextElementSibling) {
return node.nextElementSibling;
}
// And finally, move up the parent chain *and* find a sibling
// (because this is depth-first traversal, we will have already
// seen the parent nodes themselves).
do {
node = node.parentNode;
} while (node && !node.nextElementSibling);
return node && node.nextElementSibling;
},
// compares second text to first one
// 1 = same text, 0 = completely different text
// works the way that it splits both texts into words and then finds words that are unique in second text
// the result is given by the lower length of unique parts
_textSimilarity(textA, textB) {
var tokensA = textA
.toLowerCase()
.split(this.REGEXPS.tokenize)
.filter(Boolean);
var tokensB = textB
.toLowerCase()
.split(this.REGEXPS.tokenize)
.filter(Boolean);
if (!tokensA.length || !tokensB.length) {
return 0;
}
var uniqTokensB = tokensB.filter(token => !tokensA.includes(token));
var distanceB = uniqTokensB.join(" ").length / tokensB.join(" ").length;
return 1 - distanceB;
},
/**
* Checks whether an element node contains a valid byline
*
* @param node {Element}
* @param matchString {string}
* @return boolean
*/
_isValidByline(node, matchString) {
var rel = node.getAttribute("rel");
var itemprop = node.getAttribute("itemprop");
var bylineLength = node.textContent.trim().length;
return (
(rel === "author" ||
(itemprop && itemprop.includes("author")) ||
this.REGEXPS.byline.test(matchString)) &&
!!bylineLength &&
bylineLength < 100
);
},
_getNodeAncestors(node, maxDepth) {
maxDepth = maxDepth || 0;
var i = 0,
ancestors = [];
while (node.parentNode) {
ancestors.push(node.parentNode);
if (maxDepth && ++i === maxDepth) {
break;
}
node = node.parentNode;
}
return ancestors;
},
/***
* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
*
* @param page a document to run upon. Needs to be a full document, complete with body.
* @return Element
**/
/* eslint-disable-next-line complexity */
_grabArticle(page) {
this.log("**** grabArticle ****");
var doc = this._doc;
var isPaging = page !== null;
page = page ? page : this._doc.body;
// We can't grab an article if we don't have a page!
if (!page) {
this.log("No body found in document. Abort.");
return null;
}
var pageCacheHtml = page.innerHTML;
while (true) {
this.log("Starting grabArticle loop");
var stripUnlikelyCandidates = this._flagIsActive(
this.FLAG_STRIP_UNLIKELYS
);
// First, node prepping. Trash nodes that look cruddy (like ones with the
// class name "comment", etc), and turn divs into P tags where they have been
// used inappropriately (as in, where they contain no other block level elements.)
var elementsToScore = [];
var node = this._doc.documentElement;
let shouldRemoveTitleHeader = true;
while (node) {
if (node.tagName === "HTML") {
this._articleLang = node.getAttribute("lang");
}
var matchString = node.className + " " + node.id;
if (!this._isProbablyVisible(node)) {
this.log("Removing hidden node - " + matchString);
node = this._removeAndGetNext(node);
continue;
}
// User is not able to see elements applied with both "aria-modal = true" and "role = dialog"
if (
node.getAttribute("aria-modal") == "true" &&
node.getAttribute("role") == "dialog"
) {
node = this._removeAndGetNext(node);
continue;
}
// If we don't have a byline yet check to see if this node is a byline; if it is store the byline and remove the node.
if (
!this._articleByline &&
!this._metadata.byline &&
this._isValidByline(node, matchString)
) {
// Find child node matching [itemprop="name"] and use that if it exists for a more accurate author name byline
var endOfSearchMarkerNode = this._getNextNode(node, true);
var next = this._getNextNode(node);
var itemPropNameNode = null;
while (next && next != endOfSearchMarkerNode) {
var itemprop = next.getAttribute("itemprop");
if (itemprop && itemprop.includes("name")) {
itemPropNameNode = next;
break;
} else {
next = this._getNextNode(next);
}
}
this._articleByline = (itemPropNameNode ?? node).textContent.trim();
node = this._removeAndGetNext(node);
continue;
}
if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) {
this.log(
"Removing header: ",
node.textContent.trim(),
this._articleTitle.trim()
);
shouldRemoveTitleHeader = false;
node = this._removeAndGetNext(node);
continue;
}
// Remove unlikely candidates
if (stripUnlikelyCandidates) {
if (
this.REGEXPS.unlikelyCandidates.test(matchString) &&
!this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
!this._hasAncestorTag(node, "table") &&
!this._hasAncestorTag(node, "code") &&
node.tagName !== "BODY" &&
node.tagName !== "A"
) {
this.log("Removing unlikely candidate - " + matchString);
node = this._removeAndGetNext(node);
continue;
}
if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) {
this.log(
"Removing content with role " +
node.getAttribute("role") +
" - " +
matchString
);
node = this._removeAndGetNext(node);
continue;
}
}
// Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
if (
(node.tagName === "DIV" ||
node.tagName === "SECTION" ||
node.tagName === "HEADER" ||
node.tagName === "H1" ||
node.tagName === "H2" ||
node.tagName === "H3" ||
node.tagName === "H4" ||
node.tagName === "H5" ||
node.tagName === "H6") &&
this._isElementWithoutContent(node)
) {
node = this._removeAndGetNext(node);
continue;
}
if (this.DEFAULT_TAGS_TO_SCORE.includes(node.tagName)) {
elementsToScore.push(node);
}
// Turn all divs that don't have children block level elements into p's
if (node.tagName === "DIV") {
// Put phrasing content into paragraphs.
var p = null;
var childNode = node.firstChild;
while (childNode) {
var nextSibling = childNode.nextSibling;
if (this._isPhrasingContent(childNode)) {
if (p !== null) {
p.appendChild(childNode);
} else if (!this._isWhitespace(childNode)) {
p = doc.createElement("p");
node.replaceChild(p, childNode);
p.appendChild(childNode);
}
} else if (p !== null) {
while (p.lastChild && this._isWhitespace(p.lastChild)) {
p.lastChild.remove();
}
p = null;
}
childNode = nextSibling;
}
// Sites like http://mobile.slate.com encloses each paragraph with a DIV
// element. DIVs with only a P element inside and no text content can be
// safely converted into plain P elements to avoid confusing the scoring
// algorithm with DIVs with are, in practice, paragraphs.
if (
this._hasSingleTagInsideElement(node, "P") &&
this._getLinkDensity(node) < 0.25
) {
var newNode = node.children[0];
node.parentNode.replaceChild(newNode, node);
node = newNode;
elementsToScore.push(node);
} else if (!this._hasChildBlockElement(node)) {
node = this._setNodeTag(node, "P");
elementsToScore.push(node);
}
}
node = this._getNextNode(node);
}
/**
* Loop through all paragraphs, and assign a score to them based on how content-y they look.
* Then add their score to their parent node.
*
* A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
**/
var candidates = [];
this._forEachNode(elementsToScore, function (elementToScore) {
if (
!elementToScore.parentNode ||
typeof elementToScore.parentNode.tagName === "undefined"
) {
return;
}
// If this paragraph is less than 25 characters, don't even count it.
var innerText = this._getInnerText(elementToScore);
if (innerText.length < 25) {
return;
}
// Exclude nodes with no ancestor.
var ancestors = this._getNodeAncestors(elementToScore, 5);
if (ancestors.length === 0) {
return;
}
var contentScore = 0;
// Add a point for the paragraph itself as a base.
contentScore += 1;
// Add points for any commas within this paragraph.
contentScore += innerText.split(this.REGEXPS.commas).length;
// For every 100 characters in this paragraph, add another point. Up to 3 points.
contentScore += Math.min(Math.floor(innerText.length / 100), 3);
// Initialize and score ancestors.
this._forEachNode(ancestors, function (ancestor, level) {
if (
!ancestor.tagName ||
!ancestor.parentNode ||
typeof ancestor.parentNode.tagName === "undefined"
) {
return;
}
if (typeof ancestor.readability === "undefined") {
this._initializeNode(ancestor);
candidates.push(ancestor);
}
// Node score divider:
// - parent: 1 (no division)
// - grandparent: 2
// - great grandparent+: ancestor level * 3
if (level === 0) {
var scoreDivider = 1;
} else if (level === 1) {
scoreDivider = 2;
} else {
scoreDivider = level * 3;
}
ancestor.readability.contentScore += contentScore / scoreDivider;
});
});
// After we've calculated scores, loop through all of the possible
// candidate nodes we found and find the one with the highest score.
var topCandidates = [];
for (var c = 0, cl = candidates.length; c < cl; c += 1) {
var candidate = candidates[c];
// Scale the final candidates score based on link density. Good content
// should have a relatively small link density (5% or less) and be mostly
// unaffected by this operation.
var candidateScore =
candidate.readability.contentScore *
(1 - this._getLinkDensity(candidate));
candidate.readability.contentScore = candidateScore;
this.log("Candidate:", candidate, "with score " + candidateScore);
for (var t = 0; t < this._nbTopCandidates; t++) {
var aTopCandidate = topCandidates[t];
if (
!aTopCandidate ||
candidateScore > aTopCandidate.readability.contentScore
) {
topCandidates.splice(t, 0, candidate);
if (topCandidates.length > this._nbTopCandidates) {
topCandidates.pop();
}
break;
}
}
}
var topCandidate = topCandidates[0] || null;
var neededToCreateTopCandidate = false;
var parentOfTopCandidate;
// If we still have no top candidate, just use the body as a last resort.
// We also have to copy the body node so it is something we can modify.
if (topCandidate === null || topCandidate.tagName === "BODY") {
// Move all of the page's children into topCandidate
topCandidate = doc.createElement("DIV");
neededToCreateTopCandidate = true;
// Move everything (not just elements, also text nodes etc.) into the container
// so we even include text directly in the body:
while (page.firstChild) {
this.log("Moving child out:", page.firstChild);
topCandidate.appendChild(page.firstChild);
}
page.appendChild(topCandidate);
this._initializeNode(topCandidate);
} else if (topCandidate) {
// Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
// and whose scores are quite closed with current `topCandidate` node.
var alternativeCandidateAncestors = [];
for (var i = 1; i < topCandidates.length; i++) {
if (
topCandidates[i].readability.contentScore /
topCandidate.readability.contentScore >=
0.75
) {
alternativeCandidateAncestors.push(
this._getNodeAncestors(topCandidates[i])
);
}
}
var MINIMUM_TOPCANDIDATES = 3;
if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) {
parentOfTopCandidate = topCandidate.parentNode;
while (parentOfTopCandidate.tagName !== "BODY") {
var listsContainingThisAncestor = 0;
for (
var ancestorIndex = 0;
ancestorIndex < alternativeCandidateAncestors.length &&
listsContainingThisAncestor < MINIMUM_TOPCANDIDATES;
ancestorIndex++
) {
listsContainingThisAncestor += Number(
alternativeCandidateAncestors[ancestorIndex].includes(
parentOfTopCandidate
)
);
}
if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) {
topCandidate = parentOfTopCandidate;
break;
}
parentOfTopCandidate = parentOfTopCandidate.parentNode;
}
}
if (!topCandidate.readability) {
this._initializeNode(topCandidate);
}
// Because of our bonus system, parents of candidates might have scores
// themselves. They get half of the node. There won't be nodes with higher
// scores than our topCandidate, but if we see the score going *up* in the first
// few steps up the tree, that's a decent sign that there might be more content
// lurking in other places that we want to unify in. The sibling stuff
// below does some of that - but only if we've looked high enough up the DOM
// tree.
parentOfTopCandidate = topCandidate.parentNode;
var lastScore = topCandidate.readability.contentScore;
// The scores shouldn't get too low.
var scoreThreshold = lastScore / 3;
while (parentOfTopCandidate.tagName !== "BODY") {
if (!parentOfTopCandidate.readability) {
parentOfTopCandidate = parentOfTopCandidate.parentNode;
continue;
}
var parentScore = parentOfTopCandidate.readability.contentScore;
if (parentScore < scoreThreshold) {
break;
}
if (parentScore > lastScore) {
// Alright! We found a better parent to use.
topCandidate = parentOfTopCandidate;
break;
}
lastScore = parentOfTopCandidate.readability.contentScore;
parentOfTopCandidate = parentOfTopCandidate.parentNode;
}
// If the top candidate is the only child, use parent instead. This will help sibling
// joining logic when adjacent content is actually located in parent's sibling node.
parentOfTopCandidate = topCandidate.parentNode;
while (
parentOfTopCandidate.tagName != "BODY" &&
parentOfTopCandidate.children.length == 1
) {
topCandidate = parentOfTopCandidate;
parentOfTopCandidate = topCandidate.parentNode;
}
if (!topCandidate.readability) {
this._initializeNode(topCandidate);
}
}
// Now that we have the top candidate, look through its siblings for content
// that might also be related. Things like preambles, content split by ads
// that we removed, etc.
var articleContent = doc.createElement("DIV");
if (isPaging) {
articleContent.id = "readability-content";
}
var siblingScoreThreshold = Math.max(
10,
topCandidate.readability.contentScore * 0.2
);
// Keep potential top candidate's parent node to try to get text direction of it later.
parentOfTopCandidate = topCandidate.parentNode;
var siblings = parentOfTopCandidate.children;
for (var s = 0, sl = siblings.length; s < sl; s++) {
var sibling = siblings[s];
var append = false;
this.log(
"Looking at sibling node:",
sibling,
sibling.readability
? "with score " + sibling.readability.contentScore
: ""
);
this.log(
"Sibling has score",
sibling.readability ? sibling.readability.contentScore : "Unknown"
);
if (sibling === topCandidate) {
append = true;
} else {
var contentBonus = 0;
// Give a bonus if sibling nodes and top candidates have the example same classname
if (
sibling.className === topCandidate.className &&
topCandidate.className !== ""
) {
contentBonus += topCandidate.readability.contentScore * 0.2;
}
if (
sibling.readability &&
sibling.readability.contentScore + contentBonus >=
siblingScoreThreshold
) {
append = true;
} else if (sibling.nodeName === "P") {
var linkDensity = this._getLinkDensity(sibling);
var nodeContent = this._getInnerText(sibling);
var nodeLength = nodeContent.length;
if (nodeLength > 80 && linkDensity < 0.25) {
append = true;
} else if (
nodeLength < 80 &&
nodeLength > 0 &&
linkDensity === 0 &&
nodeContent.search(/\.( |$)/) !== -1
) {
append = true;
}
}
}
if (append) {
this.log("Appending node:", sibling);
if (!this.ALTER_TO_DIV_EXCEPTIONS.includes(sibling.nodeName)) {
// We have a node that isn't a common block level element, like a form or td tag.
// Turn it into a div so it doesn't get filtered out later by accident.
this.log("Altering sibling:", sibling, "to div.");
sibling = this._setNodeTag(sibling, "DIV");
}
articleContent.appendChild(sibling);
// Fetch children again to make it compatible
// with DOM parsers without live collection support.
siblings = parentOfTopCandidate.children;
// siblings is a reference to the children array, and
// sibling is removed from the array when we call appendChild().
// As a result, we must revisit this index since the nodes
// have been shifted.
s -= 1;
sl -= 1;
}
}
if (this._debug) {
this.log("Article content pre-prep: " + articleContent.innerHTML);
}
// So we have all of the content that we need. Now we clean it up for presentation.
this._prepArticle(articleContent);
if (this._debug) {
this.log("Article content post-prep: " + articleContent.innerHTML);
}
if (neededToCreateTopCandidate) {
// We already created a fake div thing, and there wouldn't have been any siblings left
// for the previous loop, so there's no point trying to create a new div, and then
// move all the children over. Just assign IDs and class names here. No need to append
// because that already happened anyway.
topCandidate.id = "readability-page-1";
topCandidate.className = "page";
} else {
var div = doc.createElement("DIV");
div.id = "readability-page-1";
div.className = "page";
while (articleContent.firstChild) {
div