sajari-website
Version:
Website extensions for the Sajari API. Automatically index site content, add user profiles, render search and recommendations, etc.
47 lines (43 loc) • 1.71 kB
JavaScript
// Substrings (case-insensitive) of a class name or id that mark an element as
// page chrome rather than content; matching elements are skipped entirely.
var NEGATIVE_NAME_PATTERN = /hidden|^hid$| hid$| hid |^hid |adblade|alert|banner|combx|comment|com-|contact|cookies-dialog|foot|footer|footnote|masthead|media|meta|navbar|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i;

// Lower-cased tag names whose whole subtree is never extracted.
var IGNORED_TAGS = ["head", "script", "style", "footer", "header", "nav", "select", "noscript"];

/**
 * Recursively collects the text found under a DOM node.
 *
 * Element nodes (nodeType 1) are skipped, together with their whole subtree,
 * when any of the following holds:
 *   - the tag name is listed in IGNORED_TAGS;
 *   - the element carries a `data-sj-ignore` attribute;
 *   - the element is hidden, either through an inline `display: none` style
 *     or through the `hidden` attribute;
 *   - the element is not <body> and its class name or id matches
 *     NEGATIVE_NAME_PATTERN.
 * Text nodes (nodeType 3) contribute their `textContent`. Every other node
 * type contributes nothing.
 *
 * @param {Node} node - Node to start extracting from.
 * @returns {string} Collected text with each whitespace run collapsed to a
 *   single space; an empty string when the node is skipped.
 */
var Text = function(node) {
  var text = "";

  // Element nodes
  if (node.nodeType === 1) {
    var tagName = node.tagName.toLowerCase();

    if (IGNORED_TAGS.indexOf(tagName) > -1) {
      return text; // Known bad tag names
    }
    if (node.hasAttribute("data-sj-ignore")) {
      return text; // Specific ignore elems
    }

    // Either mechanism alone makes the element invisible, so the two checks
    // are OR-ed. `style` is guarded because not every element exposes one.
    var isDisplayNone = Boolean(node.style) && node.style.display === "none";
    if (isDisplayNone || node.hasAttribute("hidden")) {
      return text; // invisible text
    }

    // <body> is exempt from the name filter so that a matching class or id on
    // it cannot discard the entire page.
    if (tagName !== "body") {
      // className is only tested when it is a plain string; elements can
      // expose a non-string className, which is skipped here.
      if (typeof node.className === "string" && node.className !== "" &&
          NEGATIVE_NAME_PATTERN.test(node.className)) {
        return text; // bad classes
      }
      if (typeof node.id === "string" && node.id !== "" &&
          NEGATIVE_NAME_PATTERN.test(node.id)) {
        return text; // bad ids
      }
    }

    // Recursively walk the DOM
    if (node.childNodes !== undefined) {
      for (var j = 0; j < node.childNodes.length; j++) {
        text += Text(node.childNodes[j]);
      }
    }
  }

  // Text nodes
  if (node.nodeType === 3) {
    text += node.textContent;
  }

  return text.replace(/\s+/g, " ");
};
// Expose the Text function as this CommonJS module's export.
module.exports = Text;