ineed
Version:
Web scraping and HTML-reprocessing. The easy way.
59 lines (44 loc) • 1.77 kB
JavaScript
var url = require('url'),
Common = require('../common');
//Const
var TAGS_WITH_TEXT_ALLOWED_IN_HEAD = [
'script', 'style', 'title', 'noscript', 'noframes'
],
TAGS_ALLOWED_IN_HEAD = [
'html', 'head', 'base', 'basefont', 'bgsound', 'command', 'link', 'meta'
].concat(TAGS_WITH_TEXT_ALLOWED_IN_HEAD);
//NOTE: make fast lookup tables from arrays
TAGS_WITH_TEXT_ALLOWED_IN_HEAD = Common.toLookupTable(TAGS_WITH_TEXT_ALLOWED_IN_HEAD);
TAGS_ALLOWED_IN_HEAD = Common.toLookupTable(TAGS_ALLOWED_IN_HEAD);
module.exports = {
init: function (ctx) {
this.ctx = ctx;
this.baseTagFound = false;
},
exitsHead: function (text) {
//NOTE: any non-whitespace text in <head> causes a content transition into <body>
return text.trim().length > 0 && !TAGS_WITH_TEXT_ALLOWED_IN_HEAD[this.ctx.leadingStartTag];
},
onStartTag: function (startTag) {
if (!this.ctx.inBody) {
this.ctx.inBody = !TAGS_ALLOWED_IN_HEAD[startTag.tagName];
if (!this.baseTagFound && startTag.tagName === 'base') {
var href = Common.getAttrValue(startTag.attrs, 'href');
if (href)
this.ctx.baseUrl = url.resolve(this.ctx.baseUrl, href);
}
}
this.ctx.leadingStartTag = startTag.selfClosing ? null : startTag.tagName;
},
onEndTag: function (tagName) {
this.ctx.leadingStartTag = null;
if (tagName === 'head')
this.ctx.inBody = true;
if (this.ctx.inBody && (tagName === 'body' || tagName === 'html'))
this.ctx.inBody = false;
},
onText: function (text) {
if (!this.ctx.inBody && this.exitsHead(text))
this.ctx.inBody = true;
}
};