/*
 * og-parser
 * Version: (not stated)
 * NodeJS module to parse open-graph information.
 * 361 lines (348 loc) • 9.38 kB
 * JavaScript
 */
// Third-party dependencies.
var htmlparser = require("htmlparser2");
var string = require('string');
var request = require('request');

// Parse state shared by the htmlparser2 callbacks defined further down.
// `meta` is the result object being filled in for the current document.
var meta = {};
// `currMeta` is a scratch record (`name` / `in`) for an itemprop <span> in progress.
var currMeta = {};
// `currTag` is the current parser context: null, "head", "head/title", "body" or "body/span".
var currTag = null;
/**
 * Collector for Open Graph (`og:*`) meta tags.
 * Everything it gathers is stored under `inoutMeta.og`.
 */
function _og() {}

/**
 * Stores `value` under `inoutMeta.og[name]`, creating `inoutMeta.og` on demand.
 *
 * String values are HTML-unescaped first. When the key already holds a truthy
 * value, the key is promoted to an array and `value` is appended, so repeated
 * tags (for example several `og:image`) are all kept in document order.
 *
 * @param {Object} inoutMeta - result object, mutated in place
 * @param {string} name - key inside `inoutMeta.og`
 * @param {*} value - string or object to store
 */
_og.prototype._set_og = function(inoutMeta, name, value) {
    if (!inoutMeta.og) {
        inoutMeta.og = {};
    }
    if (typeof value === "string") {
        value = string(value).unescapeHTML().s;
    }
    if (inoutMeta.og[name]) {
        if (!Array.isArray(inoutMeta.og[name])) {
            inoutMeta.og[name] = [inoutMeta.og[name]];
        }
        inoutMeta.og[name].push(value);
    } else {
        inoutMeta.og[name] = value;
    }
};

/**
 * Returns the object that sub-properties such as `og:image:width` attach to:
 * the most recently stored entry for `kind`, or a fresh empty entry that is
 * registered through `_set_og` when none exists yet.
 *
 * Reading through `inoutMeta.og &&` matters: a sub-property can be the very
 * first OG tag of a document, in which case `inoutMeta.og` is still undefined.
 *
 * @param {Object} inoutMeta - result object, mutated in place
 * @param {string} kind - 'image', 'audio' or 'video'
 * @returns {Object} the entry to decorate
 */
_og.prototype._last_media = function(inoutMeta, kind) {
    var existing = inoutMeta.og && inoutMeta.og[kind];
    if (!existing) {
        var created = {};
        this._set_og(inoutMeta, kind, created);
        return created;
    }
    return Array.isArray(existing) ? existing[existing.length - 1] : existing;
};

/**
 * Routes one `og:*` meta tag into `inoutMeta.og`.
 *
 * - Simple properties (title, description, type, url, determiner, site_name)
 *   are stored under their name without the `og:` prefix.
 * - `og:locale` is stored as `{ name: value }`; `og:locale:alternate` values
 *   are appended to `locale.alternate` and ignored when no locale was seen.
 * - `og:image` / `og:audio` / `og:video` (and their `:url` forms) each start a
 *   new `{ url: value }` entry; the remaining sub-properties decorate the most
 *   recent entry of that kind.
 * - Any other name is ignored.
 *
 * @param {Object} inoutMeta - result object, mutated in place
 * @param {string} name - full property name, e.g. 'og:image:width'
 * @param {string} value - the tag's content attribute
 */
_og.prototype._process_header = function(inoutMeta, name, value) {
    switch (name) {
        case 'og:title':
        case 'og:description':
        case 'og:type':
        case 'og:url':
        case 'og:determiner':
        case 'og:site_name':
            this._set_og(inoutMeta, name.substring(3), value);
            break;

        case 'og:locale':
            this._set_og(inoutMeta, 'locale', {
                name: value
            });
            break;

        case 'og:locale:alternate': {
            // Guard the `og` lookup: this tag may precede every other OG tag.
            var locale = inoutMeta.og && inoutMeta.og.locale;
            if (locale) {
                if (!locale.alternate) {
                    locale.alternate = [];
                }
                locale.alternate.push(value);
            }
            break;
        }

        case 'og:image':
        case 'og:image:url':
            this._set_og(inoutMeta, 'image', {
                url: value
            });
            break;

        case 'og:audio':
        case 'og:audio:url':
            this._set_og(inoutMeta, 'audio', {
                url: value
            });
            break;

        case 'og:video':
        case 'og:video:url':
            this._set_og(inoutMeta, 'video', {
                url: value
            });
            break;

        case 'og:image:type':
        case 'og:image:width':
        case 'og:image:height':
        case 'og:image:secure_url':
        case 'og:audio:type':
        case 'og:audio:secure_url':
        case 'og:video:type':
        case 'og:video:width':
        case 'og:video:height':
        case 'og:video:secure_url': {
            // 'og:image:', 'og:audio:' and 'og:video:' are all 9 characters,
            // so characters 3..8 are the kind and the rest is the attribute.
            var kind = name.substring(3, 8);
            this._last_media(inoutMeta, kind)[name.substring(9)] = value;
            break;
        }
    }
};

var og = new _og();
/**
 * Collector for Twitter Card (`twitter:*`) meta tags.
 */
function _twitter() {}

/**
 * Routes one `twitter:*` meta tag into `inoutMeta.twitter`.
 *
 * Names fall into four known shapes; anything else is handed to
 * `util._assign`, which nests the value along the colon-separated path.
 *
 * @param {Object} inoutMeta - result object, mutated in place
 * @param {string} name - full property name, e.g. 'twitter:site:id'
 * @param {string} value - the tag's content attribute
 */
_twitter.prototype._process_header = function(inoutMeta, name, value) {
    // twitter:<key> stored directly as the value.
    var plainNames = [
        'twitter:card', 'twitter:description', 'twitter:title', 'twitter:image'
    ];
    // twitter:<key> stored as { name: value }.
    var namedNames = [
        'twitter:site', 'twitter:creator', 'twitter:player',
        'twitter:data1', 'twitter:label1', 'twitter:data2', 'twitter:label2'
    ];
    // twitter:<key>:<attr> stored as twitter[key][attr].
    var nestedNames = [
        'twitter:site:id', 'twitter:creator:id',
        'twitter:image:src', 'twitter:image:width', 'twitter:image:height',
        'twitter:player:width', 'twitter:player:height', 'twitter:player:stream'
    ];

    var isPlain = plainNames.indexOf(name) !== -1;
    var isNamed = namedNames.indexOf(name) !== -1;
    var isNested = nestedNames.indexOf(name) !== -1;
    var isDeep = name === 'twitter:player:stream:content_type';

    if (!isPlain && !isNamed && !isNested && !isDeep) {
        util._assign(inoutMeta, name, value);
        return;
    }

    if (!inoutMeta.twitter) {
        inoutMeta.twitter = {};
    }
    var segments = name.split(':');
    var key = segments[1];

    if (isPlain) {
        inoutMeta.twitter[key] = value;
        return;
    }

    if (isNamed) {
        if (inoutMeta.twitter[key]) {
            inoutMeta.twitter[key].name = value;
        } else {
            inoutMeta.twitter[key] = {
                name: value
            };
        }
        return;
    }

    // Both remaining shapes need twitter[key] to exist first.
    if (!inoutMeta.twitter[key]) {
        inoutMeta.twitter[key] = {};
    }

    if (isNested) {
        inoutMeta.twitter[key][segments[2]] = value;
        return;
    }

    // twitter:player:stream:content_type — three levels deep.
    if (!inoutMeta.twitter[key][segments[2]]) {
        inoutMeta.twitter[key][segments[2]] = {};
    }
    inoutMeta.twitter[key][segments[2]][segments[3]] = value;
};

var twitter = new _twitter();
/**
 * Small helpers shared by the tag processors and the fetch routine.
 */
function _util() {}

/**
 * Assigns `value` at a nested path inside `obj`, creating intermediate plain
 * objects as needed. For example `_assign(o, 'al:ios:url', v)` sets
 * `o.al.ios.url = v`.
 *
 * An intermediate slot that does not hold a plain object is replaced by a new
 * `{}`. String values are HTML-unescaped before being stored.
 *
 * The path comes from meta-tag names in fetched (untrusted) HTML, so any path
 * containing `__proto__`, `constructor` or `prototype` is ignored entirely.
 * Without this check a tag named `al:__proto__:x` would write onto
 * `Object.prototype`.
 *
 * @param {Object} obj - root object, mutated in place
 * @param {string|string[]} prop - colon-separated path, or its segments
 * @param {*} value - value to store at the end of the path
 */
_util.prototype._assign = function(obj, prop, value) {
    var path = typeof prop === "string" ? prop.split(":") : prop;
    var i;

    // Reject the whole path up front so nothing is half-created.
    for (i = 0; i < path.length; i++) {
        if (path[i] === "__proto__" || path[i] === "constructor" || path[i] === "prototype") {
            return;
        }
    }

    var target = obj;
    for (i = 0; i < path.length - 1; i++) {
        var key = path[i];
        if (Object.prototype.toString.call(target[key]) !== "[object Object]") {
            target[key] = {};
        }
        target = target[key];
    }

    if (typeof value === "string") {
        value = string(value).unescapeHTML().s;
    }
    target[path[path.length - 1]] = value;
};

/**
 * Tells whether a Content-Type header value names one of the recognised image
 * media types.
 *
 * Parameters (anything after `;`), surrounding whitespace and letter case are
 * ignored, so `"image/jpeg; charset=binary"` and `"IMAGE/PNG"` both match.
 *
 * @param {string} [contentType] - raw Content-Type header value
 * @returns {boolean} true for a recognised image type, false otherwise
 *     (including when the header is missing)
 */
_util.prototype._is_image = function(contentType) {
    if (typeof contentType !== "string") {
        return false;
    }
    var mediaType = contentType.split(";")[0].trim().toLowerCase();
    switch (mediaType) {
        case "image/gif":
        case "image/jpeg":
        case "image/pjpeg":
        case "image/png":
        case "image/svg+xml":
        case "image/tiff":
        case "image/vnd.djvu":
        case "image/example":
            return true;
        default:
            return false;
    }
};

var util = new _util();
// True once the <title> element currently being read has produced at least one
// text chunk. The parser may deliver an element's text in several `ontext`
// calls, so later chunks are appended rather than replacing earlier ones.
var titleHasText = false;

/**
 * Shared htmlparser2 parser whose callbacks fill the module-level `meta`
 * object. `currTag` tracks the current context:
 *
 *   null -> "head" -> "head/title" -> "head" -> null
 *   null -> "body" -> "body/span"  -> "body" -> null
 *
 * In <head>:
 *   - <meta name|property="..."> tags starting with `twitter:`, `og:` or `al:`
 *     are routed to the matching processor;
 *   - <meta name="description"> fills `meta.description`;
 *   - <meta name="title"> fills `meta.title` only while no title is known, so
 *     the text of a <title> element still takes precedence when it follows;
 *   - <title> text fills `meta.title`.
 * In <body>:
 *   - <meta itemprop> / <link itemprop> are stored in `meta.meta`;
 *   - a <span itemprop> groups the <link>/<meta> children seen before it closes
 *     into `meta.meta[<span itemprop>]`;
 *   - the attributes of every <img> are appended to `meta.images`.
 */
var parser = new htmlparser.Parser({
    onopentag: function(name, attribs) {
        if (name === 'head') {
            currTag = "head";
        } else if (currTag === "head" && name === "meta") {
            // `name` wins over `property` when a tag carries both.
            var n = attribs.property,
                v = attribs.content;
            if (attribs.name) {
                n = attribs.name;
            }
            // Compare the attribute value `n`, not the element name (which is
            // always "meta" in this branch).
            if (n === 'title') {
                if (meta.title == null) {
                    meta.title = v;
                }
            } else if (n === 'description') {
                meta.description = v;
            }
            if (n) {
                if (n.indexOf('twitter:') === 0) {
                    twitter._process_header(meta, n, v);
                } else if (n.indexOf('og:') === 0) {
                    og._process_header(meta, n, v);
                } else if (n.indexOf('al:') === 0) {
                    util._assign(meta, n, v);
                }
            }
        } else if (currTag === "head" && name === "title") {
            currTag = "head/title";
            titleHasText = false;
        } else if (name === 'body') {
            currTag = "body";
        } else if (currTag === 'body' && name === 'meta') {
            if (!meta.meta) {
                meta.meta = {};
            }
            // Skip tags without itemprop; they would otherwise be stored under
            // the literal key "undefined".
            if (attribs.itemprop) {
                meta.meta[attribs.itemprop] = attribs.content;
            }
        } else if (currTag === 'body' && name === 'span' && attribs.itemprop) {
            currTag = 'body/span';
            currMeta.name = attribs.itemprop;
        } else if (currTag === 'body/span' && name === 'link') {
            if (!currMeta.in) {
                currMeta.in = {};
            }
            currMeta.in.url = attribs.href;
        } else if (currTag === 'body/span' && name === 'meta') {
            if (!currMeta.in) {
                currMeta.in = {};
            }
            if (attribs.itemprop) {
                currMeta.in[attribs.itemprop] = attribs.content;
            }
        } else if (currTag === 'body' && name === 'link') {
            if (!meta.meta) {
                meta.meta = {};
            }
            if (attribs.itemprop) {
                meta.meta[attribs.itemprop] = attribs.href;
            }
        } else if (currTag === 'body' && name === 'img') {
            if (!meta.images) {
                meta.images = [];
            }
            meta.images.push(attribs);
        }
    },
    ontext: function(text) {
        if (currTag === "head/title") {
            // The first chunk replaces any earlier title; later chunks append.
            meta.title = titleHasText ? meta.title + text : text;
            titleHasText = true;
        }
    },
    onclosetag: function(tagname) {
        if (currTag === "head/title" && tagname === "title") {
            currTag = "head";
        } else if (currTag === "head" && tagname === "head") {
            currTag = null;
        } else if (currTag === 'body/span' && tagname === 'span') {
            currTag = "body";
            if (!meta.meta) {
                meta.meta = {};
            }
            // Publish the grouped children and start a fresh scratch record.
            meta.meta[currMeta.name] = currMeta.in;
            currMeta = {};
        } else if (currTag === 'body' && tagname === 'body') {
            currTag = null;
        }
    }
});
/**
 * Fetches `url` and extracts its metadata (Open Graph, Twitter Card, App
 * Links, title, itemprop data and body images).
 *
 * The callback receives:
 *   - `(error, null)` when the request fails or parsing throws;
 *   - `(null, null)` when the response status is not 200;
 *   - `(null, { image: url })` when the response is itself an image;
 *   - `(null, meta)` with the parsed metadata otherwise.
 * It is invoked exactly once per call. Nothing happens when no callback is given.
 *
 * @param {string} url - address of the page to inspect
 * @param {function(?Error, ?Object)} callback - node-style result callback
 */
var _get_og_data = function(url, callback) {
    if (!callback) {
        return;
    }
    var headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.76 Safari/537.36',
        'Accept-Language': 'en-US'
    };
    var options = {
        url: url,
        method: 'GET',
        headers: headers
    };
    request(options, function(error, response, body) {
        if (error) {
            // Stop here: `response` is not available on failure, and the
            // callback must not be invoked a second time.
            return callback(error, null);
        }
        meta = {};
        if (response.statusCode !== 200) {
            return callback(null, null);
        }
        if (util._is_image(response.headers['content-type'])) {
            return callback(null, {
                image: url
            });
        }
        // Start every document from a clean context so a truncated or
        // malformed previous page cannot leak state into this one.
        currTag = null;
        currMeta = {};
        try {
            // The parser instance is shared across calls; it has to be reset
            // before it is fed a new document after a previous end().
            parser.reset();
            parser.write(body);
            parser.end();
        } catch (parseError) {
            return callback(parseError, null);
        }
        callback(null, meta);
    });
};
module.exports = _get_og_data;