fetch-meta
Version:
Fetch and extract meta tag information (title, description, og: Open Graph, twitter, al: App Links, social) from a URL.
516 lines (427 loc) • 15.5 kB
JavaScript
;
// Flag this CommonJS module with `__esModule` so that interop helpers which
// check that flag (like `_interopRequireDefault` in this file) use the
// exports object as-is instead of wrapping it in `{ default: ... }`.
Object.defineProperty(exports, "__esModule", {
value: true
});
var _url = require('url');
var _url2 = _interopRequireDefault(_url);
var _request = require('request');
var _request2 = _interopRequireDefault(_request);
var _htmlparser = require('htmlparser2');
var _htmlparser2 = _interopRequireDefault(_htmlparser);
var _fileType = require('file-type');
var _fileType2 = _interopRequireDefault(_fileType);
/**
 * Normalises a required module so its default export is always reachable
 * through `.default`.
 *
 * @param {*} obj - Value returned by `require`.
 * @returns {Object} `obj` itself when it is flagged with `__esModule`,
 *   otherwise a new wrapper `{ default: obj }`.
 */
function _interopRequireDefault(obj) {
  if (obj && obj.__esModule) {
    return obj;
  }
  return { default: obj };
}
// Meta names worth collecting: a fixed list of plain names, or anything in the
// og:, al:, twitter:, fb: or article: namespaces (case-insensitive).
var metaNamesRegex = /^(?:charset|description|keywords|favicon|theme-color|author)$|^(?:og|al|twitter|fb|article):/i;
// Meta names whose content is a URL and therefore gets resolved against the
// document base. The leading negative lookahead excludes structured
// sub-properties such as `og:image:width`, `og:image:height`, `og:image:type`
// and `twitter:image:alt`: they contain "image" but carry numbers, MIME types
// or plain text, which must not be turned into URLs.
var urlMetaNamesRegex = /^(?!.*:(?:width|height|type|alt)$).*(?:url|uri|href|link|image|icon)/i;
// <link rel> values worth collecting: a few exact values, or anything that
// mentions "icon" (e.g. "shortcut icon", "apple-touch-icon").
var linkRelRegex = /^(?:canonical|alternate|publisher|me)$|icon/i;
/**
 * Null-safe trim.
 *
 * @param {?string} str - Value to trim.
 * @returns {*} `str` unchanged when it is falsy (undefined, null, ''),
 *   otherwise the result of `str.trim()`.
 */
var trim = function trim(str) {
  if (!str) {
    return str;
  }
  return str.trim();
};
/**
 * Extracts a `[name, content]` pair from the attributes of a <meta> tag.
 *
 * The name is taken from `name`, then `property`, then the first attribute key
 * (which covers `<meta charset="...">`). Only names accepted by
 * `metaNamesRegex` are kept. The returned name is lower-cased: the regex
 * accepts names case-insensitively, and consumers of the result look keys up
 * in lower case (`description`, `og:title`, ...), so a verbatim
 * `Description` key would otherwise never be found.
 *
 * @param {Object} attrs - Attribute map of the <meta> tag.
 * @param {string} base - Base URL used to resolve URL-valued content.
 * @returns {Array|undefined} `[name, content]`, or undefined when the tag is
 *   not of interest or has no content.
 */
var parseMeta = function parseMeta(attrs, base) {
  var rawName = attrs.name || attrs.property || Object.keys(attrs)[0];
  if (!metaNamesRegex.test(rawName)) return undefined;

  // Look the fallback value up under the original attribute key, not the
  // normalised one.
  var content = trim(attrs.content || attrs[rawName]);
  if (!content) return undefined;

  var name = rawName.toLowerCase();
  if (urlMetaNamesRegex.test(name)) {
    content = _url2.default.resolve(base, content);
  }
  return [name, content];
};
/**
 * Extracts a `['link:<rel>', absoluteHref]` pair from the attributes of a
 * <link> tag whose `rel` is accepted by `linkRelRegex`.
 *
 * The `rel` value is trimmed, lower-cased and has inner whitespace runs
 * collapsed, so `rel="Shortcut  Icon"` yields the key `link:shortcut icon`
 * that lower-case lookups expect.
 *
 * @param {Object} attrs - Attribute map of the <link> tag.
 * @param {string} base - Base URL used to resolve `href`.
 * @returns {Array|undefined} The key/URL pair, or undefined when the tag has
 *   no usable `href` or an uninteresting `rel`.
 */
var parseLink = function parseLink(attrs, base) {
  var href = trim(attrs.href);
  var rel = trim(attrs.rel);
  if (!href || !rel) return undefined;

  var normalizedRel = rel.toLowerCase().replace(/\s+/g, ' ');
  if (!linkRelRegex.test(normalizedRel)) return undefined;

  return ['link:' + normalizedRel, _url2.default.resolve(base, href)];
};
// Splits a Content-Type header on ';', swallowing surrounding whitespace and
// repeated separators.
var contentTypeSplitRegex = /[\s;]*;+[\s;]*/g;
var charsetRegex = /charset\s*=\s*([^\s]+)/i;
// Quote characters that may wrap a parameter value (charset="utf-8").
var charsetQuotesRegex = /^["']+|["']+$/g;

/**
 * Parses a Content-Type header value into its media type and charset.
 *
 * The header is lower-cased before parsing. A part is treated as the media
 * type only when it contains '/' and is not a `key=value` parameter, so a
 * parameter such as `boundary=a/b` cannot overwrite the media type. Quotes
 * around the charset value are stripped.
 *
 * @param {?string} str - Raw header value; may be empty, null or undefined.
 * @returns {{contentType: ?string, charset: ?string}} Parsed values, each
 *   null when absent.
 */
var parseContentType = function parseContentType(str) {
  var result = { contentType: null, charset: null };
  var header = (str || '').trim().toLowerCase();
  if (!header) return result;

  header.split(contentTypeSplitRegex).forEach(function (part) {
    var isParameter = part.indexOf('=') >= 0;
    if (!isParameter && part.indexOf('/') >= 0) {
      result.contentType = part;
      return;
    }
    var matched = part.match(charsetRegex);
    if (matched) {
      var value = matched[1].replace(charsetQuotesRegex, '');
      // Ignore an empty value such as charset="" rather than recording it.
      if (value) result.charset = value;
    }
  });

  return result;
};
// Matches a hostname that ends with one of the supported social network hosts;
// the whole match (match[0]) is the bare host and is a key of hostToProvider.
// NOTE(review): `\b` also holds after a hyphen, so a host such as
// `my-facebook.com` matches as well as `www.facebook.com`.
var socialHostsRegex = /\b(?:facebook\.com|snapchat\.com|instagram\.com|twitter\.com|linkedin\.com|youtube\.com|pinterest\.com|tumblr\.com|plus\.google\.com|medium\.com|github\.com|reddit\.com|news\.ycombinator\.com|yelp\.com|m\.me|kik\.me)$/i;
// TODO yelp with different country top domains.
// Maps each host recognised by socialHostsRegex to its provider identifier
// (the values are the keys of providerToUsername).
var hostToProvider = {
'facebook.com': 'facebook',
'snapchat.com': 'snapchat',
'instagram.com': 'instagram',
'twitter.com': 'twitter',
'linkedin.com': 'linkedin',
'youtube.com': 'youtube',
'pinterest.com': 'pinterest',
'tumblr.com': 'tumblr',
'plus.google.com': 'googleplus',
'medium.com': 'medium',
'github.com': 'github',
'reddit.com': 'reddit',
'news.ycombinator.com': 'hackernews',
'yelp.com': 'yelp',
'm.me': 'messenger',
'kik.me': 'kik'
};
// Per-provider matchers. Each one captures the username or id in group 1 and
// is applied to a URL pathname, a raw query string (no leading '?'), a
// hostname, or an app-link URL, as noted below.

var numeric = /^[0-9]+$/;

// Query string: `id` may be the first parameter or follow an '&'
// (profile.php?ref=x&id=123); `uid=` and similar do not match.
var facebookIdMatcher = /(?:^|&)id=([0-9]+)/i;
var facebookUsernameMatcher = /^\/([A-Za-z0-9\.]+)/i;
// App-link URL.
var facebookAppUrlMatcher = /^fb:\/\/profile\/([0-9]+)/i;
var snapchatUsernameMatcher = /^\/add\/@?(\w+)/i;
var instagramUsernameMatcher = /^\/([\w\.]+)/i;
var twitterUsernameMatcher = /^\/@?(\w+)/i;
var pinterestUsernameMatcher = /^\/([\w\.]+)/i;
var youtubeUsernameMatcher = /^\/user\/([\w\.\-]+)/i;
var youtubeChannelMatcher = /^\/channel\/([\w\.\-]+)/i;
// App-link URL.
var youtubeIosUserIdMatcher = /^vnd\.youtube:\/\/user\/([\w\.\-]+)/i;
// Hostname: the dot in "tumblr.com" is escaped so look-alike hosts such as
// "blog.tumblrxcom" are rejected.
var tumblrSubDomainMatcher = /^([\w\-]+)\.tumblr\.com$/i;
// App-link URL.
var tumblrAppUrlMatcher = /^tumblr:\/\/x-callback-url\/blog\?blogName=([\w\-]+)/;
var googlePlusUsernameMatcher = /^\/\+([\w\.\-]+)/i;
var googlePlusUserIdMatcher = /^\/([\w\.\-]+)/i;
var linkedinUsernameMatcher = /^\/in\/([^\?#\/]+)/i;
var mediumUsernameMatcher = /^\/@([\w\-]+)/i;
// App-link URL.
var mediumAppUsernameMatcher = /^medium:\/\/([\w\-]+)/i;
var githubUsernameMatcher = /^\/([\w\-]+)/i;
var redditUsernameMatcher = /^\/(?:u|user)\/([\w\.\-]+)/i;
// Query string: same `id` parameter handling as facebookIdMatcher.
var hackernewsIdMatcher = /(?:^|&)id=([\w\.\-]+)/i;
var yelpBizMatcher = /^\/biz\/([^\?#\/]+)/i;
// App-link URL.
var yelpIosUrlBizIdMatcher = /^yelp:\/\/\/biz\/([\w\-]+)/i;
var messengerUsernameMatcher = /^\/([A-Za-z0-9\.]+)/i;
var kikUsernameMatcher = /^\/([\w\.\-]+)/i;
/**
 * Returns the first capture group of `matcher` applied to `str`.
 *
 * @param {?string} str - Text to search; falsy values yield null.
 * @param {RegExp} matcher - Pattern whose group 1 is wanted.
 * @returns {?string} The captured text, or null when `str` is falsy or the
 *   pattern does not match.
 */
var matchGroup = function matchGroup(str, matcher) {
  var found = str ? str.match(matcher) : null;
  return found ? found[1] : null;
};
/**
 * Percent-decodes a path segment without throwing on malformed escapes.
 *
 * `decodeURIComponent` raises URIError for input such as "%E0%A4%A"; since the
 * extractors run on arbitrary page URLs, the raw segment is returned in that
 * case instead of letting the error propagate.
 *
 * @param {?string} segment - Encoded segment, or a falsy value.
 * @returns {?string} Decoded segment, the raw segment when it cannot be
 *   decoded, or the falsy input unchanged.
 */
var decodePathSegment = function decodePathSegment(segment) {
  if (!segment) return segment;
  try {
    return decodeURIComponent(segment);
  } catch (err) {
    if (!(err instanceof URIError)) throw err;
    return segment;
  }
};

/**
 * Per-provider extractors keyed by provider identifier.
 *
 * Every extractor is called as `(pathname, urlParts, res)` where `urlParts`
 * is the parsed page URL (only `query` and `host` are read) and `res` is the
 * collected meta map (only app-link entries are read). Each returns
 * `{ userId, username }`, with null for whatever could not be determined.
 */
var providerToUsername = {
  facebook: function facebook(pathname, urlParts, res) {
    var username = null;
    var userId = null;
    if (pathname === '/profile.php') {
      userId = matchGroup(urlParts.query, facebookIdMatcher);
    } else {
      username = matchGroup(pathname, facebookUsernameMatcher);
      // An all-digit first path segment is a numeric id, not a username.
      if (numeric.test(username)) {
        userId = username;
        username = null;
      }
    }
    if (!userId) {
      // Fall back to the id embedded in the app-link URL.
      userId = matchGroup(res['al:ios:url'] || res['al:android:url'], facebookAppUrlMatcher);
    }
    return { userId: userId, username: username };
  },
  snapchat: function snapchat(pathname) {
    return {
      userId: null,
      username: matchGroup(pathname, snapchatUsernameMatcher)
    };
  },
  instagram: function instagram(pathname) {
    return {
      userId: null,
      username: matchGroup(pathname, instagramUsernameMatcher)
    };
  },
  twitter: function twitter(pathname) {
    return {
      userId: null,
      username: matchGroup(pathname, twitterUsernameMatcher)
    };
  },
  linkedin: function linkedin(pathname) {
    return {
      userId: null,
      username: decodePathSegment(matchGroup(pathname, linkedinUsernameMatcher))
    };
  },
  youtube: function youtube(pathname, urlParts, res) {
    return {
      userId: matchGroup(res['al:ios:url'], youtubeIosUserIdMatcher) || matchGroup(pathname, youtubeChannelMatcher),
      username: matchGroup(pathname, youtubeUsernameMatcher)
    };
  },
  pinterest: function pinterest(pathname) {
    return {
      userId: null,
      username: matchGroup(pathname, pinterestUsernameMatcher)
    };
  },
  tumblr: function tumblr(pathname, urlParts, res) {
    var username = matchGroup(res['al:ios:url'] || res['al:android:url'], tumblrAppUrlMatcher) || matchGroup(urlParts.host, tumblrSubDomainMatcher);
    // "www.tumblr.com" is the site itself, not a blog.
    if (username === 'www') username = null;
    return { userId: null, username: username };
  },
  googleplus: function googleplus(pathname) {
    return {
      userId: matchGroup(pathname, googlePlusUserIdMatcher),
      username: matchGroup(pathname, googlePlusUsernameMatcher)
    };
  },
  medium: function medium(pathname, urlParts, res) {
    return {
      userId: null,
      username: matchGroup(res['al:ios:url'] || res['al:android:url'], mediumAppUsernameMatcher) || matchGroup(pathname, mediumUsernameMatcher)
    };
  },
  github: function github(pathname) {
    return {
      userId: null,
      username: matchGroup(pathname, githubUsernameMatcher)
    };
  },
  reddit: function reddit(pathname) {
    return {
      userId: null,
      username: matchGroup(pathname, redditUsernameMatcher)
    };
  },
  hackernews: function hackernews(pathname, urlParts) {
    return {
      userId: null,
      username: matchGroup(urlParts.query, hackernewsIdMatcher)
    };
  },
  yelp: function yelp(pathname, urlParts, res) {
    return {
      userId: matchGroup(res['twitter:app:url:iphone'], yelpIosUrlBizIdMatcher),
      username: decodePathSegment(matchGroup(pathname, yelpBizMatcher))
    };
  },
  messenger: function messenger(pathname) {
    return {
      userId: null,
      username: matchGroup(pathname, messengerUsernameMatcher)
    };
  },
  kik: function kik(pathname) {
    return {
      userId: null,
      username: matchGroup(pathname, kikUsernameMatcher)
    };
  }
};
/**
 * Stores a trimmed string on `res`, skipping empty values.
 *
 * @param {Object} res - Target object, mutated in place.
 * @param {string} name - Property name to assign.
 * @param {?string} value - Value to store; falsy or whitespace-only values
 *   leave `res` untouched.
 */
var set = function set(res, name, value) {
  var cleaned = value ? value.trim() : value;
  if (!cleaned) {
    return;
  }
  res[name] = cleaned;
};
/**
 * Derives the social provider, username and user id from the page's summary
 * URL and app-link metadata.
 *
 * A host counts as a provider only when the recognised social host is the
 * whole hostname or a proper sub-domain suffix of it (preceded by a dot), so
 * look-alike hosts such as `evil-facebook.com` are not classified as the
 * provider. Pages on other hosts that advertise a Tumblr app link are treated
 * as Tumblr blogs on a custom domain.
 *
 * @param {Object} res - Collected meta map; reads `summary:url`,
 *   `al:ios:url` and `al:android:url`, and is passed to the extractors.
 * @returns {{provider: ?string, username: ?string, userId: ?string}} Each
 *   field is null when it could not be determined.
 */
var parseSocialLink = function parseSocialLink(res) {
  var urlParts = _url2.default.parse(res['summary:url'] || '');
  // URLs without an authority (e.g. "tel:123") parse to a null hostname.
  var hostname = urlParts.hostname || '';
  var pathname = urlParts.pathname;
  var provider = null;
  var details = null;

  var match = hostname.match(socialHostsRegex);
  var isProviderHost = Boolean(match) && (match.index === 0 || hostname.charAt(match.index - 1) === '.');

  if (isProviderHost) {
    provider = hostToProvider[match[0].toLowerCase()] || null;
    if (provider) {
      details = providerToUsername[provider](pathname, urlParts, res);
    }
  } else if (tumblrAppUrlMatcher.test(res['al:ios:url'] || res['al:android:url'] || null)) {
    provider = 'tumblr';
    details = providerToUsername.tumblr(pathname, urlParts, res);
  }

  return {
    provider: provider,
    username: details ? details.username : null,
    userId: details ? details.userId : null
  };
};
/**
 * Builds a streaming HTML parser that records page metadata on `res`.
 *
 * While HTML is written to the returned parser it:
 *  - tracks <base href> so later relative URLs resolve against it,
 *  - stores accepted <meta> and <link> tags as properties of `res`,
 *  - collects absolute <img src> URLs in the Set `res.images` (created on
 *    first use), skipping inline `data:` URIs,
 *  - appends text found directly inside <title> within <head> to `res.title`.
 *
 * @param {string} uri - URL of the document; the initial base URL.
 * @param {Object} res - Result object, mutated in place; `res.title` is
 *   expected to be initialised to a string by the caller.
 * @returns {Object} An htmlparser2 `Parser` instance.
 */
var createParser = function createParser(uri, res) {
  var isHead = false;
  var current;
  var base = uri;
  var dataUriRegex = /^data:/i;

  return new _htmlparser2.default.Parser({
    onopentag: function onopentag(name, attrs) {
      current = name;
      switch (name) {
        case 'head':
          isHead = true;
          break;
        case 'base':
          var href = trim(attrs.href);
          if (href) base = _url2.default.resolve(base, href);
          break;
        case 'meta':
          var meta = parseMeta(attrs, base);
          if (meta) res[meta[0]] = meta[1];
          break;
        case 'link':
          var link = parseLink(attrs, base);
          if (link) res[link[0]] = link[1];
          break;
        case 'img':
          var src = trim(attrs.src);
          // Test for the "data:" scheme itself; a bare "data" prefix would
          // also discard ordinary relative paths such as "data/logo.png".
          if (src && !dataUriRegex.test(src)) {
            if (!res.images) res.images = new Set();
            res.images.add(_url2.default.resolve(base, src));
          }
          break;
        default:
          break;
      }
    },
    ontext: function ontext(text) {
      if (isHead && current === 'title') res.title += text;
    },
    onclosetag: function onclosetag(name) {
      // Forget the tag once it closes so text that follows </title> is not
      // appended to the title.
      if (name === current) current = undefined;
      if (name === 'head') isHead = false;
    }
  }, { decodeEntities: true });
};
/**
 * Wraps a promise's `resolve` and `reject` so that only the first call to
 * either of them has any effect.
 *
 * @param {Function} resolve - Promise resolver.
 * @param {Function} reject - Promise rejecter.
 * @returns {{safeResolve: Function, safeReject: Function}} Guarded settlers
 *   sharing a single "already settled" flag.
 */
var wrapResolveAndReject = function wrapResolveAndReject(resolve, reject) {
  var settled = false;

  // Builds a one-argument function that forwards to `settle` only while
  // nothing has settled yet.
  var guard = function guard(settle) {
    return function (value) {
      if (settled) {
        return;
      }
      settled = true;
      settle(value);
    };
  };

  return {
    safeResolve: guard(resolve),
    safeReject: guard(reject)
  };
};
// Any run of whitespace, including a lone tab, CR or newline.
var dirtyWhitespaceRegex = /\s+/g;

/**
 * Normalises a page title: trims it and collapses every run of whitespace
 * (spaces, tabs, CR/LF) into a single space.
 *
 * @param {?string} str - Raw title text.
 * @returns {string} The cleaned title; an empty string when `str` is not a
 *   string, instead of throwing.
 */
var sanitizeTitle = function sanitizeTitle(str) {
  if (typeof str !== 'string') return '';
  return str.trim().replace(dirtyWhitespaceRegex, ' ');
};
/**
 * Copies the values of a Set (or any other iterable) into a new array,
 * preserving iteration order.
 *
 * @param {Iterable} aSet - Collection to copy.
 * @returns {Array} New array holding the collection's values.
 */
var setToArray = function setToArray(aSet) {
  return Array.from(aSet);
};
/**
 * Fetches a URL and extracts its metadata.
 *
 * The response body is streamed through the HTML parser. If the first chunk
 * is recognised by `file-type` as a binary file, the detected type is stored
 * in `res.file` and the rest of the body is ignored. When the stream ends
 * successfully the result is completed with:
 *  - `header:content_type`, `header:charset`, `header:content_length`,
 *  - `summary:url`, `summary:title`, `summary:description`, `summary:site`,
 *    `summary:favicon`, `summary:image`,
 *  - `social:provider`, `social:username`, `social:user_id`.
 * Empty values are omitted.
 *
 * @param {Object} opts - Options handed to `request` unchanged; `opts.uri`
 *   is the page URL.
 * @returns {Promise<Object>} Resolves with the collected metadata. Rejects on
 *   a status code >= 400, a request error, a stream closed before a binary
 *   file was detected, or an error while building the result.
 */
var fetchMeta = function fetchMeta(opts) {
  return new Promise(function (resolve, reject) {
    var uri = opts.uri;
    var parsedUri = _url2.default.parse(uri);
    var res = { host: parsedUri.host, path: parsedUri.path, title: '' };
    var settlers = wrapResolveAndReject(resolve, reject);
    var safeResolve = settlers.safeResolve;
    var safeReject = settlers.safeReject;
    var headers = {};
    var parser = createParser(uri, res);
    var isClosed = false;
    var isFileChecked = false;
    var isSuccess = false;

    // Completes `res` with header, summary and social fields, then resolves.
    var finalize = function finalize() {
      res.title = sanitizeTitle(res.title);
      if (res.images) res.images = setToArray(res.images);
      if (res.charset) res.charset = res.charset.toLowerCase();
      // set headers
      set(res, 'header:content_type', headers.contentType);
      set(res, 'header:charset', headers.charset);
      set(res, 'header:content_length', headers.contentLength);
      // set summary
      set(res, 'summary:url', res['og:url'] || res['link:canonical'] || uri);
      set(res, 'summary:title', res.title || res['og:title'] || res['twitter:title']);
      set(res, 'summary:description', res.description || res['og:description'] || res['twitter:description']);
      set(res, 'summary:site', res.site || res['og:site_name'] || res['al:ios:app_name'] || res['al:android:app_name'] || res['twitter:app:name:iphone'] || res['twitter:app:name:ipad'] || res['twitter:app:name:googleplay'] || res['twitter:site']);
      set(res, 'summary:favicon', res.favicon || res['link:shortcut icon'] || res['link:icon'] || res['link:apple-touch-icon'] || res['link:apple-touch-icon-precomposed']);
      set(res, 'summary:image', res.images && res.images[0] || res['og:image'] || res['twitter:image']);
      // set social link
      var social = parseSocialLink(res);
      set(res, 'social:provider', social.provider);
      set(res, 'social:username', social.username);
      set(res, 'social:user_id', social.userId);
      safeResolve(res);
    };

    var req = (0, _request2.default)(opts);

    req.on('data', function (chunk) {
      // A binary file was already detected: drop any chunk that still
      // arrives instead of re-sniffing it or feeding it to the HTML parser.
      if (isClosed) return;
      if (!isFileChecked) {
        var file = (0, _fileType2.default)(chunk);
        if (file) {
          res.file = file;
          req.end();
          // Set before destroy() so the resulting 'close' is not reported
          // as an error.
          isClosed = true;
          req.destroy();
          return;
        }
        isFileChecked = true;
      }
      parser.write(chunk);
    }).on('end', function () {
      if (!isSuccess) return;
      // This runs inside an event handler, where a thrown error would be an
      // uncaught exception rather than a rejection; convert it explicitly.
      try {
        finalize();
      } catch (err) {
        safeReject(err);
      }
    }).on('response', function (response) {
      var statusCode = response.statusCode;
      var statusMessage = response.statusMessage;
      isSuccess = statusCode >= 200 && statusCode < 300;
      if (isSuccess) {
        var parsedType = parseContentType(response.headers['content-type']);
        if (parsedType.contentType) headers.contentType = parsedType.contentType;
        if (parsedType.charset) headers.charset = parsedType.charset;
        var contentLength = parseInt(response.headers['content-length'], 10);
        // Keep a legitimate 0, drop NaN (header missing or malformed).
        if (contentLength || contentLength === 0) {
          headers.contentLength = String(contentLength);
        }
      } else if (statusCode >= 400) {
        safeReject(new Error(statusCode + ' ' + statusMessage));
      }
    }).on('close', function () {
      if (!isClosed) safeReject(new Error('Read stream was closed'));
    }).on('error', function (err) {
      return safeReject(err);
    });
  });
};
exports.default = fetchMeta;