UNPKG

fetch-meta

Version:

Fetch and extract meta tag information (title, description, og: Open Graph, twitter, al: App Links, social) from a URL

516 lines (427 loc) 15.5 kB
'use strict';

Object.defineProperty(exports, '__esModule', { value: true });

const _url = require('url');
const _request = require('request');
const _htmlparser = require('htmlparser2');
const _fileType = require('file-type');

/**
 * Babel interop helper: wraps a plain CommonJS export as `{ default: obj }`
 * so that callers can always read `.default`.
 */
function _interopRequireDefault(obj) {
  return obj && obj.__esModule ? obj : { default: obj };
}

const _url2 = _interopRequireDefault(_url);
const _request2 = _interopRequireDefault(_request);
const _htmlparser2 = _interopRequireDefault(_htmlparser);
const _fileType2 = _interopRequireDefault(_fileType);

// Plain function references, so the calls below are made without a `this`.
const request = _request2.default;
const fileType = _fileType2.default;

// ---------------------------------------------------------------------------
// <meta> / <link> parsing
// ---------------------------------------------------------------------------

// Meta names that are collected: a fixed list of plain names, or anything
// prefixed with og:, al:, twitter:, fb: or article:.
const metaNamesRegex = /^(?:charset|description|keywords|favicon|theme-color|author)$|^(?:og|al|twitter|fb|article):/i;
// Meta names whose content is treated as a URL and resolved against the base.
const urlMetaNamesRegex = /url|uri|href|link|image|icon/i;
// Structured sub-properties such as og:image:width or twitter:image:alt contain
// "image" in their name but carry numbers, MIME types or text - never URLs.
const nonUrlMetaSuffixRegex = /:(?:width|height|type|alt)$/i;
// <link rel="..."> values that are collected.
const linkRelRegex = /^(?:canonical|alternate|publisher|me)$|icon/i;

/** Trims a string; falsy input is returned unchanged. */
const trim = (str) => str && str.trim();

/**
 * Extracts a `[name, content]` pair from the attributes of a <meta> tag.
 *
 * @param {Object} attrs - attributes of the tag
 * @param {string} base - base URL used to resolve URL-valued content
 * @returns {Array|undefined} `[name, content]`, or undefined when the tag is
 *   not one of the collected names or has no content
 */
const parseMeta = (attrs, base) => {
  // Falls back to the first attribute name so that `<meta charset="...">` works.
  const name = attrs.name || attrs.property || Object.keys(attrs)[0];
  if (!metaNamesRegex.test(name)) return undefined;

  let content = trim(attrs.content || attrs[name]);
  if (!content) return undefined;

  if (urlMetaNamesRegex.test(name) && !nonUrlMetaSuffixRegex.test(name)) {
    content = _url2.default.resolve(base, content);
  }
  return [name, content];
};

/**
 * Extracts a `['link:<rel>', absoluteHref]` pair from the attributes of a
 * <link> tag.
 *
 * @param {Object} attrs - attributes of the tag
 * @param {string} base - base URL used to resolve the href
 * @returns {Array|undefined} the pair, or undefined when the href is empty or
 *   the rel is not one of the collected values
 */
const parseLink = (attrs, base) => {
  const href = trim(attrs.href);
  if (href && linkRelRegex.test(attrs.rel)) {
    return [`link:${attrs.rel}`, _url2.default.resolve(base, href)];
  }
  return undefined;
};

// ---------------------------------------------------------------------------
// Content-Type header parsing
// ---------------------------------------------------------------------------

const contentTypeSplitRegex = /[\s;]*;+[\s;]*/g;
const charsetRegex = /charset\s*=\s*([^\s]+)/i;
const surroundingQuotesRegex = /^["']+|["']+$/g;

/**
 * Splits a Content-Type header value into its media type and charset.
 * The value is lower-cased first; quotes around the charset are removed
 * (`charset="utf-8"` yields `utf-8`).
 *
 * @param {string} [str] - raw header value
 * @returns {{contentType: ?string, charset: ?string}}
 */
const parseContentType = (str) => {
  let contentType = null;
  let charset = null;

  const value = (str || '').trim();
  if (value) {
    for (const part of value.toLowerCase().split(contentTypeSplitRegex)) {
      if (part.indexOf('/') >= 0) {
        contentType = part;
      } else {
        const matched = part.match(charsetRegex);
        if (matched) {
          charset = matched[1].replace(surroundingQuotesRegex, '') || null;
        }
      }
    }
  }

  return { contentType, charset };
};

// ---------------------------------------------------------------------------
// Social link detection
// ---------------------------------------------------------------------------

// The known host must be the whole hostname or be preceded by a dot, so that
// look-alike hosts such as "my-facebook.com" are not classified as a provider.
// Capture group 1 is the known host itself.
// TODO yelp with different country top domains.
const socialHostsRegex = /(?:^|\.)(facebook\.com|snapchat\.com|instagram\.com|twitter\.com|linkedin\.com|youtube\.com|pinterest\.com|tumblr\.com|plus\.google\.com|medium\.com|github\.com|reddit\.com|news\.ycombinator\.com|yelp\.com|m\.me|kik\.me)$/i;

const hostToProvider = {
  'facebook.com': 'facebook',
  'snapchat.com': 'snapchat',
  'instagram.com': 'instagram',
  'twitter.com': 'twitter',
  'linkedin.com': 'linkedin',
  'youtube.com': 'youtube',
  'pinterest.com': 'pinterest',
  'tumblr.com': 'tumblr',
  'plus.google.com': 'googleplus',
  'medium.com': 'medium',
  'github.com': 'github',
  'reddit.com': 'reddit',
  'news.ycombinator.com': 'hackernews',
  'yelp.com': 'yelp',
  'm.me': 'messenger',
  'kik.me': 'kik',
};

const numeric = /^[0-9]+$/;
const facebookIdMatcher = /^\bid=([0-9]+)/i;
const facebookUsernameMatcher = /^\/([A-Za-z0-9\.]+)/i;
const facebookAppUrlMatcher = /^fb:\/\/profile\/([0-9]+)/i;
const snapchatUsernameMatcher = /^\/add\/@?(\w+)/i;
const instagramUsernameMatcher = /^\/([\w\.]+)/i;
const twitterUsernameMatcher = /^\/@?(\w+)/i;
const pinterestUsernameMatcher = /^\/([\w\.]+)/i;
const youtubeUsernameMatcher = /^\/user\/([\w\.\-]+)/i;
const youtubeChannelMatcher = /^\/channel\/([\w\.\-]+)/i;
const youtubeIosUserIdMatcher = /^vnd\.youtube:\/\/user\/([\w\.\-]+)/i;
// Both dots are escaped so that only a literal "<blog>.tumblr.com" matches.
const tumblrSubDomainMatcher = /^([\w\-]+)\.tumblr\.com$/i;
const tumblrAppUrlMatcher = /^tumblr:\/\/x-callback-url\/blog\?blogName=([\w\-]+)/;
const googlePlusUsernameMatcher = /^\/\+([\w\.\-]+)/i;
const googlePlusUserIdMatcher = /^\/([\w\.\-]+)/i;
const linkedinUsernameMatcher = /^\/in\/([^\?#\/]+)/i;
const mediumUsernameMatcher = /^\/@([\w\-]+)/i;
const mediumAppUsernameMatcher = /^medium:\/\/([\w\-]+)/i;
const githubUsernameMatcher = /^\/([\w\-]+)/i;
const redditUsernameMatcher = /^\/(?:u|user)\/([\w\.\-]+)/i;
const hackernewsIdMatcher = /^\bid=([\w\.\-]+)/i;
const yelpBizMatcher = /^\/biz\/([^\?#\/]+)/i;
const yelpIosUrlBizIdMatcher = /^yelp:\/\/\/biz\/([\w\-]+)/i;
const messengerUsernameMatcher = /^\/([A-Za-z0-9\.]+)/i;
const kikUsernameMatcher = /^\/([\w\.\-]+)/i;

/**
 * Returns the first capture group of `matcher` applied to `str`, or null when
 * `str` is falsy or does not match.
 */
const matchGroup = (str, matcher) => {
  if (str) {
    const matched = str.match(matcher);
    if (matched) return matched[1];
  }
  return null;
};

/**
 * Percent-decodes a path segment. Malformed escape sequences make
 * `decodeURIComponent` throw; in that case the raw value is deliberately kept
 * so that a bad URL on a remote page cannot abort the whole extraction.
 */
const safeDecode = (value) => {
  try {
    return decodeURIComponent(value);
  } catch (err) {
    return value;
  }
};

/**
 * Per-provider extractors. Each receives the URL pathname, the parsed URL
 * parts and the collected result object, and returns `{ userId, username }`
 * (either may be null).
 */
const providerToUsername = {
  facebook(pathname, { query }, res) {
    let username = null;
    let userId = null;
    if (pathname === '/profile.php') {
      userId = matchGroup(query, facebookIdMatcher);
    } else {
      username = matchGroup(pathname, facebookUsernameMatcher);
      // An all-digit path segment is an id rather than a username.
      if (numeric.test(username)) {
        userId = username;
        username = null;
      }
    }
    if (!userId) {
      userId = matchGroup(res['al:ios:url'] || res['al:android:url'], facebookAppUrlMatcher);
    }
    return { userId, username };
  },

  snapchat(pathname) {
    return { userId: null, username: matchGroup(pathname, snapchatUsernameMatcher) };
  },

  instagram(pathname) {
    return { userId: null, username: matchGroup(pathname, instagramUsernameMatcher) };
  },

  twitter(pathname) {
    return { userId: null, username: matchGroup(pathname, twitterUsernameMatcher) };
  },

  linkedin(pathname) {
    let username = matchGroup(pathname, linkedinUsernameMatcher);
    if (username) username = safeDecode(username);
    return { userId: null, username };
  },

  youtube(pathname, urlParts, res) {
    return {
      userId: matchGroup(res['al:ios:url'], youtubeIosUserIdMatcher) ||
        matchGroup(pathname, youtubeChannelMatcher),
      username: matchGroup(pathname, youtubeUsernameMatcher),
    };
  },

  pinterest(pathname) {
    return { userId: null, username: matchGroup(pathname, pinterestUsernameMatcher) };
  },

  tumblr(pathname, { host }, res) {
    let username = matchGroup(res['al:ios:url'] || res['al:android:url'], tumblrAppUrlMatcher) ||
      matchGroup(host, tumblrSubDomainMatcher);
    if (username === 'www') username = null;
    return { userId: null, username };
  },

  googleplus(pathname) {
    return {
      userId: matchGroup(pathname, googlePlusUserIdMatcher),
      username: matchGroup(pathname, googlePlusUsernameMatcher),
    };
  },

  medium(pathname, urlParts, res) {
    return {
      userId: null,
      username: matchGroup(res['al:ios:url'] || res['al:android:url'], mediumAppUsernameMatcher) ||
        matchGroup(pathname, mediumUsernameMatcher),
    };
  },

  github(pathname) {
    return { userId: null, username: matchGroup(pathname, githubUsernameMatcher) };
  },

  reddit(pathname) {
    return { userId: null, username: matchGroup(pathname, redditUsernameMatcher) };
  },

  hackernews(pathname, { query }) {
    return { userId: null, username: matchGroup(query, hackernewsIdMatcher) };
  },

  yelp(pathname, urlParts, res) {
    let username = matchGroup(pathname, yelpBizMatcher);
    if (username) username = safeDecode(username);
    return {
      userId: matchGroup(res['twitter:app:url:iphone'], yelpIosUrlBizIdMatcher),
      username,
    };
  },

  messenger(pathname) {
    return { userId: null, username: matchGroup(pathname, messengerUsernameMatcher) };
  },

  kik(pathname) {
    return { userId: null, username: matchGroup(pathname, kikUsernameMatcher) };
  },
};

/** Stores the trimmed `value` under `res[name]`; empty values are skipped. */
const set = (res, name, value) => {
  const trimmed = value && value.trim();
  if (trimmed) res[name] = trimmed;
};

/**
 * Detects the social provider, username and user id from `res['summary:url']`
 * and the app-link entries already collected in `res`.
 *
 * @param {Object} res - collected result object
 * @returns {{provider: ?string, username: ?string, userId: ?string}}
 */
const parseSocialLink = (res) => {
  let provider = null;
  let username = null;
  let userId = null;

  const summaryUrl = res['summary:url'];
  // url.parse throws on non-string input, so bail out when no URL was set.
  if (!summaryUrl) return { provider, username, userId };

  const urlParts = _url2.default.parse(summaryUrl);
  const { hostname, pathname } = urlParts;

  // hostname is null for host-less URLs such as "tel:123" or "about:blank".
  const match = hostname ? hostname.match(socialHostsRegex) : null;
  if (match) {
    provider = hostToProvider[match[1].toLowerCase()] || null;
    if (provider) {
      const found = providerToUsername[provider](pathname, urlParts, res);
      username = found.username;
      userId = found.userId;
    }
  } else if (tumblrAppUrlMatcher.test(res['al:ios:url'] || res['al:android:url'] || null)) {
    // Tumblr blogs on custom domains are only recognisable by their app link.
    provider = 'tumblr';
    const found = providerToUsername.tumblr(pathname, urlParts, res);
    username = found.username;
    userId = found.userId;
  }

  return { provider, username, userId };
};

// ---------------------------------------------------------------------------
// HTML parsing
// ---------------------------------------------------------------------------

/**
 * Creates an htmlparser2 parser that writes what it finds into `res`:
 * collected <meta> and <link> entries, the text of <title> inside <head>
 * (appended to `res.title`), and a Set of absolute <img> URLs in `res.images`.
 *
 * @param {string} uri - page URL, used as the initial base for relative URLs
 * @param {Object} res - result object that is mutated while parsing
 * @returns {Object} the htmlparser2 Parser instance
 */
const createParser = (uri, res) => {
  let isHead = false;
  let current;
  let base = uri;

  return new _htmlparser2.default.Parser({
    onopentag(name, attrs) {
      current = name;
      switch (name) {
        case 'head':
          isHead = true;
          break;
        case 'base': {
          const href = trim(attrs.href);
          if (href) base = _url2.default.resolve(base, href);
          break;
        }
        case 'meta': {
          const meta = parseMeta(attrs, base);
          if (meta) res[meta[0]] = meta[1];
          break;
        }
        case 'link': {
          const link = parseLink(attrs, base);
          if (link) res[link[0]] = link[1];
          break;
        }
        case 'img': {
          const src = trim(attrs.src);
          // Inline data: URIs are skipped.
          if (src && src.slice(0, 4) !== 'data') {
            if (!res.images) res.images = new Set();
            res.images.add(_url2.default.resolve(base, src));
          }
          break;
        }
        default:
          break;
      }
    },
    ontext(text) {
      if (isHead && current === 'title') res.title += text;
    },
    onclosetag(name) {
      // Forget the open tag so that stray text following </title> inside the
      // head is not appended to the title.
      current = undefined;
      if (name === 'head') isHead = false;
    },
  }, { decodeEntities: true });
};

// ---------------------------------------------------------------------------
// Small helpers
// ---------------------------------------------------------------------------

/**
 * Wraps a promise's resolve/reject so that only the first call of either has
 * any effect.
 */
const wrapResolveAndReject = (resolve, reject) => {
  let done = false;
  return {
    safeResolve(result) {
      if (!done) {
        done = true;
        resolve(result);
      }
    },
    safeReject(error) {
      if (!done) {
        done = true;
        reject(error);
      }
    },
  };
};

const dirtyWhitespaceRegex = /\s{2,}|\n/gmi;

/** Trims the title and collapses whitespace runs and newlines to one space. */
const sanitizeTitle = (str) => trim(str).replace(dirtyWhitespaceRegex, ' ');

/** Copies the values of an iterable (a Set here) into a new array. */
const setToArray = (aSet) => Array.from(aSet);

// ---------------------------------------------------------------------------
// Entry point
// ---------------------------------------------------------------------------

/**
 * Requests `opts.uri` and extracts meta information from the response.
 *
 * @param {Object} opts - options passed through unchanged to `request`;
 *   `opts.uri` is also used as the page URL for resolving relative links
 * @returns {Promise<Object>} resolves with the collected result object:
 *   `host`, `path`, `title`, the collected meta/link entries, `images`,
 *   `header:*`, `summary:*` and `social:*` keys, or `file` when the first
 *   chunk is recognised by `file-type`. Rejects on request errors, on any
 *   non-2xx response status, and when the stream closes unexpectedly.
 */
const fetchMeta = (opts) => new Promise((resolve, reject) => {
  const uri = opts.uri;
  const { host, path } = _url2.default.parse(uri);
  const res = { host, path, title: '' };
  const { safeResolve, safeReject } = wrapResolveAndReject(resolve, reject);
  const headers = {};
  const parser = createParser(uri, res);

  let isClosed = false; // true once we stopped the download ourselves
  let isFileChecked = false;
  let isSuccess = false;
  let isFinished = false;

  // Builds the derived header/summary/social fields and resolves. It runs at
  // most once; anything thrown here becomes a rejection instead of escaping
  // from the event handler as an uncaught exception.
  const finish = () => {
    if (isFinished) return;
    isFinished = true;
    try {
      res.title = sanitizeTitle(res.title);
      if (res.images) res.images = setToArray(res.images);
      if (res.charset) res.charset = res.charset.toLowerCase();

      // set headers
      set(res, 'header:content_type', headers.contentType);
      set(res, 'header:charset', headers.charset);
      set(res, 'header:content_length', headers.contentLength);

      // set summary
      set(res, 'summary:url', res['og:url'] || res['link:canonical'] || uri);
      set(res, 'summary:title', res.title || res['og:title'] || res['twitter:title']);
      set(res, 'summary:description',
        res.description || res['og:description'] || res['twitter:description']);
      set(res, 'summary:site',
        res.site || res['og:site_name'] || res['al:ios:app_name'] ||
        res['al:android:app_name'] || res['twitter:app:name:iphone'] ||
        res['twitter:app:name:ipad'] || res['twitter:app:name:googleplay'] ||
        res['twitter:site']);
      set(res, 'summary:favicon',
        res.favicon || res['link:shortcut icon'] || res['link:icon'] ||
        res['link:apple-touch-icon'] || res['link:apple-touch-icon-precomposed']);
      set(res, 'summary:image',
        res.images && res.images[0] || res['og:image'] || res['twitter:image']);

      // set social link
      const { provider, username, userId } = parseSocialLink(res);
      set(res, 'social:provider', provider);
      set(res, 'social:username', username);
      set(res, 'social:user_id', userId);

      safeResolve(res);
    } catch (err) {
      safeReject(err);
    }
  };

  const req = request(opts)
    .on('data', (chunk) => {
      // A binary file was already detected: ignore whatever still arrives so
      // that it is neither re-sniffed nor fed to the HTML parser.
      if (isClosed) return;

      if (!isFileChecked) {
        const file = fileType(chunk);
        if (file) {
          res.file = file;
          // Flag first, so a 'close' triggered by the calls below is not
          // reported as an unexpected close.
          isClosed = true;
          req.end();
          req.destroy();
          return;
        }
        isFileChecked = true;
      }
      parser.write(chunk);
    })
    .on('end', () => {
      if (!isSuccess) return;
      finish();
    })
    .on('response', (response) => {
      const { statusCode, statusMessage } = response;
      isSuccess = statusCode >= 200 && statusCode < 300;
      if (!isSuccess) {
        // Every non-2xx status rejects; otherwise statuses below 400 (e.g. a
        // 304 or an unfollowed redirect) would leave the promise pending.
        safeReject(new Error(`${statusCode} ${statusMessage}`));
        return;
      }

      const { contentType, charset } = parseContentType(response.headers['content-type']);
      if (contentType) headers.contentType = contentType;
      if (charset) headers.charset = charset;

      const contentLength = parseInt(response.headers['content-length'], 10);
      // Accepts 0 but not NaN.
      if (contentLength || contentLength === 0) {
        headers.contentLength = String(contentLength);
      }
    })
    .on('close', () => {
      if (!isClosed) {
        safeReject(new Error('Read stream was closed'));
        return;
      }
      // We stopped the download after detecting a file. 'end' may never be
      // emitted in that case, so settle here (a no-op if 'end' already ran).
      if (isSuccess) finish();
    })
    .on('error', (err) => safeReject(err));
});

exports.default = fetchMeta;