open-graph-scraper
Version:
Node.js scraper module for Open Graph and Twitter Card info
187 lines (163 loc) • 6.45 kB
JavaScript
const cheerio = require('cheerio');
const iconv = require('iconv-lite');
const chardet = require('chardet');
const request = require('got');
const media = require('./media');
const fallback = require('./fallback');
const fields = require('./fields');
const charset = require('./charset');
const utils = require('./utils');
/**
 * Extract the open graph info from the meta tags of an html string.
 *
 * Every `<meta>` tag that has a `property` or `name` attribute is compared,
 * case-insensitively, against the entries of `fields`; matching content is
 * stored under the entry's `fieldName` (as an array when the entry is
 * flagged `multiple`).
 *
 * @param {string} body - html string
 * @param {object} options - options the user has set; `onlyGetOpenGraphInfo`
 *   is read here, the object is also handed to the media and fallback helpers
 * @returns {object} the collected values after media formatting, the optional
 *   fallbacks and the removal of undefined values
 */
const extractMetaTags = (body, options) => {
  const $ = cheerio.load(body);
  const collected = {};

  // store one value for one field definition; `multiple` fields accumulate in an array
  const storeValue = (fieldDefinition, value) => {
    const key = fieldDefinition.fieldName;
    if (!fieldDefinition.multiple) {
      collected[key] = value;
      return;
    }
    if (!collected[key]) {
      collected[key] = [value];
      return;
    }
    if (Array.isArray(collected[key])) {
      collected[key].push(value);
    }
  };

  // find all of the open graph info in the meta tags
  $('meta').each((index, meta) => {
    const { attribs } = meta;
    if (!attribs) return;
    const property = attribs.property || attribs.name;
    if (!property) return;
    const content = attribs.content || attribs.value;
    const wantedProperty = property.toLowerCase();
    fields.forEach((item) => {
      if (item.property.toLowerCase() !== wantedProperty) return;
      storeValue(item, content);
    });
  });

  // without an ogImage, borrow ogImageSecureURL first and ogImageURL second
  if (!collected.ogImage) {
    const substitute = collected.ogImageSecureURL || collected.ogImageURL;
    if (substitute) {
      collected.ogImage = substitute;
    }
  }

  // formats the multiple media values
  const formatted = media.mediaSetup(collected, options);

  // the open graph fallbacks only run when onlyGetOpenGraphInfo isn't set
  const completed = options.onlyGetOpenGraphInfo
    ? formatted
    : fallback(formatted, options, $);

  // removes any undefs
  return utils.removeNestedUndefinedValues(completed);
};
/**
 * Request `options.url` and format the response into open graph results.
 *
 * `options.url` is removed from `options` while the request is made (the url
 * is passed to got as its own argument) and is put back on every path,
 * including when the request call throws.
 *
 * @param {object} options - options the user has set; read here: `url`,
 *   `runChar`, `peekSize` and `withCharset`
 * @returns {Promise<{ogObject: object, response: object}>} the extracted
 *   values (with `requestUrl` and `success` added, plus `charset` when
 *   `withCharset` is set) together with the raw response
 * @throws {Error} 'Server has returned a 400/500 error code' for a 4xx/5xx
 *   status code, 'Page not found' when there is no response body; any other
 *   failure is rethrown, wrapped in an Error if it was not one already
 */
const requestAndResultsFormatter = async (options) => {
  const requestUrl = options.url;
  delete options.url; // setting options.url messes with got
  try {
    let response;
    try {
      response = await request.get(requestUrl, options);
    } finally {
      // restore the caller's url whether the request resolved, rejected or threw
      options.url = requestUrl;
    }

    const statusCode = response && response.statusCode;
    if (statusCode >= 400 && statusCode < 600) {
      throw new Error('Server has returned a 400/500 error code');
    }
    if (!response || response.body === undefined) {
      throw new Error('Page not found');
    }

    let formatBody = response.body;
    if (options.runChar) {
      const char = charset.find(response.headers, formatBody, options.peekSize)
        || chardet.detect(formatBody);
      // a decode failure is left to propagate to the caller
      formatBody = char ? iconv.decode(formatBody, char) : formatBody.toString();
    }

    const ogObject = extractMetaTags(formatBody, options);
    if (options.withCharset) {
      ogObject.charset = charset.find(response.headers, formatBody, options.peekSize);
    }
    ogObject.requestUrl = options.url;
    ogObject.success = true;
    return { ogObject, response };
  } catch (error) {
    // callers always receive an Error instance
    if (error instanceof Error) throw error;
    throw new Error(error);
  }
};
/**
 * Set options and return open graph results.
 *
 * With `options.html` the markup is parsed directly and nothing is requested.
 * Otherwise `options.url` is validated, defaults are written onto `options`
 * (the object is modified in place) and the page is requested.
 *
 * @param {object} options - options the user has set
 * @returns {Promise<{ogObject: object, response: object}>} the open graph
 *   values and the response they were taken from
 * @throws {Error} when both `url` and `html` are given, when the url is
 *   invalid, points at a non-html file or matches `blacklist`, and when the
 *   request fails ('Page not found', 'Time out',
 *   'Web server is returning error', or the original error)
 */
const setOptionsAndReturnOpenGraphResults = async (options) => {
  if (options.html) {
    if (options.url) throw new Error('Must specify either `url` or `html`, not both');
    const ogObject = extractMetaTags(options.html, options);
    ogObject.requestUrl = null;
    ogObject.success = true;
    return { ogObject, response: { body: options.html } };
  }

  const validate = utils.validate(options.url, options.timeout);
  if (!validate.url) throw new Error('Invalid URL');
  options.url = validate.url;
  options.timeout = validate.timeout;

  // these flags default to true; `value || true` could never yield false,
  // so only an explicit `false` switches them off
  options.decompress = options.decompress !== false;
  options.followRedirect = options.followRedirect !== false;
  options.ogImageFallback = options.ogImageFallback !== false;

  // an explicit 0 is kept instead of being replaced by the default
  options.maxRedirects = options.maxRedirects === 0 ? 0 : (options.maxRedirects || 10);
  options.retry = options.retry === 0 ? 0 : (options.retry || 2);

  options.peekSize = options.peekSize || 1024;
  options.onlyGetOpenGraphInfo = options.onlyGetOpenGraphInfo || false;
  options.allMedia = options.allMedia || false;
  options.headers = options.headers || {};

  if (options.encoding === null) {
    // a null encoding asks for the raw buffer; the charset handling decodes it later
    options.runChar = true;
    options.encoding = 'utf8';
    options.responseType = 'buffer';
  } else {
    options.encoding = options.encoding || 'utf8';
  }

  if (process.browser) {
    options.decompress = false;
  }

  // trying to limit non html pages
  const nonHtmlExtensions = ['.jpg', '.jpeg', '.png', '.mp3', '.zip', '.pdf'];
  if (nonHtmlExtensions.some((extension) => options.url.includes(extension))) {
    throw new Error('Must scrape an HTML page');
  }

  if (options.blacklist
    && options.blacklist.some((blacklistedHostname) => options.url.includes(blacklistedHostname))) {
    throw new Error('Host name has been black listed');
  }

  try {
    return await requestAndResultsFormatter(options);
  } catch (exception) {
    const code = exception && exception.code;
    const message = exception && typeof exception.message === 'string' ? exception.message : '';
    const notFoundCodes = ['ENOTFOUND', 'EHOSTUNREACH', 'ENETUNREACH', 'ERR_INVALID_URL', 'EINVAL'];

    if (notFoundCodes.includes(code)) throw new Error('Page not found');
    if (code === 'ETIMEDOUT') throw new Error('Time out');
    if (message.startsWith('Response code 4')) throw new Error('Page not found');
    if (message.startsWith('Response code 5')) throw new Error('Web server is returning error');
    if (exception instanceof Error) throw exception;
    throw new Error('Page not found');
  }
};
// the module's export is the setOptionsAndReturnOpenGraphResults function itself
module.exports = setOptionsAndReturnOpenGraphResults;