@crawlee/utils
Version:
A set of shared utilities that can be used by crawlers
381 lines • 11.1 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.parseOpenGraph = parseOpenGraph;
const cheerio_1 = require("cheerio");
/**
 * To be used with the spread operator. Produces `{ [key]: item }` only when the item is
 * defined and not empty, so that empty results never show up as keys in the output.
 *
 * "Empty" is decided with `Object.values(item).length`: empty strings, empty arrays and
 * objects without own enumerable properties are dropped. Values that expose no enumerable
 * properties at all (for example numbers) are dropped as well.
 *
 * @param key The key for the item to have in the object.
 * @param item The item to assign to the key. `undefined` and `null` are both treated as missing.
 * @returns Either an empty object or an object with the content provided.
 */
const optionalSpread = (key, item) => {
    // `Object.values(null)` throws, so `null` has to be rejected together with `undefined`.
    if (item === undefined || item === null)
        return {};
    return Object.values(item).length > 0 ? { [key]: item } : {};
};
/**
 * Built-in table of the OpenGraph properties that are looked up on a page.
 *
 * Every entry has the same shape:
 * - `name`: the value of the `property` attribute of the `<meta>` tag to select,
 * - `outputName`: the key under which the parsed value is stored in the result,
 * - `children`: nested properties in the same shape (an empty array for leaf properties).
 */
const OPEN_GRAPH_PROPERTIES = [
    { name: 'og:title', outputName: 'title', children: [] },
    { name: 'og:type', outputName: 'type', children: [] },
    {
        name: 'og:image',
        outputName: 'image',
        children: [
            { name: 'og:image:url', outputName: 'url', children: [] },
            { name: 'og:image:secure_url', outputName: 'secureUrl', children: [] },
            { name: 'og:image:type', outputName: 'type', children: [] },
            { name: 'og:image:width', outputName: 'width', children: [] },
            { name: 'og:image:height', outputName: 'height', children: [] },
            { name: 'og:image:alt', outputName: 'alt', children: [] },
        ],
    },
    { name: 'og:url', outputName: 'url', children: [] },
    {
        name: 'og:audio',
        outputName: 'audio',
        children: [
            { name: 'og:audio:url', outputName: 'url', children: [] },
            { name: 'og:audio:secure_url', outputName: 'secureUrl', children: [] },
            { name: 'og:audio:type', outputName: 'type', children: [] },
        ],
    },
    { name: 'og:description', outputName: 'description', children: [] },
    { name: 'og:determiner', outputName: 'determiner', children: [] },
    {
        name: 'og:locale',
        outputName: 'locale',
        children: [
            { name: 'og:locale:alternate', outputName: 'alternate', children: [] },
        ],
    },
    { name: 'og:site_name', outputName: 'siteName', children: [] },
    {
        name: 'og:video',
        outputName: 'video',
        children: [
            { name: 'og:video:url', outputName: 'url', children: [] },
            { name: 'og:video:secure_url', outputName: 'secureUrl', children: [] },
            { name: 'og:video:type', outputName: 'type', children: [] },
            { name: 'og:video:width', outputName: 'width', children: [] },
            { name: 'og:video:height', outputName: 'height', children: [] },
            { name: 'og:video:alt', outputName: 'alt', children: [] },
        ],
    },
    // The properties below aren't prefixed with "og".
    // The table is hardcoded partly because not all OpenGraph properties start with "og",
    // especially the newer ones that extend "og:type".
    {
        name: 'video',
        outputName: 'videoInfo',
        children: [
            {
                name: 'video:actor',
                outputName: 'actor',
                children: [
                    { name: 'video:actor:role', outputName: 'role', children: [] },
                ],
            },
            { name: 'video:director', outputName: 'director', children: [] },
            { name: 'video:writer', outputName: 'writer', children: [] },
            { name: 'video:duration', outputName: 'duration', children: [] },
            { name: 'video:release_date', outputName: 'releaseDate', children: [] },
            { name: 'video:tag', outputName: 'tag', children: [] },
            { name: 'video:series', outputName: 'series', children: [] },
        ],
    },
    {
        name: 'music',
        outputName: 'musicInfo',
        children: [
            { name: 'music:duration', outputName: 'duration', children: [] },
            {
                name: 'music:album',
                outputName: 'album',
                children: [
                    { name: 'music:album:disc', outputName: 'disc', children: [] },
                    { name: 'music:album:track', outputName: 'track', children: [] },
                ],
            },
            { name: 'music:musician', outputName: 'musician', children: [] },
            {
                name: 'music:song',
                outputName: 'song',
                children: [
                    { name: 'music:song:disc', outputName: 'disc', children: [] },
                    { name: 'music:song:track', outputName: 'track', children: [] },
                ],
            },
            { name: 'music:release_date', outputName: 'releaseDate', children: [] },
            { name: 'music:creator', outputName: 'creator', children: [] },
        ],
    },
    {
        name: 'article',
        outputName: 'articleInfo',
        // Article metadata lives in the `article:` namespace of the OpenGraph protocol.
        children: [
            { name: 'article:published_time', outputName: 'publishedTime', children: [] },
            { name: 'article:modified_time', outputName: 'modifiedTime', children: [] },
            { name: 'article:expiration_time', outputName: 'expirationTime', children: [] },
            { name: 'article:author', outputName: 'author', children: [] },
            { name: 'article:section', outputName: 'section', children: [] },
            { name: 'article:tag', outputName: 'tag', children: [] },
        ],
    },
    {
        name: 'book',
        outputName: 'bookInfo',
        children: [
            { name: 'book:author', outputName: 'author', children: [] },
            { name: 'book:isbn', outputName: 'isbn', children: [] },
            { name: 'book:release_date', outputName: 'releaseDate', children: [] },
            { name: 'book:tag', outputName: 'tag', children: [] },
        ],
    },
    {
        name: 'profile',
        outputName: 'profileInfo',
        children: [
            { name: 'profile:first_name', outputName: 'firstName', children: [] },
            { name: 'profile:last_name', outputName: 'lastName', children: [] },
            { name: 'profile:username', outputName: 'username', children: [] },
            { name: 'profile:gender', outputName: 'gender', children: [] },
        ],
    },
];
/**
 * Builds the CSS attribute selector matching the `<meta>` tags of one OpenGraph property.
 *
 * @param name The OpenGraph property name, e.g. `og:title`. It is inserted verbatim.
 * @returns A selector of the form `meta[property="<name>"]`.
 */
const makeOpenGraphSelector = (name) => {
    const selector = `meta[property="${name}"]`;
    return selector;
};
/**
 * Recursively reads one OpenGraph property, together with its children, from the page.
 *
 * @param property Descriptor with `name`, `outputName` and `children` (an array of descriptors).
 * @param $ Query function used to select the `<meta>` tags and read their `content` attribute
 * (a Cheerio API when called from `parseOpenGraph`).
 * @returns For a property without children: the `content` value of the matching tag, an array of
 * values when several tags match, or `undefined` when none does. For a property with children: an
 * object holding the property's own value under `<outputName>Value` plus one key for each child
 * whose parsed value is kept by `optionalSpread`.
 */
const parseOpenGraphProperty = (property, $) => {
    // Some OpenGraph properties can be added multiple times, such as with video:actor. We must handle this case.
    const values = [...$(makeOpenGraphSelector(property.name))].map((elem) => $(elem).attr('content'));
    // If there is more than 1 item, keep it as an array. Otherwise, return just the first value.
    const content = values.length <= 1 ? values[0] : values;
    // If the property has no children, just return its value immediately.
    if (!property.children.length)
        return content;
    // Otherwise, return an object with the values for the property, along with the values for its children.
    return {
        // We do this because, for example, there can be a value under og:image which should still be parsed,
        // but there can also be child properties such as og:image:url or og:image:width.
        // "Value" is appended to the end of the property name to make it more clear, and to prevent things such
        // as `videoInfo.actor.actor` to grab the actor's name.
        ...optionalSpread(`${property.outputName}Value`, content),
        ...property.children.reduce((acc, curr) => {
            // Parse each child exactly once and reuse the result. Parsing it again for the spread
            // would repeat the whole subtree lookup, doubling the work at every nesting level.
            const parsed = parseOpenGraphProperty(curr, $);
            if (parsed === undefined)
                return acc;
            return {
                ...acc,
                ...optionalSpread(curr.outputName, parsed),
            };
        }, {}),
    };
};
/**
 * Parses the OpenGraph properties of a page into a plain object.
 *
 * @param item Either a string, which is loaded with cheerio's `load`, or an already loaded
 * query function that is used as-is.
 * @param additionalProperties Optional extra property descriptors to parse. They are processed
 * before the built-in ones, so a built-in property with the same `outputName` takes precedence.
 * @returns An object keyed by each property's `outputName`; properties without a value are omitted.
 */
function parseOpenGraph(item, additionalProperties) {
    let $ = item;
    if (typeof item === 'string') {
        const { load } = cheerio_1;
        $ = load(item);
    }
    const properties = [...(additionalProperties || []), ...OPEN_GRAPH_PROPERTIES];
    let result = {};
    for (const property of properties) {
        const parsed = parseOpenGraphProperty(property, $);
        result = { ...result, ...optionalSpread(property.outputName, parsed) };
    }
    return result;
}
//# sourceMappingURL=open_graph_parser.js.map