UNPKG

@crawlee/utils

Version:

A set of shared utilities that can be used by crawlers

381 lines • 11.1 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.parseOpenGraph = parseOpenGraph;
const cheerio_1 = require("cheerio");
/**
 * To be used with the spread operator. Ensures that the item is defined, and is not empty.
 *
 * "Empty" is decided by `Object.values(item).length`, so empty strings, empty arrays and empty
 * objects are dropped. `null` and `undefined` are dropped as well instead of throwing.
 *
 * @param key The key for the item to have in the object
 * @param item The item to assign to the key.
 * @returns Either an empty object or an object with the content provided.
 */
const optionalSpread = (key, item) => (item != null && Object.values(item).length > 0 ? { [key]: item } : {});
/**
 * Hardcoded tree of the OpenGraph properties that are parsed by default.
 *
 * Each node has:
 *  - `name`: the value of the `property` attribute of the `<meta>` tag to look up,
 *  - `outputName`: the key used for the parsed value in the result object,
 *  - `children`: nested properties, parsed into an object stored under `outputName`.
 */
const OPEN_GRAPH_PROPERTIES = [
    { name: 'og:title', outputName: 'title', children: [] },
    { name: 'og:type', outputName: 'type', children: [] },
    {
        name: 'og:image',
        outputName: 'image',
        children: [
            { name: 'og:image:url', outputName: 'url', children: [] },
            { name: 'og:image:secure_url', outputName: 'secureUrl', children: [] },
            { name: 'og:image:type', outputName: 'type', children: [] },
            { name: 'og:image:width', outputName: 'width', children: [] },
            { name: 'og:image:height', outputName: 'height', children: [] },
            { name: 'og:image:alt', outputName: 'alt', children: [] },
        ],
    },
    { name: 'og:url', outputName: 'url', children: [] },
    {
        name: 'og:audio',
        outputName: 'audio',
        children: [
            { name: 'og:audio:url', outputName: 'url', children: [] },
            { name: 'og:audio:secure_url', outputName: 'secureUrl', children: [] },
            { name: 'og:audio:type', outputName: 'type', children: [] },
        ],
    },
    { name: 'og:description', outputName: 'description', children: [] },
    { name: 'og:determiner', outputName: 'determiner', children: [] },
    {
        name: 'og:locale',
        outputName: 'locale',
        children: [
            { name: 'og:locale:alternate', outputName: 'alternate', children: [] },
        ],
    },
    { name: 'og:site_name', outputName: 'siteName', children: [] },
    {
        name: 'og:video',
        outputName: 'video',
        children: [
            { name: 'og:video:url', outputName: 'url', children: [] },
            { name: 'og:video:secure_url', outputName: 'secureUrl', children: [] },
            { name: 'og:video:type', outputName: 'type', children: [] },
            { name: 'og:video:width', outputName: 'width', children: [] },
            { name: 'og:video:height', outputName: 'height', children: [] },
            { name: 'og:video:alt', outputName: 'alt', children: [] },
        ],
    },
    // The properties below aren't prefixed with "og".
    // Part of the reason the properties have been hardcoded is because not all OpenGraph properties start with "og".
    // Especially the newer ones that extend "og:type".
    {
        name: 'video',
        outputName: 'videoInfo',
        children: [
            {
                name: 'video:actor',
                outputName: 'actor',
                children: [
                    { name: 'video:actor:role', outputName: 'role', children: [] },
                ],
            },
            { name: 'video:director', outputName: 'director', children: [] },
            { name: 'video:writer', outputName: 'writer', children: [] },
            { name: 'video:duration', outputName: 'duration', children: [] },
            { name: 'video:release_date', outputName: 'releaseDate', children: [] },
            { name: 'video:tag', outputName: 'tag', children: [] },
            { name: 'video:series', outputName: 'series', children: [] },
        ],
    },
    {
        name: 'music',
        outputName: 'musicInfo',
        children: [
            { name: 'music:duration', outputName: 'duration', children: [] },
            {
                name: 'music:album',
                outputName: 'album',
                children: [
                    { name: 'music:album:disc', outputName: 'disc', children: [] },
                    { name: 'music:album:track', outputName: 'track', children: [] },
                ],
            },
            { name: 'music:musician', outputName: 'musician', children: [] },
            {
                name: 'music:song',
                outputName: 'song',
                children: [
                    { name: 'music:song:disc', outputName: 'disc', children: [] },
                    { name: 'music:song:track', outputName: 'track', children: [] },
                ],
            },
            { name: 'music:release_date', outputName: 'releaseDate', children: [] },
            { name: 'music:creator', outputName: 'creator', children: [] },
        ],
    },
    {
        name: 'article',
        outputName: 'articleInfo',
        // Article metadata lives in the "article:" namespace. These entries previously used the
        // "music:" prefix, so article pages never produced any `articleInfo` values.
        children: [
            { name: 'article:published_time', outputName: 'publishedTime', children: [] },
            { name: 'article:modified_time', outputName: 'modifiedTime', children: [] },
            { name: 'article:expiration_time', outputName: 'expirationTime', children: [] },
            { name: 'article:author', outputName: 'author', children: [] },
            { name: 'article:section', outputName: 'section', children: [] },
            { name: 'article:tag', outputName: 'tag', children: [] },
        ],
    },
    {
        name: 'book',
        outputName: 'bookInfo',
        children: [
            { name: 'book:author', outputName: 'author', children: [] },
            { name: 'book:isbn', outputName: 'isbn', children: [] },
            { name: 'book:release_date', outputName: 'releaseDate', children: [] },
            { name: 'book:tag', outputName: 'tag', children: [] },
        ],
    },
    {
        name: 'profile',
        outputName: 'profileInfo',
        children: [
            { name: 'profile:first_name', outputName: 'firstName', children: [] },
            { name: 'profile:last_name', outputName: 'lastName', children: [] },
            { name: 'profile:username', outputName: 'username', children: [] },
            { name: 'profile:gender', outputName: 'gender', children: [] },
        ],
    },
];
/**
 * Builds the CSS selector matching the `<meta>` tags of a single OpenGraph property.
 *
 * @param name The value of the `property` attribute to match, e.g. `og:title`.
 * @returns A selector of the form `meta[property="<name>"]`.
 */
const makeOpenGraphSelector = (name) => `meta[property="${name}"]`;
/**
 * Parses one property definition (and, recursively, its children) out of the document.
 *
 * @param property A node with `name`, `outputName` and `children`, shaped like the entries of `OPEN_GRAPH_PROPERTIES`.
 * @param $ The loaded document; called with a selector to find elements and with an element to read its `content` attribute.
 * @returns For a property without children: the `content` value, an array of values when the tag
 * occurs more than once, or `undefined` when it is absent. For a property with children: an object
 * holding the non-empty child values, plus `<outputName>Value` for the property's own content.
 */
const parseOpenGraphProperty = (property, $) => {
    // Some OpenGraph properties can be added multiple times, such as with video:actor. We must handle this case.
    const values = [...$(makeOpenGraphSelector(property.name))].map((elem) => $(elem).attr('content'));
    // If there is more than 1 item, keep it as an array. Otherwise, return just the first value.
    const content = values.length <= 1 ? values[0] : values;
    // If the property has no children, just return its value immediately.
    if (!property.children.length) return content;
    // Otherwise, return an object with the values for the property, along with the values for its children.
    // We do this, because for example, there can be a value under og:image which should still be parsed,
    // but there can also be child properties such as og:image:url or og:image:width.
    // "Value" is appended to the end of the property name to make it more clear, and to prevent things such
    // as `videoInfo.actor.actor` to grab the actor's name.
    const result = { ...optionalSpread(`${property.outputName}Value`, content) };
    for (const child of property.children) {
        // Parse every child exactly once; `optionalSpread` drops missing and empty results.
        Object.assign(result, optionalSpread(child.outputName, parseOpenGraphProperty(child, $)));
    }
    return result;
};
/**
 * Extracts OpenGraph metadata from a document.
 *
 * @param item Either an HTML string, which is loaded with cheerio, or an already loaded cheerio instance.
 * @param additionalProperties Optional extra property definitions, shaped like the entries of
 * `OPEN_GRAPH_PROPERTIES`. They are processed before the built-in ones, so a built-in property that
 * yields a non-empty value under the same `outputName` overrides them.
 * @returns An object keyed by `outputName` that contains only the properties with a non-empty value.
 */
function parseOpenGraph(item, additionalProperties) {
    const $ = typeof item === 'string' ? (0, cheerio_1.load)(item) : item;
    const result = {};
    for (const property of [...(additionalProperties || []), ...OPEN_GRAPH_PROPERTIES]) {
        Object.assign(result, optionalSpread(property.outputName, parseOpenGraphProperty(property, $)));
    }
    return result;
}
//# sourceMappingURL=open_graph_parser.js.map