/*
 * html-metadata
 * Version:
 * Scrapes metadata of several different standards
 * 711 lines (639 loc) • 21.7 kB
 * JavaScript
 */
const microdata = require( 'microdata-node' ); // Schema.org microdata
/**
* Returns Object containing all available datatypes, keyed
* using the same keys as in metadataFunctions.
*
* @param {Object} chtml html Cheerio object to parse
* @return {Object} Promise for metadata
*/
exports.parseAll = function ( chtml ) {
// Array of keys corresponding to position of promise
const keys = Object.keys( exports.metadataFunctions );
const meta = {}; // Metadata keyed by keys in exports.metadataFunctions
// Array of promises for metadata of each type in exports.metadataFunctions
const arr = keys.map( ( key ) => exports.metadataFunctions[ key ]( chtml ) );
let result; // Result in for loop over results
let key; // Key corresponding to location of result
return Promise.all( arr.map( ( promise ) => promise.then(
// Create a promise that will always resolve with either the result or the error
( value ) => ( { status: 'fulfilled', value } ),
( error ) => ( { status: 'rejected', reason: error } )
)
) )
.then( ( results ) => {
Object.keys( results ).forEach( ( r ) => {
result = results[ r ];
key = keys[ r ];
if ( result && result.status === 'fulfilled' && result.value ) {
meta[ key ] = result.value;
}
} );
if ( Object.keys( meta ).length === 0 ) {
throw new Error( 'No metadata found in page' );
}
return meta;
} );
};
/**
* Base scraper for tags, used by some other parsing functions
*
* @param {Object} chtml html Cheerio object
* @param {string[]} tags tag types to process
* @param {string} reason message when metadata is not found
* @param {Function} getProperty function that gets the property of an element
* @param {Function} getContent function that gets the content of an element
* @return {Object} promise of metadata object
*/
exports.parseBase = function ( chtml, tags, reason, getProperty, getContent ) {
return new Promise( ( resolve, reject ) => {
const meta = {};
const metaTags = chtml( tags.join() );
if ( !metaTags || metaTags.length === 0 ) {
reject( new Error( reason ) );
}
metaTags.each( function () {
const element = chtml( this );
const property = getProperty( element );
const content = getContent( element );
// If lacks property or content, skip
if ( !property || !content ) {
return;
}
// If the property already exists, make the array of contents
if ( meta[ property ] ) {
if ( meta[ property ] instanceof Array ) {
meta[ property ].push( content );
} else {
meta[ property ] = [ meta[ property ], content ];
}
} else {
meta[ property ] = content;
}
} );
if ( !Object.keys( meta ).length ) {
reject( new Error( reason ) );
}
resolve( meta );
} );
};
/**
* Scrapes BE Press metadata given html object
*
* @param {Object} chtml html Cheerio object
* @return {Object} promise of BE Press metadata object
*/
exports.parseBEPress = function ( chtml ) {
return exports.parseBase(
chtml,
[ 'meta' ],
'No BE Press metadata found in page',
( element ) => {
const content = element.attr( 'content' );
const name = element.attr( 'name' );
// If the element isn't a BE Press property or if content is missing, skip it
if ( !name || !content || ( name.slice( 0, 17 ).toLowerCase() !== 'bepress_citation_' ) ) {
return;
}
return name.slice( 17 ).toLowerCase();
},
( element ) => element.attr( 'content' )
);
};
/**
* Scrapes COinS data given Cheerio loaded html object
*
* @param {Object} chtml html Cheerio object
* @return {Object} Promise for COinS metadata
*/
exports.parseCOinS = function ( chtml ) {
let title;
const metadata = [];
const tags = chtml( 'span[class=Z3988]' );
const promArray = [];
// Add promises for parsed title tags to an Array
tags.each( function () {
title = chtml( this ).attr( 'title' );
promArray.push( exports.parseCOinSTitle( title ) );
} );
// Once promises have resolved, add any successfully parsed titles to the metadata Array
return Promise.all( promArray.map( ( promise ) => promise.then(
( value ) => ( { status: 'fulfilled', value } ),
( error ) => ( { status: 'rejected', reason: error } )
) ) ).then( ( results ) => {
let result;
for ( const r in results ) {
result = results[ r ];
if ( result && result.status === 'fulfilled' && result.value ) {
metadata.push( result.value );
}
}
if ( !metadata.length ) {
throw new Error( 'No COinS metadata found' );
} else {
return metadata;
}
} );
};
/**
* Parses value of COinS title tag
*
* @param {string} title String corresponding to value of title tag in span element
* @return {Object} Promise for CoinS metadata
*/
exports.parseCOinSTitle = function ( title ) {
return new Promise( ( resolve, reject ) => {
const metadata = {};
const rft = {};
let value;
let key;
if ( typeof title !== 'string' ) {
reject( new Error( 'Provided value must be a string; Got ' + typeof title ) );
}
title = title.replace( /&/g, '&' ); // Allows function to take the raw html string
title = title.split( '&' );
title.forEach( ( element ) => {
element = element.split( '=' );
if ( element.length !== 2 ) {
return;
} // Invalid element
key = element[ 0 ].toLowerCase(); // Be case-insensitive for properties
value = decodeURIComponent( element[ 1 ].replace( /\+/g, '%20' ) ); // Replace + with encoded space since they aren't getting decoded as spaces
key = key.split( '.' ); // Split hierarchical keys
if ( key.length === 1 ) { // Top level key
metadata[ key[ 0 ] ] = value;
return;
}
if ( key.length === 2 ) { // Split key e.g. rft.date
if ( key[ 0 ] !== 'rft' ) {
return;
} // Invalid hierarchical key
// Keys that may have multiple values - return in list format
if ( key[ 1 ] === 'au' || key[ 1 ] === 'isbn' || key[ 1 ] === 'issn' || key[ 1 ] === 'eissn' || key[ 1 ] === 'aucorp' ) {
if ( !rft[ key[ 1 ] ] ) {
rft[ key[ 1 ] ] = [];
}
rft[ key[ 1 ] ].push( value );
return;
}
// Add rft value to rft key - this will overwrite duplicates, if they exist
rft[ key[ 1 ] ] = value;
}
} );
if ( Object.keys( rft ).length ) { // Add rft object if it is not empty
metadata.rft = rft;
}
if ( !Object.keys( metadata ).length ) {
reject( new Error( 'No COinS in provided string' ) );
}
if ( metadata.rft && metadata.rft.genre ) {
// Genre should be case insensitive as this field may be used programmatically
metadata.rft.genre = metadata.rft.genre.toLowerCase();
}
resolve( metadata );
} );
};
/**
* Scrapes Dublin Core data given Cheerio loaded html object
*
* @param {Object} chtml html Cheerio object
* @return {Object} Promise for DC metadata
*/
exports.parseDublinCore = function ( chtml ) {
return exports.parseBase(
chtml,
[ 'meta', 'link' ],
'No Dublin Core metadata found in page',
( element ) => {
const isLink = element[ 0 ].name === 'link';
const nameAttr = element.attr( isLink ? 'rel' : 'name' );
const value = element.attr( isLink ? 'href' : 'content' );
// If the element isn't a Dublin Core property or if value is missing, skip it
if ( !nameAttr || !value ||
( nameAttr.slice( 0, 3 ).toUpperCase() !== 'DC.' &&
nameAttr.slice( 0, 8 ).toUpperCase() !== 'DCTERMS.' ) ) {
return;
}
const property = nameAttr.slice( Math.max( 0, nameAttr.lastIndexOf( '.' ) + 1 ) ).toLowerCase();
return property;
},
( element ) => {
const isLink = element[ 0 ].name === 'link';
return element.attr( isLink ? 'href' : 'content' );
}
);
};
/**
* Scrapes EPrints data given Cheerio loaded html object
*
* @param {Object} chtml html Cheerio object
* @return {Object} Promise for EPrints metadata
*/
exports.parseEprints = function ( chtml ) {
return exports.parseBase(
chtml,
[ 'meta' ],
'No EPrints metadata found in page',
( element ) => {
const nameAttr = element.attr( 'name' );
const content = element.attr( 'content' );
// If the element isn't an EPrints property or content is missing, skip it
if ( !nameAttr || !content || nameAttr.slice( 0, 8 ).toLowerCase() !== 'eprints.' ) {
return;
}
let property = nameAttr.slice( Math.max( 0, nameAttr.lastIndexOf( '.' ) + 1 ) );
// Lowercase property
property = property.toLowerCase();
return property;
},
( element ) => element.attr( 'content' )
).then( ( results ) => {
if ( results.type ) {
results.type = results.type.toLowerCase(); // Standardise 'type' field to lowercase
}
return results;
} );
};
/**
* Scrapes general metadata terms given Cheerio loaded html object
*
* @param {Object} chtml html Cheerio object
* @return {Object} Promise for general metadata
*/
exports.parseGeneral = function ( chtml ) {
return new Promise( ( resolve, reject ) => {
const clutteredMeta = {
appleTouchIcons: chtml( 'link[rel=apple-touch-icon i]' ).map( ( i, e ) => ( {
href: e.attribs.href,
sizes: e.attribs.sizes
} ) ).get(), // apple-touch-icon <link rel="apple-touch-icon" href="" sizes="">
icons: chtml( 'link[rel="shortcut icon" i], link[rel="icon" i]' ).map( ( i, e ) => ( {
href: e.attribs.href,
sizes: e.attribs.sizes,
type: e.attribs.type
} ) ).get(), // icon <link rel="icon" href="" sizes="" type="">
author: chtml( 'meta[name=author i]' ).first().attr( 'content' ), // author <meta name="author" content="">
authorlink: chtml( 'link[rel=author i]' ).first().attr( 'href' ), // author link <link rel="author" href="">
canonical: chtml( 'link[rel=canonical i]' ).first().attr( 'href' ), // canonical link <link rel="canonical" href="">
description: chtml( 'meta[name=description i]' ).attr( 'content' ), // meta description <meta name ="description" content="">
publisher: chtml( 'link[rel=publisher i]' ).first().attr( 'href' ), // publisher link <link rel="publisher" href="">
robots: chtml( 'meta[name=robots i]' ).first().attr( 'content' ), // robots <meta name ="robots" content="">
shortlink: chtml( 'link[rel=shortlink i]' ).first().attr( 'href' ), // short link <link rel="shortlink" href="">
title: chtml( 'title' ).first().text(), // title tag <title>
lang: chtml( 'html' ).first().attr( 'lang' ) || chtml( 'html' ).first().attr( 'xml:lang' ), // lang <html lang=""> or <html xml:lang="">
dir: chtml( 'html' ).first().attr( 'dir' ) // dir <html dir="">
};
// Copy key-value pairs with defined values to meta
const meta = {};
let value;
let notEmpty = false;
Object.keys( clutteredMeta ).forEach( ( key ) => {
notEmpty = false;
value = clutteredMeta[ key ];
let innerValue;
if ( value && typeof value === 'object' ) {
let i;
for ( i = 0; i < Object.keys( value ).length; i++ ) {
const definedValue = {};
// eslint-disable-next-line no-loop-func
Object.keys( value[ i ] ).forEach( ( objectProperty ) => {
innerValue = value[ i ][ objectProperty ];
if ( innerValue ) {
definedValue[ objectProperty ] = innerValue;
notEmpty = true;
}
} );
value[ i ] = definedValue;
}
} else {
notEmpty = true;
}
if ( value && notEmpty ) { // Only add if has value
meta[ key ] = value;
}
} );
// Reject promise if meta is empty
if ( Object.keys( meta ).length === 0 ) {
reject( new Error( 'No general metadata found in page' ) );
}
// Resolve on meta
resolve( meta );
} );
};
/**
* Scrapes Highwire Press metadata given html object
*
* @param {Object} chtml html Cheerio object
* @return {Object} promise of highwire press metadata object
*/
exports.parseHighwirePress = function ( chtml ) {
return exports.parseBase(
chtml,
[ 'meta' ],
'No Highwire Press metadata found in page',
( element ) => {
const nameAttr = element.attr( 'name' );
const content = element.attr( 'content' );
// If the element isn't a Highwire Press property, skip it
if ( !nameAttr || !content || ( nameAttr.slice( 0, 9 ).toLowerCase() !== 'citation_' ) ) {
return;
}
return nameAttr.slice( Math.max( 0, nameAttr.indexOf( '_' ) + 1 ) ).toLowerCase();
},
( element ) => element.attr( 'content' )
);
};
/**
* Returns JSON-LD provided by page given HTML object
*
* @param {Object} chtml html Cheerio object
* @return {Object} Promise for JSON-LD
*/
exports.parseJsonLd = function ( chtml ) {
return new Promise( ( resolve, reject ) => {
const json = [];
const jsonLd = chtml( 'script[type="application/ld+json"]' );
jsonLd.each( function () {
let contents;
try {
contents = JSON.parse( this.children[ 0 ].data );
} catch ( e ) {
// Fail silently, just in case there are valid tags
return;
}
if ( contents ) {
json.push( contents );
} else {
return;
}
} );
if ( json.length === 0 ) {
reject( new Error( 'No JSON-LD valid script tags present on page' ) );
}
resolve( json.length > 1 ? json : json[ 0 ] );
} );
};
/**
* Scrapes OpenGraph data given html object
*
* @param {Object} chtml html Cheerio object
* @return {Object} promise of open graph metadata object
*/
exports.parseOpenGraph = function ( chtml ) {
return new Promise( ( resolve, reject ) => {
let property;
let node;
const meta = {};
const metaTags = chtml( 'meta' );
const namespace = [ 'og', 'fb' ];
const subProperty = {
image: 'url',
video: 'url',
audio: 'url'
};
const roots = {}; // Object to store roots of different type i.e. image, audio
let subProp; // Current subproperty of interest
const reason = new Error( 'No openGraph metadata found in page' );
if ( !metaTags || metaTags.length === 0 ) {
reject( reason );
}
metaTags.each( function () {
const element = chtml( this );
let propertyValue = element.attr( 'property' );
const content = element.attr( 'content' );
if ( !propertyValue || !content ) {
return;
} else {
propertyValue = propertyValue.toLowerCase().split( ':' );
}
// If the property isn't in namespace, exit
if ( !namespace.includes( propertyValue[ 0 ] ) ) {
return;
}
if ( propertyValue.length === 2 ) {
property = propertyValue[ 1 ]; // Set property to value after namespace
if ( property in subProperty ) { // If has valid subproperty
node = {};
node[ subProperty[ property ] ] = content;
roots[ property ] = node;
} else {
node = content;
}
// If the property already exists, make the array of contents
if ( meta[ property ] ) {
if ( meta[ property ] instanceof Array ) {
meta[ property ].push( node );
} else {
meta[ property ] = [ meta[ property ], node ];
}
} else {
meta[ property ] = node;
}
} else if ( propertyValue.length === 3 ) { // Property part of a vertical
// i.e. image, audio - as properties, not values, these should be lower case
subProp = propertyValue[ 1 ].toLowerCase();
// i.e. height, width - as properties, not values, these should be lower case
property = propertyValue[ 2 ].toLowerCase();
// If root for subproperty exists, and there isn't already a property
// called that in there already i.e. height, add property and content.
if ( roots[ subProp ] && !roots[ subProp ][ property ] ) {
// As properties, not values, these should be lower case
roots[ subProp ][ property ] = content.toLowerCase();
}
} else {
return; // Discard values with length <2 and >3 as invalid
}
// Check for "type" property and add to namespace if so
// If any of these type occur in order before the type attribute is defined,
// they'll be skipped; spec requires they be placed below type definition.
// For nested types (e.g. video.movie) the OG protocol uses the super type
// (e.g. movie) as the new namespace.
if ( property === 'type' ) {
namespace.push( content.split( '.' )[ 0 ].toLowerCase() ); // Add the type to the acceptable namespace list - as a property, should be lower case
}
} );
if ( Object.keys( meta ).length === 0 ) {
reject( reason );
}
if ( meta.type ) {
// Make type case insensitive as this may be used programmatically
meta.type = meta.type.toLowerCase();
}
resolve( meta );
} );
};
/**
* Scrapes schema.org microdata given Cheerio loaded html object
*
* @param {Object} chtml Cheerio object with html loaded
* @return {Object} promise of schema.org microdata object
*/
exports.parseSchemaOrgMicrodata = function ( chtml ) {
return new Promise( ( resolve, reject ) => {
if ( !chtml ) {
reject( new Error( 'Undefined argument' ) );
}
const meta = microdata.toJson( chtml.html() );
if ( !meta || !meta.items || !meta.items[ 0 ] ) {
reject( new Error( 'No schema.org metadata found in page' ) );
}
resolve( meta );
} );
};
/**
* Scrapes twitter microdata given Cheerio html object
*
* @param {Object} chtml html Cheerio object
* @return {Object} promise of twitter metadata object
*/
exports.parseTwitter = function ( chtml ) {
return new Promise( ( resolve, reject ) => {
if ( !chtml ) {
reject( new Error( 'Undefined argument' ) );
}
const meta = {};
const metaTags = chtml( 'meta' );
// These properties can either be strings or objects
const dualStateSubProperties = {
image: 'url',
player: 'url',
creator: '@username'
};
metaTags.each( function () {
const element = chtml( this );
let name = element.attr( 'name' );
let property;
const content = element.attr( 'content' );
let node;
// Exit if not a twitter tag or content is missing
if ( !name || !content ) {
return;
} else {
name = name.toLowerCase().split( ':' );
property = name[ 1 ];
}
// Exit if tag not twitter metadata
if ( name[ 0 ] !== 'twitter' ) {
return;
}
// Handle nested properties
if ( name.length > 2 ) {
const subProperty = name[ 2 ];
// Upgrade the property to an object if it needs to be
if ( property in dualStateSubProperties &&
!( meta[ property ] instanceof Object ) ) {
node = {};
node[ dualStateSubProperties[ property ] ] = meta[ property ];
// Clear out the existing string as we just placed it into our new node
meta[ property ] = [];
} else {
// Either create a new node or ammend the existing one
node = meta[ property ] ? meta[ property ] : {};
}
// Differentiate betweeen twice and thrice nested properties
// Not the prettiest solution, but twitter metadata guidelines are fairly strict,
// so it's not nessesary to anticipate strange data.
if ( name.length === 3 ) {
node[ subProperty ] = content;
} else if ( name.length === 4 ) {
// Solve twitter:player:stream:content_type where stream needs to be an obj
if ( subProperty.toLowerCase() === 'stream' ) {
node[ subProperty ] = { url: node[ subProperty ] };
} else {
// Either create a new subnode or amend the existing one
node[ subProperty ] = node[ subProperty ] ? node[ subProperty ] : {};
}
node[ subProperty ][ name[ 3 ] ] = content;
} else {
// Something is malformed, so exit
return;
}
} else {
node = content;
}
// Create array if property exists and is not a nested object
if ( meta[ property ] && !( meta[ property ] instanceof Object ) ) {
if ( meta[ property ] instanceof Array ) {
meta[ property ].push( node );
} else {
meta[ property ] = [ meta[ property ], node ];
}
} else {
meta[ property ] = node;
}
} );
if ( Object.keys( meta ).length === 0 ) {
reject( new Error( 'No twitter metadata found on this page' ) );
}
resolve( meta );
} );
};
/**
* Scrapes prism metadata given Cheerio html object
*
* @param {Object} chtml html Cheerio object
* @return {Object} promise of prism metadata object
*/
exports.parsePrism = function ( chtml ) {
return new Promise( ( resolve, reject ) => {
if ( !chtml ) {
reject( new Error( 'Undefined argument' ) );
}
const meta = {};
const metaTags = chtml( 'meta' );
const reason = new Error( 'No PRISM metadata found in page' );
if ( !metaTags || metaTags.length === 0 ) {
reject( reason );
}
metaTags.each( function () {
const element = chtml( this );
let name = element.attr( 'name' );
const content = element.attr( 'content' );
if ( !name || !content ) {
return;
} else {
name = name.split( '.' );
}
// If the name does not have the prism prefix, exit
if ( name[ 0 ].toLowerCase() !== 'prism' ) {
return;
}
// Set the name to the value after the prefix
name = name[ 1 ];
// Set the first character to lower case
name = name.charAt( 0 ).toLowerCase() + name.slice( 1 );
// If the name already exists, make an array of the contents
if ( meta[ name ] ) {
if ( meta[ name ] instanceof Array ) {
meta[ name ].push( content );
} else {
meta[ name ] = [ meta[ name ], content ];
}
} else {
meta[ name ] = content;
}
} );
if ( Object.keys( meta ).length === 0 ) {
reject( reason );
}
resolve( meta );
} );
};
/**
 * Global exportable list of scraping promises with string keys.
 *
 * exports.parseAll iterates over the keys of this object, calls each
 * function with the Cheerio object, and reports each successful result
 * under the same key. Every value is one of the scraper functions defined
 * in this file.
 *
 * @type {Object}
 */
exports.metadataFunctions = {
bePress: exports.parseBEPress,
coins: exports.parseCOinS,
dublinCore: exports.parseDublinCore,
eprints: exports.parseEprints,
general: exports.parseGeneral,
highwirePress: exports.parseHighwirePress,
jsonLd: exports.parseJsonLd,
openGraph: exports.parseOpenGraph,
schemaOrg: exports.parseSchemaOrgMicrodata,
twitter: exports.parseTwitter,
prism: exports.parsePrism
};
;