bandcamp-fetch
Version:
Scrape Bandcamp content
212 lines • 8.84 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
const cheerio_1 = require("cheerio");
const html_entities_1 = require("html-entities");
const Constants_js_1 = require("../utils/Constants.js");
const Parse_js_1 = require("../utils/Parse.js");
const os_1 = require("os");
/**
 * Parser that turns the HTML of an article page into a plain article object.
 */
class ArticleParser {
    /**
     * Builds an article object from the HTML of an article page.
     *
     * Basic metadata is read from the first `application/ld+json` script on the
     * page, featured media items from the `data-player-infos` attribute of
     * `#p-daily-article`, and sections from the markup inside
     * `#p-daily-article article`.
     *
     * @param {string} html - HTML of the article page.
     * @param {object} [opts] - Parsing options. A missing value is treated as `{}`.
     * @param {string} [opts.imageBaseUrl] - Prefix used when building image URLs.
     * @param {object} [opts.albumImageFormat] - Its `id` is embedded in album image URLs;
     *   when absent, media item `imageUrl` stays `''`.
     * @param {object} [opts.artistImageFormat] - Its `id` is embedded in artist image URLs;
     *   when absent, artist `imageUrl` stays `''`.
     * @param {boolean} [opts.includeRawData] - When truthy, `raw` (parsed JSON data
     *   and article body HTML) is attached to the result.
     * @returns {object} Article with `title`, `description`, `url`, `imageUrl`, `date`,
     *   `category`, `author`, `mediaItems`, `sections` and, when found, `genre`.
     * @throws {ParseError} When the ld+json data is missing or is not valid JSON.
     */
    static parseArticle(html, opts) {
        // Every option is optional, so a missing options object must not crash the parse.
        const options = opts ?? {};
        const $ = (0, cheerio_1.load)(html);

        const ldJson = $('script[type="application/ld+json"]').html();
        if (!ldJson) {
            throw new Parse_js_1.ParseError('Failed to parse article: missing JSON data.', html);
        }
        let basic;
        try {
            basic = JSON.parse(ldJson);
        }
        catch (error) {
            throw new Parse_js_1.ParseError('Failed to parse article: JSON error in basic info.', html, error);
        }
        // Player data is best-effort: if it is absent or malformed the article is
        // still returned, just without media items.
        let players;
        try {
            players = JSON.parse((0, html_entities_1.decode)($('#p-daily-article').attr('data-player-infos')));
        }
        catch (error) {
            players = null;
        }

        // Leaves absolute URLs alone and normalizes anything else against URLS.DAILY.
        const toDailyUrl = (url) => ((0, Parse_js_1.isAbsoluteUrl)(url) ?
            url
            : (0, Parse_js_1.normalizeUrl)(url, Constants_js_1.URLS.DAILY));

        const article = {
            title: basic.headline,
            description: basic.description,
            url: basic['@id'],
            imageUrl: basic.image,
            date: basic.datePublished,
            category: {
                name: basic.articleSection
            },
            author: {
                name: basic.author.name,
                url: basic.author['@id']
            },
            mediaItems: [],
            sections: []
        };

        // Get genre
        const genreLink = $('.genre a');
        if (genreLink.length > 0) {
            article.genre = {
                name: genreLink.text(),
                url: genreLink.attr('href')
            };
            const genreReadMoreLink = $('.moreingenre a').attr('href');
            if (genreReadMoreLink) {
                article.genre.readMoreUrl = toDailyUrl(genreReadMoreLink);
            }
        }

        // Get category url
        const categoryLink = $('article-type a').attr('href');
        if (categoryLink) {
            article.category.url = toDailyUrl(categoryLink);
        }

        // Get media items (albums and tracks featured in article)
        for (const player of Array.isArray(players) ? players : []) {
            if (!player) {
                // `player` can be null
                continue;
            }
            let mediaItemType = null;
            if (player.parent_tralbum_type === 'a') {
                mediaItemType = 'album';
            }
            else if (player.parent_tralbum_type === 't') {
                mediaItemType = 'track';
            }
            if (!mediaItemType) {
                // Unknown player types are skipped.
                continue;
            }
            const mediaItem = {
                type: mediaItemType,
                id: player.parent_tralbum_id,
                name: player.title,
                url: player.tralbum_url,
                imageUrl: '',
                featuredTrackPosition: player.featured_track_number,
                artist: {
                    name: player.band_name,
                    url: player.band_url,
                    imageUrl: '',
                    location: player.band_location
                },
                tracks: [],
                mediaItemRef: player.player_id
            };
            if (player.art_id && options.albumImageFormat?.id) {
                mediaItem.imageUrl = `${options.imageBaseUrl}/img/a${player.art_id}_${options.albumImageFormat.id}.jpg`;
            }
            if (player.band_image_id && options.artistImageFormat?.id) {
                mediaItem.artist.imageUrl = `${options.imageBaseUrl}/img/${player.band_image_id}_${options.artistImageFormat.id}.jpg`;
            }
            // Only albums get a track list.
            if (mediaItemType === 'album' && Array.isArray(player.tracklist)) {
                mediaItem.tracks = player.tracklist.map((trackInfo) => {
                    const track = {
                        id: trackInfo.track_id,
                        position: trackInfo.track_number,
                        name: trackInfo.track_title,
                        duration: trackInfo.audio_track_duration,
                        streamUrl: trackInfo.audio_url?.['mp3-128']
                    };
                    const streamUrlHQ = trackInfo.audio_url?.['mp3-v0'];
                    if (streamUrlHQ) {
                        track.streamUrlHQ = streamUrlHQ;
                    }
                    return track;
                });
            }
            article.mediaItems.push(mediaItem);
        }

        // Appends each paragraph element to `section`, in the order given:
        // HTML is separated by one EOL, text by two.
        const appendParagraphs = (section, paragraphElements) => {
            for (const element of paragraphElements) {
                const paragraph = $(element);
                section.html += (section.html !== '' ? os_1.EOL : '') + (paragraph.html() || '');
                section.text += (section.text !== '' ? os_1.EOL + os_1.EOL : '') + paragraph.text();
            }
            return section;
        };

        // Returns the section corresponding to a media item player element.
        const getSectionByPlayer = (player) => {
            const section = {
                html: '',
                text: ''
            };
            // Heading: the first h3/h2 found walking backwards from the player,
            // stopping at the previous player.
            const headingHTML = player
                .prevUntil('.bamplayer-art, .player-not-available', 'h3, h2')
                .first()
                .html();
            if (headingHTML) {
                section.heading = {
                    html: headingHTML,
                    text: (0, Parse_js_1.stripTags)((0, Parse_js_1.brToNewLine)(headingHTML)).trim()
                };
            }
            // Body: paragraphs following the player up to the next player,
            // heading or end-of-article marker.
            const paragraphs = player.nextUntil('.bamplayer-art, .player-not-available, h3, h5, article-end', 'p');
            appendParagraphs(section, paragraphs.toArray());
            // The player id embedded in `data-bind` links the section to its media item.
            const playerIdMatch = player
                .attr('data-bind')
                ?.match(/playerMap\["(.+?)"]/);
            if (playerIdMatch?.[1]) {
                section.mediaItemRef = playerIdMatch[1];
            }
            return section;
        };

        // Returns the introductory paragraph(s) of the article, or null if there are none.
        const getIntroSection = (body) => {
            const firstPlayer = body
                .find('.bamplayer-art, .player-not-available')
                .first();
            // prevAll() yields siblings starting with the closest one, i.e. in
            // reverse document order; flip them back so the intro reads top to bottom.
            const paragraphElements = firstPlayer.length > 0 ?
                firstPlayer.prevAll('p').toArray().reverse()
                : body.find('p').toArray();
            if (paragraphElements.length === 0) {
                return null;
            }
            return appendParagraphs({ html: '', text: '' }, paragraphElements);
        };

        // Sections: optional intro followed by one section per player.
        const articleBody = $('#p-daily-article article');
        const sections = [];
        const introSection = getIntroSection(articleBody);
        if (introSection) {
            sections.push(introSection);
        }
        articleBody
            .find('.bamplayer-art, .player-not-available')
            .each((_i, player) => {
                sections.push(getSectionByPlayer($(player)));
            });
        article.sections = sections;

        if (options.includeRawData) {
            article.raw = {
                basic,
                mediaItems: players,
                body: articleBody.html() || ''
            };
        }
        return article;
    }
}
// Expose ArticleParser as the module's default export.
exports.default = ArticleParser;
//# sourceMappingURL=ArticleParser.js.map