// bandcamp-fetch
// Version: (unspecified)
// Scrape Bandcamp content
// 179 lines • 7.72 kB
// JavaScript
import { load as cheerioLoad } from 'cheerio';
import { normalizeUrl, reformatImageUrl, splitUrl } from '../utils/Parse.js';
import TrackInfoParser from '../track/TrackInfoParser.js';
import AlbumInfoParser from '../album/AlbumInfoParser.js';
import { decode } from 'html-entities';
// Matches a site-relative path that points at a track or an album.
// Regex taken from:
// https://github.com/masterT/bandcamp-scraper/blob/master/lib/htmlParser.js
const RELEASE_PATH_PATTERN = /^\/(track|album)\/(.+)$/;

// Captures the trailing digits of a `data-item-id` value such as 'album-123'.
const ITEM_ID_PATTERN = /(?:album|track)-(\d+)$/;

/**
 * Parses the HTML of a band / label page into the list of albums and tracks
 * it exposes.
 */
export default class DiscographyParser {
  /**
   * Extracts discography items from a page.
   *
   * One-album / one-track artists don't have a discography page; the page for
   * the album or track is loaded instead. That case is detected first and
   * delegated to `TrackInfoParser` / `AlbumInfoParser`. Otherwise items are
   * gathered from the page's links, followed by the entries listed in the
   * `data-client-items` attribute of an `<ol>` element, if any.
   *
   * @param {string} html - Page markup.
   * @param {object} opts - Parse options. Fields read here:
   *   `bandUrl` (base for site-relative links), `imageBaseUrl` and
   *   `imageFormat` (used to build image URLs).
   * @returns {Array<object>} For a regular page, objects of the form
   *   `{ url, type, id, name, artist: { name }, imageUrl? }`. For a
   *   single-release page, a one-element array holding the info object
   *   returned by the track / album parser, with `artist` (when present)
   *   reduced to `{ name }`.
   */
  static parseDiscography(html, opts) {
    const $ = cheerioLoad(html);
    const singleReleaseType = detectSingleReleaseType($);
    if (singleReleaseType !== null) {
      return parseSingleRelease(html, opts, singleReleaseType);
    }
    const defaultArtistName = $('#band-name-location').find('.title').text();
    return [
      ...collectLinkedItems($, opts, defaultArtistName),
      ...collectClientItems($, opts)
    ];
  }
}

/**
 * Decides whether the page is a lone track / album page rather than a
 * discography listing.
 *
 * @returns {'track'|'album'|null} The release type, or null for a listing.
 */
function detectSingleReleaseType($) {
  const ldJsonEl = $('script[type="application/ld+json"]');
  if (!ldJsonEl.length) {
    return null;
  }
  const ldJson = ldJsonEl.html();
  if (!ldJson) {
    return null;
  }
  let releaseData;
  try {
    releaseData = JSON.parse(ldJson);
  }
  catch (error) {
    return null;
  }
  if (!releaseData || typeof releaseData !== 'object') {
    return null;
  }
  // Only treat the page as a single release when the 'discography' element
  // is missing, hidden, or has exactly one track / album child.
  const discographyEl = $('#discography');
  const isListingAbsent = discographyEl.length === 0 ||
    discographyEl.css('display') === 'none' ||
    discographyEl.find('li').length === 1;
  if (!isListingAbsent) {
    return null;
  }
  let path;
  try {
    path = splitUrl(releaseData['@id']).path;
  }
  catch (error) {
    // The structured data carries no usable '@id' (splitUrl is assumed to
    // throw on unparseable input, as the link handling also expects), so
    // fall back to scraping the page's links instead of failing outright.
    return null;
  }
  if (typeof path !== 'string') {
    return null;
  }
  if (path.startsWith('/track/')) {
    return 'track';
  }
  if (path.startsWith('/album/')) {
    return 'album';
  }
  return null;
}

/**
 * Parses a lone track / album page with the matching info parser and wraps
 * the result in an array.
 */
function parseSingleRelease(html, opts, type) {
  const infoOpts = {
    imageBaseUrl: opts.imageBaseUrl,
    albumImageFormat: opts.imageFormat,
    artistImageFormat: null,
    includeRawData: false
  };
  const info = type === 'track'
    ? TrackInfoParser.parseInfo(html, infoOpts)
    : AlbumInfoParser.parseInfo(html, infoOpts);
  if (info.artist === undefined) {
    return [info];
  }
  // Keep only the artist's name.
  return [{ ...info, artist: { name: info.artist.name } }];
}

/**
 * Resolves a link target to `{ host, pathname }` when it points at a track
 * or album; returns null for every other link.
 */
function resolveReleaseLocation(href, bandUrl) {
  // Relative url starting with '/track' or '/album'
  if (RELEASE_PATH_PATTERN.test(href)) {
    return { host: bandUrl, pathname: href };
  }
  // Full url (label discography)
  try {
    const { base, path } = splitUrl(href);
    if (path && RELEASE_PATH_PATTERN.test(path)) {
      return { host: base, pathname: path };
    }
  }
  catch (error) {
    // Not a URL that can be split; such links are simply not releases.
  }
  return null;
}

/**
 * Scans every `<a>` on the page and merges what each track / album link
 * reveals (image, name, artist, id) into one record per URL. Records that
 * never obtained a name are left out of the result.
 */
function collectLinkedItems($, opts, defaultArtistName) {
  const itemsByUrl = new Map();
  $('a').each((index, link) => {
    const linkEl = $(link);
    const href = linkEl.attr('href');
    if (typeof href !== 'string' || href === '') {
      return;
    }
    const location = resolveReleaseLocation(href, opts.bandUrl);
    if (location === null || location.host === undefined || location.pathname === undefined) {
      return;
    }
    const url = normalizeUrl(location.pathname, location.host);
    let record = itemsByUrl.get(url);
    if (record === undefined) {
      record = { type: location.pathname.startsWith('/track/') ? 'track' : 'album' };
      itemsByUrl.set(url, record);
    }

    // Link element may wrap around img and title
    const img = linkEl.find('img');
    if (img.length) {
      const imgSrc = img.attr('data-original') || img.attr('src');
      const imageUrl = reformatImageUrl(imgSrc, opts.imageFormat);
      if (imageUrl) {
        record.imageUrl = imageUrl;
      }
    }

    const title = linkEl.find('.title');
    if (title.length) {
      // For labels, title element contains artist name (when it doesn't, then artist = label).
      // For artists, title element may also contain an artist name which overrides the default
      const artistNameEl = title.find('.artist-override');
      if (artistNameEl.length) {
        record.artist = { name: artistNameEl.text().trim() };
        // Detach the artist node so the title text below is the release name only.
        artistNameEl.remove();
      }
      else {
        record.artist = { name: defaultArtistName };
      }
      record.name = title.text().trim();
    }
    else if (!img.length) {
      // Plain text link (neither image nor title element): the link text
      // itself is the release name. Blank text is ignored so that it cannot
      // wipe a name obtained from another link to the same URL.
      const linkText = linkEl.text().trim();
      if (linkText) {
        record.name = linkText;
      }
    }

    const idStr = linkEl.parent('li').attr('data-item-id');
    if (idStr) {
      const idMatch = ITEM_ID_PATTERN.exec(idStr);
      if (idMatch) {
        record.id = Number(idMatch[1]);
      }
    }
  });

  const results = [];
  for (const [url, record] of itemsByUrl) {
    if (!record.name) {
      continue;
    }
    const item = {
      url,
      type: record.type,
      id: record.id,
      name: record.name,
      artist: record.artist || { name: defaultArtistName }
    };
    if (record.imageUrl) {
      item.imageUrl = record.imageUrl;
    }
    results.push(item);
  }
  return results;
}

/**
 * Reads the JSON array stored in the `data-client-items` attribute and
 * converts its album / track entries into result items. Returns an empty
 * array when the attribute is absent or does not hold a JSON array.
 */
function collectClientItems($, opts) {
  const rawItems = decode($('ol[data-client-items]').attr('data-client-items'));
  let clientItems;
  try {
    clientItems = JSON.parse(rawItems);
  }
  catch (error) {
    return [];
  }
  if (!Array.isArray(clientItems)) {
    return [];
  }
  const results = [];
  for (const itemData of clientItems) {
    // `?.` skips null entries instead of throwing on them.
    const type = itemData?.type;
    if (type !== 'album' && type !== 'track') {
      continue;
    }
    const pageUrl = itemData.page_url;
    // Site-relative paths are resolved against the band URL, the same way
    // relative link hrefs are; anything else is passed through as-is.
    const isSiteRelative = typeof pageUrl === 'string' && RELEASE_PATH_PATTERN.test(pageUrl);
    const item = {
      url: isSiteRelative ? normalizeUrl(pageUrl, opts.bandUrl) : normalizeUrl(pageUrl),
      type,
      id: itemData.id,
      name: itemData.title,
      artist: {
        name: itemData.artist
      }
    };
    if (itemData.art_id && opts.imageFormat?.id) {
      item.imageUrl = `${opts.imageBaseUrl}/img/a${itemData.art_id}_${opts.imageFormat.id}.jpg`;
    }
    results.push(item);
  }
  return results;
}
//# sourceMappingURL=DiscographyParser.js.map