UNPKG

bedetheque-scraper

Version:

Node.js script to scrape the entire database of bdgest.com / bedetheque.com (approx. 260.000+ albums)

101 lines (90 loc) 3.35 kB
// The image fields only store the file name; full URLs are rebuilt as follows:
// imageCoverLarge: https://www.bedetheque.com/media/Couvertures/${imageCover}
// imageCoverSmall: https://www.bedetheque.com/cache/thb_couv/${imageCover}
// imageExtractLarge: https://www.bedetheque.com/media/Planches/${imageExtract}
// imageExtractSmall: https://www.bedetheque.com/cache/thb_planches/${imageExtract}
// imageReverseLarge: https://www.bedetheque.com/media/Versos/${imageReverse}
// imageReverseSmall: https://www.bedetheque.com/cache/thb_versos/${imageReverse}

/**
 * One album extracted from an album fragment of a serie page.
 *
 * Every field is filled in by the constructor from the markup of `page`.
 * Optional information that is not found in the markup is stored as `null`
 * (never `undefined`), which matches the declared field types.
 */
export class Album {
  public serieId: number;
  public albumId: number;
  public serieTitle: string;
  public albumTitle: string | null;
  public scenario: string | null;
  public drawing: string | null;
  public colors: string | null;
  public date: string | null;
  public editor: string | null;
  public nbrOfPages: number | null;
  public imageCover: string | null;
  public imageExtract: string | null;
  public imageReverse: string | null;
  // voteAverage /100
  public voteAverage: number;
  public voteCount: number;

  /**
   * @param page Cheerio selection wrapping the album's markup.
   * @param $ Cheerio API used to wrap the raw `<li>` elements of the info list.
   * @param serieId Identifier of the serie the album belongs to.
   * @param serieTitle Title of the serie the album belongs to.
   */
  constructor(page: Cheerio, $: CheerioAPI, serieId: number, serieTitle: string) {
    this.serieId = serieId;
    this.serieTitle = serieTitle;
    // The id is read from the `name` attribute of the first child; NaN when absent.
    this.albumId = parseInt(page.children().first().attr('name'), 10);

    const title = page.find('.album-main .titre').attr('title');
    this.albumTitle = typeof title === 'string' ? title : null;

    this.imageCover = this.findImage(page, 'browse-couvertures', 'Couvertures');
    this.imageExtract = this.findImage(page, 'browse-planches', 'Planches');
    this.imageReverse = this.findImage(page, 'browse-versos', 'Versos');
    this.voteAverage = this.findVoteAverage(page);
    this.voteCount = this.findVoteCount(page);

    // Start every optional detail at null so a missing `<li>` yields null
    // instead of leaving the property undefined.
    this.scenario = null;
    this.drawing = null;
    this.colors = null;
    this.date = null;
    this.editor = null;
    this.nbrOfPages = null;
    this.addDetails(page, $);
  }

  /**
   * Reads the rating shown in `.ratingblock strong` and scales it by 20
   * (presumably a /5 rating converted to /100 -- confirm against the site).
   *
   * @returns The scaled rating, or 0 when the text is missing or not numeric.
   */
  private findVoteAverage(page: Cheerio): number {
    const rating = parseFloat(page.find('.ratingblock strong').text());
    return isNaN(rating) ? 0 : 20 * rating;
  }

  /**
   * Extracts the number N from a "(N vote" fragment in `.ratingblock p`.
   *
   * @returns The vote count, or 0 when the paragraph is missing or does not
   * contain such a fragment (previously this case threw a TypeError).
   */
  private findVoteCount(page: Cheerio): number {
    const ratingText = page.find('.ratingblock p').text();
    const match = ratingText ? /\(([0-9]+) vote/.exec(ratingText) : null;
    return match ? parseInt(match[1], 10) : 0;
  }

  /**
   * Returns the file name of the image linked by `.sous-couv .<className>`.
   *
   * @param className CSS class of the link inside `.sous-couv`.
   * @param path Media sub-folder whose URL prefix is stripped from the href.
   * @returns The href without its media prefix, or null when there is no href.
   */
  private findImage(page: Cheerio, className: string, path: string): string | null {
    const href = page.find(`.sous-couv .${className}`).attr('href');
    if (!href) {
      return null;
    }
    return href.replace(`https://www.bedetheque.com/media/${path}/`, '');
  }

  /** Feeds every `.infos > li` entry of the album to {@link addDetail}. */
  private addDetails(page: Cheerio, $: CheerioAPI): void {
    page.find('.infos > li').each((_index, info) => {
      this.addDetail($(info));
    });
  }

  /**
   * Stores one "label : value" entry in the matching field.
   *
   * The key is the lower-cased label text without its trailing " :"; the value
   * is everything after the FIRST colon of the entry's text, so values that
   * contain a colon themselves are kept whole. Entries with an empty value or
   * an unknown label are ignored.
   */
  private addDetail(pageInfo: Cheerio): void {
    const key = pageInfo
      .find('label')
      .text()
      .trim()
      .toLowerCase()
      .replace(' :', '');

    const text = pageInfo.text();
    const separator = text.indexOf(':');
    const value = separator === -1 ? '' : text.slice(separator + 1).trim();
    if (!value) {
      return;
    }

    switch (key) {
      case 'scénario':
        this.scenario = value;
        break;
      case 'dessin':
        this.drawing = value;
        break;
      case 'couleurs':
        this.colors = value;
        break;
      case 'dépot légal':
        // Keep only the first 7 characters of the deposit date text.
        this.date = value.slice(0, 7);
        break;
      case 'editeur':
        this.editor = value;
        break;
      case 'planches': {
        const pages = parseInt(value, 10);
        // Leave the field at null rather than storing NaN.
        if (!isNaN(pages)) {
          this.nbrOfPages = pages;
        }
        break;
      }
      default:
        break;
    }
  }
}