UNPKG

bedetheque-scraper

Version:

Node.js script to scrape the entire database of dbgest.com / bedetheque.com (approx. 260.000+ albums)

89 lines (71 loc) 3.22 kB
import {ProxyFetcher} from './proxy-fetcher';
import {Utils} from './utils';
import {Serie} from './serie';
import {Album} from './album';
import {Author} from './author';

/** Number of identical proxy requests raced against each other when fetching a listing page. */
const LISTING_RACE_SIZE = 50;

/**
 * Second argument passed to `ProxyFetcher.requestProxy` for listing pages.
 * NOTE(review): presumably a timeout in milliseconds — confirm against proxy-fetcher.
 */
const LISTING_REQUEST_TIMEOUT = 10000;

/**
 * Second argument passed to `ProxyFetcher.requestProxy` for serie/author detail pages.
 * NOTE(review): presumably a timeout in milliseconds — confirm against proxy-fetcher.
 */
const DETAIL_REQUEST_TIMEOUT = 60000;

/** Value passed to `Utils.sleepFor` before the single retry of a failed detail request. */
const RETRY_DELAY = 500;

/** Per-index multiplier used to stagger the start of each detail request in a batch. */
const STAGGER_STEP = 500;

/**
 * Static entry points for scraping series and authors of bedetheque.com,
 * one starting letter at a time, through the proxy fetcher.
 */
export class Scraper {
  /**
   * Scrapes every serie listed under the given letter.
   *
   * Refreshes the proxy list, collects the serie URLs for the letter, then fetches
   * all of them concurrently; the request at position `i` is delayed by `i * STAGGER_STEP`.
   *
   * @param letter Letter segment inserted into the listing page URL.
   * @returns One `{serie, albums}` entry per serie URL, in listing order.
   *          Rejects as soon as any single serie fails (after its retry).
   */
  static async scrapeSeries(letter: string): Promise<Array<{serie: Serie; albums: Album[]}>> {
    await ProxyFetcher.updateProxyList();
    const urls = await this.getSeriesUrlFromLetter(letter);
    return Promise.all(urls.map((url, index) => this.getSerie(url, index * STAGGER_STEP)));
  }

  /**
   * Scrapes every author listed under the given letter.
   *
   * Refreshes the proxy list, collects the author URLs for the letter, then fetches
   * all of them concurrently; the request at position `i` is delayed by `i * STAGGER_STEP`.
   *
   * @param letter Letter segment inserted into the listing page URL.
   * @returns One `Author` per author URL, in listing order.
   *          Rejects as soon as any single author fails (after its retry).
   */
  static async scrapeAuthors(letter: string): Promise<Author[]> {
    await ProxyFetcher.updateProxyList();
    const urls = await this.getAuthorsUrlFromLetter(letter);
    return Promise.all(urls.map((url, index) => this.getAuthor(url, index * STAGGER_STEP)));
  }

  /**
   * Collects the serie page URLs listed under the given letter.
   *
   * Only list items whose image `src` contains `'France'` are kept
   * (NOTE(review): presumably a language/country flag icon — confirm against the site markup),
   * and `.html` in each link is replaced by `__10000.html`
   * (NOTE(review): presumably asks the site for up to 10000 albums on one page — confirm).
   *
   * List items without an image `src` or without a usable link `href` are skipped
   * instead of throwing, so one malformed entry cannot abort the whole letter.
   *
   * @param letter Letter segment inserted into the listing page URL.
   * @returns The rewritten serie URLs, in page order.
   */
  static async getSeriesUrlFromLetter(letter: string): Promise<string[]> {
    console.log(`🔍 ${letter}: searching for series urls...`);
    const url = `https://www.bedetheque.com/bandes_dessinees_${letter}.html`;
    const $: CheerioStatic = await this.fetchFirstSuccess(url);
    const urls = $('.nav-liste li')
      .filter((index, element) => {
        const src = $(element).find('img').attr('src');
        return src !== undefined && src.includes('France');
      })
      .map((index, element) => {
        const href = $(element).find('a').attr('href');
        // cheerio's map() ignores null/undefined results, which drops items without a link.
        return href ? href.replace('.html', '__10000.html') : null;
      })
      .get();
    console.log(`${letter}: found ${urls.length} series urls`);
    return urls;
  }

  /**
   * Collects the author page URLs listed under the given letter.
   *
   * @param letter Letter segment inserted into the listing page URL.
   * @returns The `href` of the first link of every `.nav-liste li` item, in page order.
   */
  static async getAuthorsUrlFromLetter(letter: string): Promise<string[]> {
    console.log(`🔍 ${letter}: searching for authors urls...`);
    const url = `https://www.bedetheque.com/liste_auteurs_BD_${letter}.html`;
    const $: CheerioStatic = await this.fetchFirstSuccess(url);
    const urls = $('.nav-liste li')
      .map((index, element) => $(element).find('a').attr('href'))
      .get();
    console.log(`${letter}: found ${urls.length} authors urls`);
    return urls;
  }

  /**
   * Fetches one serie page and builds the serie together with its albums.
   *
   * @param url Serie page URL.
   * @param sleepTime Value passed to `Utils.sleepFor` before the request is issued
   *                  (used by callers to stagger concurrent requests).
   * @returns The parsed serie and the albums built from its page.
   */
  static async getSerie(url: string, sleepTime: number): Promise<{serie: Serie; albums: Album[]}> {
    await Utils.sleepFor(sleepTime);
    const $ = await this.fetchWithRetry(url, 'serie');
    const serie = new Serie($);
    const albums = $('.liste-albums > li')
      // Keep only entries whose '.numa' text is empty.
      // NOTE(review): presumably excludes alternate editions/variants — confirm against the site markup.
      .filter((index, elem) => $(elem).find('.numa').text() === '')
      .map((index, elem) => new Album($(elem), $, serie.serieId, serie.serieTitle))
      .get() as unknown as Album[];
    await serie.addAlbumsInfo(albums);
    console.log(`✔ serie: ${serie.serieTitle} with ${serie.albumsId.length} albums`);
    return {serie, albums};
  }

  /**
   * Fetches one author page and builds the corresponding `Author`.
   *
   * @param url Author page URL.
   * @param sleepTime Value passed to `Utils.sleepFor` before the request is issued
   *                  (used by callers to stagger concurrent requests).
   * @returns The parsed author, after `getImageDimensions()` has completed.
   */
  static async getAuthor(url: string, sleepTime: number): Promise<Author> {
    await Utils.sleepFor(sleepTime);
    const $ = await this.fetchWithRetry(url, 'author');
    const author = new Author($);
    await author.getImageDimensions();
    console.log(author);
    console.log(`✔ author: ${author.name}`);
    return author;
  }

  /** Issues `LISTING_RACE_SIZE` identical proxy requests for `url` and resolves with the first success. */
  private static fetchFirstSuccess(url: string): Promise<CheerioStatic> {
    const attempts = Array.from(
      {length: LISTING_RACE_SIZE},
      () => ProxyFetcher.requestProxy(url, LISTING_REQUEST_TIMEOUT),
    );
    return Utils.raceFirstSuccess(attempts);
  }

  /** Requests `url` through a proxy; on failure logs `⟳ <label>: <url>`, waits, and retries exactly once. */
  private static fetchWithRetry(url: string, label: string) {
    return ProxyFetcher.requestProxy(url, DETAIL_REQUEST_TIMEOUT)
      .catch(async () => {
        console.log(`⟳ ${label}: ${url}`);
        await Utils.sleepFor(RETRY_DELAY);
        return ProxyFetcher.requestProxy(url, DETAIL_REQUEST_TIMEOUT);
      });
  }
}