bedetheque-scraper
Version:
Node.js script to scrape the entire database of bdgest.com / bedetheque.com (approx. 260.000+ albums)
45 lines (39 loc) • 1.63 kB
text/typescript
import * as cheerio from 'cheerio';
import lodash from 'lodash';
import axiosHttpsProxyFix from 'axios-https-proxy-fix';
/**
 * Address of an HTTP proxy.
 *
 * Objects of this shape are built from `host:port` text entries and are passed
 * as the `proxy` option of outgoing requests.
 */
export interface ProxyType{
/** Proxy host: the text before the `:` of a `host:port` entry. */
host: string;
/** Proxy port: the text after the `:` of a `host:port` entry, parsed as a base-10 integer. */
port: number;
}
/**
 * Helpers for sending HTTP requests through free public proxies.
 */
export class ProxyFetcher {
  /**
   * Downloads the current list of free HTTP proxies from api.proxyscrape.com.
   *
   * @param timeout - Value forwarded as the `timeout` query parameter of the API call.
   * @returns Every well-formed `host:port` entry of the response. Blank or
   * malformed lines are skipped, so an empty response yields an empty list.
   */
  public static async getFreeProxyList(timeout = 5000): Promise<ProxyType[]> {
    console.log('→ searching for free proxies');
    const response = await axiosHttpsProxyFix
      .get(`https://api.proxyscrape.com/?request=getproxies&proxytype=http&timeout=${timeout}`);
    const list = this.parseProxyList(response.data);
    console.log(`→ found ${list.length} free proxies`);
    return list;
  }

  /**
   * Fetches `urlRaw` through a randomly chosen proxy and loads the response body with cheerio.
   *
   * Each failed attempt (network error, timeout after 60 s, or a failure while
   * loading the body) is logged and retried with a newly picked proxy.
   *
   * @param list - Proxies to choose from. With an empty list the request is sent without a `proxy` option.
   * @param urlRaw - Unencoded URL; it is passed through `encodeURI` exactly once.
   * @param nbrRetry - Maximum number of attempts. Zero or a negative value performs no request.
   * @returns The value returned by `cheerio.load`, or `null` when every attempt failed.
   */
  public static async requestProxy(list: ProxyType[], urlRaw: string, nbrRetry = 5): Promise<any> {
    // Encode once, outside the retry loop: re-encoding an already encoded URL
    // would escape its '%' characters again ("%20" -> "%2520").
    const url = encodeURI(urlRaw);
    for (let remaining = nbrRetry; remaining > 0; remaining -= 1) {
      const proxy = this.getRandomProxy(list);
      try {
        const result = await this.timeoutRequest(60000, axiosHttpsProxyFix.get(url, { proxy }));
        return cheerio.load(result.data);
      } catch (error) {
        console.log(`⟳ request: ${remaining} - ${url}, ${this.describeError(error)}`);
      }
    }
    return null;
  }

  /** Converts the raw text body of the proxy API into validated proxy entries. */
  private static parseProxyList(raw: unknown): ProxyType[] {
    if (typeof raw !== 'string') { return []; }
    const proxies: ProxyType[] = [];
    // Accept both CRLF and LF line endings.
    for (const line of raw.split(/\r?\n/)) {
      const entry = line.trim();
      const separator = entry.indexOf(':');
      if (separator <= 0) { continue; } // blank line, or no host before the ':'
      const port = parseInt(entry.slice(separator + 1), 10);
      if (Number.isInteger(port) && port > 0 && port <= 65535) {
        proxies.push({ host: entry.slice(0, separator), port });
      }
    }
    return proxies;
  }

  /** Picks one proxy uniformly at random, or `undefined` when the list is empty. */
  private static getRandomProxy(list: ProxyType[]): ProxyType | undefined {
    if (list.length === 0) { return undefined; }
    return list[lodash.random(list.length - 1)];
  }

  /** Builds the short failure description used in retry log lines. */
  private static describeError(error: unknown): string {
    if (typeof error === 'object' && error !== null) {
      const details = error as { message?: unknown; code?: unknown };
      return String(details.message || details.code || error);
    }
    return String(error);
  }

  /**
   * Settles like `promise`, unless it is still pending after `ms` milliseconds,
   * in which case the result rejects with `Error('timeout')`.
   */
  private static timeoutRequest<T>(ms: number, promise: Promise<T>): Promise<T> {
    return new Promise<T>((resolve, reject) => {
      const timer = setTimeout(() => { reject(new Error('timeout')); }, ms);
      // Clear the timer once the request settles so it neither keeps the
      // process alive nor fires a pointless late rejection.
      promise.then(
        (value) => { clearTimeout(timer); resolve(value); },
        (error) => { clearTimeout(timer); reject(error); },
      );
    });
  }
}