crawlerzzz
Version:
53 lines (47 loc) • 1.16 kB
text/typescript
import liveCrawler, { LinkType } from '../src/liveCrawler';
import { Page } from 'puppeteer';
import * as cheerio from 'cheerio';
import { propArraySelector } from '../src/htmlSelector';
/**
 * Per-page callback passed to `liveCrawler`: reads the page's current markup
 * and prints it to stdout.
 *
 * @param page - Puppeteer page; its markup is obtained via `page.content()`.
 * @param link - Link descriptor supplied by the crawler. Currently unused, but
 *   kept so the callback signature stays the same.
 * @returns A promise that resolves once the markup has been logged.
 */
const processPage = async (page: Page, link: LinkType): Promise<void> => {
  const content = await page.content();
  // The markup is only logged here, so it is not parsed with cheerio in this
  // callback.
  console.log('content', content);
};
// Start a crawl of https://www.goodreads.com beginning at /quotes. For every
// page source the crawler emits, queue the pagination links found in it and
// print the quotes extracted from it.
liveCrawler({
  domain: 'https://www.goodreads.com',
  startFrom: '/quotes',
  processPage,
  manualLogin: true,
  rateLimit: 3000,
  secondsWaitForLogin: 10,
  launchConfig: {
    headless: false,
    userDataDir: 'browsercache'
  }
  // NOTE(review): the directive below silences a type error on the `.then`
  // line; the resolved type of `liveCrawler` is not visible here, so it is
  // left in place. TODO: type the result and drop it.
  //@ts-ignore
}).then(({ pageSource$, queueLink }) => {
  //@ts-ignore
  pageSource$.subscribe(res => {
    const $ = cheerio.load(res.src);

    // Queue a link only when the element really has a non-empty href.
    // `attr('href')` yields undefined when the attribute (or the element
    // itself) is absent, and that value must not reach the queue.
    const enqueue = (href: string | undefined): void => {
      if (!href) {
        return;
      }
      queueLink(href);
    };

    // Wrapped in an arrow so `queueLink` receives just the href, not the
    // extra (index, array) arguments an array callback is called with.
    $('div.pagingnav a')
      .toArray()
      .forEach(anchor => enqueue($(anchor).attr('href')));

    // One record per `.quotes .quote` element.
    const quotes = propArraySelector($, '.quotes .quote', {
      // Text of the first child node of `.quoteText`, trimmed.
      content: (q: any) =>
        $('.quoteText', q)
          .contents()
          .first()
          .text()
          .trim(),
      author: (q: any) =>
        $('.authorOrTitle', q)
          .text()
          .trim()
    });

    // No `.next_page` href means there is nothing further to queue.
    enqueue($('.next_page').attr('href'));

    console.log(quotes);
  });
}).catch((err: unknown) => {
  // Report a failed crawler start-up (or a throw while subscribing) instead
  // of leaving the rejection unhandled.
  console.error('liveCrawler failed', err);
  process.exitCode = 1;
});