@waynechang65/ptt-crawler
Version:
A web crawler module designed to scrape data from Ptt.
211 lines (210 loc) • 9.48 kB
JavaScript
import puppeteer from 'puppeteer';
import os from 'os';
import { log as fmlog } from '@waynechang65/fml-consolelog';
import isInsideDocker from 'is-docker';
// Shared Puppeteer browser instance; assigned by _initialize() and cleared by _close().
let browser;
// The single page (tab) reused for every navigation; created in _initialize().
let page;
// Name of the board being scraped; set from options.board in _getResults().
let scrapingBoard = '';
// Number of index pages to scrape; set from options.pages in _getResults().
let scrapingPages = 1;
// Whether pinned ("bottom") posts are left out; set from options.skipPBs in _getResults().
let skipBottomPosts = true;
// Result of os.platform(), recorded by _initialize() to choose the launch options.
let this_os = '';
// Selector of the post-list container; waited for after each index-page navigation.
const stopSelector = '#main-container > div.r-list-container.action-bar-margin.bbs-screen';
// User-agent string applied to the page in _initialize().
const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36';
// Whether the body of every post is fetched as well; set from options.getContents in _getResults().
let getContents = false;
/**
 * Launches the shared Puppeteer browser and prepares the single page that the
 * crawler reuses. Calling it again while a browser is already running is a no-op.
 *
 * On Linux a headless system Chromium is launched with the sandbox disabled;
 * on every other platform Puppeteer's default browser is launched with a
 * visible window. Any key in `options` overrides these defaults.
 *
 * @param options Optional Puppeteer launch options merged over the defaults.
 * @throws Re-throws any launch or page-setup error. If page setup fails, the
 *         freshly launched browser is closed first so that no process leaks
 *         and a later call can retry from a clean state.
 */
async function _initialize(options) {
    if (browser) {
        return;
    }
    const insideDocker = isInsideDocker();
    const chromiumExecutablePath = insideDocker
        ? '/usr/bin/chromium' : '/usr/bin/chromium-browser';
    this_os = os.platform();
    fmlog('event_msg', ['PTT-CRAWLER', 'The OS is ' + this_os,
        insideDocker ? '[ Inside a container ]' : '[ Not inside a container ]']);
    const launchOptions = (this_os === 'linux') ?
        Object.assign({
            headless: 'new',
            executablePath: chromiumExecutablePath,
            args: ['--no-sandbox', '--disable-setuid-sandbox']
        }, options) :
        Object.assign({
            headless: false
        }, options);
    const launchedBrowser = await puppeteer.launch(launchOptions);
    try {
        /***** Create the page (tab) used for all scraping *****/
        const newPage = await launchedBrowser.newPage();
        await newPage.setDefaultNavigationTimeout(180000); // 3 mins
        // Await the user agent so it is in effect before the first navigation.
        await newPage.setUserAgent(userAgent);
        await newPage.setRequestInterception(true);
        // Images are not needed for scraping; aborting them saves bandwidth.
        newPage.on('request', request => {
            if (request.resourceType() === 'image')
                request.abort();
            else
                request.continue();
        });
        // Publish the module state only once everything is ready.
        browser = launchedBrowser;
        page = newPage;
    }
    catch (e) {
        // Best-effort cleanup; the original setup error is the one worth reporting.
        await launchedBrowser.close().catch(() => { });
        throw e;
    }
}
/**
 * Scrapes the newest index page(s) of a PTT board and returns the merged result.
 *
 * @param options Optional settings:
 *   - board: board name (defaults to 'Tos').
 *   - pages: number of index pages to scrape, newest first (defaults to 1;
 *            a negative value is treated as 1).
 *   - skipPBs: whether pinned ("bottom") posts are skipped. When it is left
 *              undefined the posts are skipped.
 *   - getContents: when truthy, the body of every post is fetched too and
 *                  returned in `contents`.
 *   The object passed in is not modified.
 * @returns The object produced by _mergePages ({ titles, urls, rates, authors,
 *          dates, marks }), plus `contents` when requested.
 * @throws Re-throws any navigation/scraping error after logging it.
 */
async function _getResults(options) {
    const data_pages = [];
    // Work on local values so the caller's options object is left untouched.
    const opts = options || {};
    const requestedPages = opts.pages || 1;
    scrapingBoard = opts.board || 'Tos';
    scrapingPages = (requestedPages < 0) ? 1 : requestedPages;
    skipBottomPosts = (opts.skipPBs === undefined) ? true : Boolean(opts.skipPBs);
    getContents = Boolean(opts.getContents);
    /***** Go to the board's newest index page and scrape it *****/
    const pttUrl = 'https://www.ptt.cc/bbs/' + scrapingBoard + '/index.html';
    try {
        if (!page) {
            throw new Error('[ptt-crawler] initialize() must be called before getResults()');
        }
        await page.goto(pttUrl);
        // Age-restricted boards show a confirmation page first.
        const over18Button = await page.$('.over18-button-container');
        if (over18Button) {
            await Promise.all([
                over18Button.click(),
                page.waitForNavigation({ waitUntil: 'domcontentloaded' })
            ]);
        }
        await page.waitForSelector(stopSelector, { timeout: 60000 });
        data_pages.push(await page.evaluate(_scrapingOnePage, skipBottomPosts));
        for (let i = 1; i < scrapingPages; i++) {
            /***** Locate the "previous page" link that leads to older posts *****/
            const prevPageUrl = await page.evaluate(() => {
                const buttonPrePage = document.querySelector('#action-bar-container > div > div.btn-group.btn-group-paging > a:nth-child(2)');
                if (!buttonPrePage || buttonPrePage.classList.contains('disabled')) {
                    return null;
                }
                return buttonPrePage.href || null;
            });
            // No older page is available: stop instead of scraping the same page again.
            if (!prevPageUrl) {
                break;
            }
            // Navigate explicitly. Clicking the link and then waiting for stopSelector
            // would resolve immediately, because the selector already exists on the
            // current page, and the next scrape would race the navigation.
            await page.goto(prevPageUrl);
            await page.waitForSelector(stopSelector, { timeout: 60000 });
            /***** Scrape the older page *****/
            data_pages.push(await page.evaluate(_scrapingOnePage, skipBottomPosts));
        }
        /***** Merge all pages into one object, newest post first *****/
        const retObj = await _mergePages(data_pages);
        /***** Fetch the body of every post *****/
        if (getContents) {
            retObj.contents = await _scrapingAllContents(retObj.urls);
        }
        return retObj;
    }
    catch (e) {
        console.log('[ptt-crawler] ERROR!---getResults', e);
        throw e;
    }
}
/**
 * Extracts the post list from the PTT index page that is currently loaded.
 *
 * This function is handed to page.evaluate(), so it runs inside the browser
 * page and must stay self-contained (no references to module-level names).
 *
 * Every post row is read as a unit, so the six returned arrays always have the
 * same length and the same index refers to the same post. Rows without a title
 * link (deleted posts) are skipped.
 *
 * @param skipBPosts When truthy (the default), rows that follow the list
 *                   separator, i.e. pinned posts, are left out.
 * @returns { aryTitle, aryHref, aryRate, aryAuthor, aryDate, aryMark } in page
 *          order (top to bottom).
 */
function _scrapingOnePage(skipBPosts = true /* leave out pinned posts */) {
    const aryTitle = [];
    const aryHref = [];
    const aryRate = [];
    const aryAuthor = [];
    const aryDate = [];
    const aryMark = [];
    const listSelector = '#main-container > div.r-list-container.action-bar-margin.bbs-screen';
    // A missing cell yields '' instead of throwing.
    const textOf = (node) => node?.innerText ?? '';
    /***** All post rows of the page *****/
    const rows = Array.from(document.querySelectorAll(listSelector + ' > div.r-ent'));
    /***** Pinned rows: every row that comes after div.r-list-sep *****/
    const pinnedRows = new Set(skipBPosts
        ? Array.from(document.querySelectorAll(listSelector + ' > div.r-list-sep ~ div.r-ent'))
        : []);
    for (const row of rows) {
        if (pinnedRows.has(row)) {
            continue;
        }
        // Deleted posts keep their row but have no title link.
        const link = row.querySelector(':scope > div.title > a');
        if (!link) {
            continue;
        }
        aryTitle.push(link.innerText);
        aryHref.push(link.href);
        aryAuthor.push(textOf(row.querySelector('div.meta div.author')));
        aryDate.push(textOf(row.querySelector('div.meta div.date')));
        aryMark.push(textOf(row.querySelector('div.meta div.mark')));
        aryRate.push(textOf(row.querySelector('div.nrec')));
    }
    return ({ aryTitle, aryHref, aryRate, aryAuthor, aryDate, aryMark });
}
/**
 * Flattens the per-page scrape results into one object of parallel arrays.
 *
 * Pages are taken in the order given; inside each page the entries are emitted
 * in reverse (last entry first). The number of entries taken from a page is
 * the length of its title array, and that same index is used for every other
 * array of the page. A field that is missing on a page is treated as an empty
 * array, so its entries come out as undefined.
 *
 * @param pages Array of objects shaped like the return value of _scrapingOnePage.
 * @returns Promise resolving to { titles, urls, rates, authors, dates, marks }.
 */
async function _mergePages(pages) {
    const titles = [];
    const urls = [];
    const rates = [];
    const authors = [];
    const dates = [];
    const marks = [];
    for (const onePage of pages) {
        const pageTitles = onePage.aryTitle ?? [];
        const pageUrls = onePage.aryHref ?? [];
        const pageRates = onePage.aryRate ?? [];
        const pageAuthors = onePage.aryAuthor ?? [];
        const pageDates = onePage.aryDate ?? [];
        const pageMarks = onePage.aryMark ?? [];
        // Walk from the bottom of the page up to the top.
        for (let idx = pageTitles.length - 1; idx >= 0; idx--) {
            titles.push(pageTitles[idx]);
            urls.push(pageUrls[idx]);
            rates.push(pageRates[idx]);
            authors.push(pageAuthors[idx]);
            dates.push(pageDates[idx]);
            marks.push(pageMarks[idx]);
        }
    }
    return { titles, urls, rates, authors, dates, marks };
}
/**
 * Visits every post URL in turn and collects the text of its '#main-content'
 * element.
 *
 * The returned array always has one entry per input URL, in the same order.
 * When a post cannot be loaded or read, the error is logged and an empty
 * string is stored for it, so a failed post never receives the content of the
 * previously loaded page and never aborts the remaining posts.
 *
 * @param aryHref Array of post URLs.
 * @returns Promise resolving to the array of post bodies.
 */
async function _scrapingAllContents(aryHref) {
    const aryContent = [];
    const contentSelector = '#main-content';
    for (let i = 0; i < aryHref.length; i++) {
        let content = '';
        try {
            await page.goto(aryHref[i]);
            await page.waitForSelector(contentSelector, { timeout: 60000 });
            // The selector is passed as an argument because the callback runs in the page.
            content = await page.evaluate((selector) => {
                const node = document.querySelector(selector);
                return node ? node.innerText : '';
            }, contentSelector);
        }
        catch (e) {
            console.log('<PTT> page.goto ERROR!---_scrapingAllContents', e);
        }
        aryContent.push(content);
    }
    return aryContent;
}
/**
 * Closes the shared browser, if one is running, and resets the module state.
 *
 * The browser and page references are cleared before the browser is closed, so
 * a failing close() cannot leave stale handles behind that would make a later
 * initialize() call return early.
 *
 * @throws Re-throws any error raised by browser.close().
 */
async function _close() {
    if (!browser) {
        return;
    }
    const closingBrowser = browser;
    browser = undefined;
    page = undefined;
    await closingBrowser.close();
}
export { _initialize as initialize, _getResults as getResults, _close as close };