@waynechang65/ptt-crawler
Version:
A web crawler module designed to scrape data from Ptt.
449 lines • 19.2 kB
JavaScript
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import os from 'os';
import { log as fmlog } from '@waynechang65/fml-consolelog';
import isInsideDocker from 'is-docker';
import * as fs from 'fs/promises';
import { retry } from '@lifeomic/attempt';
import hot from './hot.json' with { type: 'json' };
// Register the stealth plugin globally so launched pages are less likely to be
// detected (and blocked) as headless automation by the target site.
puppeteer.use(StealthPlugin());
/**
 * A class to crawl posts from a PTT board (https://www.ptt.cc).
 */
export class PttCrawler {
    /**
     * Creates an instance of PttCrawler.
     * @param {LaunchOptions} [options={}] - Puppeteer launch options (merged over the defaults chosen in init()).
     */
    constructor(options = {}) {
        this.options = options;
        // Selector whose presence confirms the board's post-list container has rendered.
        this.stopSelector = '#main-container > div.r-list-container.action-bar-margin.bbs-screen';
        this.puppteerTimeout = 5000; // 5s navigation / waitForSelector timeout
        this.pages = []; // pool of puppeteer pages, one per concurrent worker
        this.scrapingBoard = '';
        this.scrapingPages = 1;
        this.skipBottomPosts = true; // skip pinned (置底) posts by default
        this.this_os = '';
        this.getContents = false; // when true, crawl() also fetches each post's body text
        this.concurrency = 5;
        this.debug = {
            enable: false,
            saveResultToFiles: false,
            printRetryInfo: false,
            printWorkersInfo: false,
            printCrawlInfo: false,
        };
        this.retryOpt = {
            delay: 2000,
            maxAttempts: 10,
        };
    }
    /**
     * Initializes the crawler, launching a browser instance and a pool of pages.
     * This must be called before any other methods.
     * @param {{concurrency?: number, debug?: object, retry?: object}} [initOption]
     *        Partial options; omitted fields keep the constructor defaults.
     */
    async init(initOption = { concurrency: 5, debug: undefined, retry: undefined }) {
        if (this.browser)
            return;
        try {
            const insideDocker = isInsideDocker();
            // Common chromium paths: docker images install 'chromium', bare linux 'chromium-browser'.
            const chromiumExecutablePath = insideDocker ? '/usr/bin/chromium' : '/usr/bin/chromium-browser';
            this.this_os = os.platform();
            if (initOption.debug) {
                this.debug = { ...this.debug, ...initOption.debug };
            }
            if (initOption.retry) {
                this.retryOpt = { ...this.retryOpt, ...initOption.retry };
            }
            if (this.debug.enable && this.debug.printCrawlInfo) {
                fmlog('event_msg', [
                    'PTT-CRAWLER',
                    'The OS is ' + this.this_os,
                    insideDocker ? '[ Inside a container ]' : '[ Not inside a container ]',
                ]);
            }
            const defaultLaunchOpts = this.this_os === 'linux'
                ? {
                    headless: true,
                    executablePath: chromiumExecutablePath,
                    args: ['--no-sandbox', '--disable-setuid-sandbox'],
                }
                : {
                    headless: false,
                };
            this.browser = await puppeteer.launch(Object.assign(defaultLaunchOpts, this.options));
            // BUGFIX: only override concurrency when a valid number is supplied.
            // Previously `this.concurrency = initOption.concurrency` became `undefined`
            // for partial calls like init({ debug: {...} }), so the loop below created
            // zero pages and crawl() crashed on this.pages[0].
            if (typeof initOption.concurrency === 'number' && initOption.concurrency > 0) {
                this.concurrency = Math.floor(initOption.concurrency);
            }
            for (let i = 0; i < this.concurrency; i++) {
                const page = await this.browser.newPage();
                page.setDefaultNavigationTimeout(this.puppteerTimeout);
                // Block heavy resources to speed up crawling.
                await page.setRequestInterception(true);
                page.on('request', (req) => {
                    const blocked = ['image', 'font', 'media'];
                    if (blocked.includes(req.resourceType()))
                        req.abort();
                    else
                        req.continue();
                });
                this.pages.push({ p: page });
            }
        }
        catch (e) {
            if (this.debug.enable && this.debug.printCrawlInfo) {
                fmlog('error_msg', ['PTT-CRAWLER', 'init error', String(e)]);
            }
            throw e;
        }
    }
    /**
     * Starts the crawling process.
     * @param {CrawlerOptions} [options={}] - Options for the crawl.
     * @returns {Promise<MergedPages>} A promise that resolves to the crawled data.
     * @throws {Error} If init() has not been called, or navigation keeps failing after retries.
     */
    async crawl(options = {}) {
        if (!this.browser) {
            throw new Error('Crawler is not initialized. Please call init() first.');
        }
        const data_pages = [];
        const pages = typeof options.pages === 'number' && options.pages > 0 ? Math.floor(options.pages) : 1;
        const onProgress = options.onProgress;
        this.scrapingBoard = options.board || 'Tos';
        this.scrapingPages = pages;
        this.skipBottomPosts = typeof options.skipPBs === 'boolean' ? options.skipPBs : this.skipBottomPosts;
        this.getContents = typeof options.getContents === 'boolean' ? options.getContents : this.getContents;
        /***** 前往 ptt要爬的版面並爬取資料(最新頁面) *****/
        const page = this.pages[0].p;
        const pttUrl = 'https://www.ptt.cc/bbs/' + this.scrapingBoard + '/index.html';
        try {
            if (onProgress) {
                onProgress({
                    type: 'crawling_pages',
                    message: `Crawling page 1 of ${this.scrapingPages}...`,
                    current: 1,
                    total: this.scrapingPages,
                    percent: Math.round((1 / this.scrapingPages) * 100),
                });
            }
            await page.bringToFront();
            await retry(async (context) => {
                if (this.debug.enable && this.debug.printRetryInfo) {
                    fmlog('event_msg', [`RETRY`, `attemptNum: ${context.attemptNum}`, '']);
                }
                await page.goto(pttUrl, { waitUntil: 'domcontentloaded', timeout: this.puppteerTimeout });
                // Boards flagged 18+ show a confirmation page first; click through it.
                const over18Button = await page.$('.over18-button-container');
                if (over18Button) {
                    await Promise.all([
                        over18Button.click(),
                        page.waitForNavigation({
                            waitUntil: 'domcontentloaded',
                        }),
                    ]);
                }
                await page.waitForSelector(this.stopSelector, { timeout: this.puppteerTimeout });
            }, this.retryOpt);
            data_pages.push(await page.evaluate(this._scrapingOnePage, this.skipBottomPosts));
            for (let i = 1; i < this.scrapingPages; i++) {
                if (onProgress) {
                    onProgress({
                        type: 'crawling_pages',
                        message: `Crawling page ${i + 1} of ${this.scrapingPages}...`,
                        current: i + 1,
                        total: this.scrapingPages,
                        // BUGFIX: was (i + 1 / total) which computed i + (1/total) due to
                        // operator precedence, yielding percentages far above 100.
                        percent: Math.round(((i + 1) / this.scrapingPages) * 100),
                    });
                }
                /***** 點選 "上一頁" 到上一頁較舊的資料 *****/
                await page.evaluate(() => {
                    const buttonPrePage = document.querySelector('#action-bar-container > div > div.btn-group.btn-group-paging > a:nth-child(2)');
                    buttonPrePage?.click();
                });
                await retry(async (context) => {
                    if (this.debug.enable && this.debug.printRetryInfo) {
                        fmlog('event_msg', [`RETRY`, `attemptNum: ${context.attemptNum}`, '']);
                    }
                    await page.waitForSelector(this.stopSelector, { timeout: this.puppteerTimeout });
                }, this.retryOpt);
                /***** 抓取網頁資料 (上一頁) *****/
                data_pages.push(await page.evaluate(this._scrapingOnePage, this.skipBottomPosts));
            }
            /***** 將多頁資料 "照實際新舊順序" 合成 1 個物件 *****/
            const retObj = this._mergePages(data_pages);
            /***** 爬各帖內文 *****/
            if (this.getContents) {
                retObj.contents = await this._scrapingAllContents(retObj.urls, onProgress);
            }
            return retObj;
        }
        catch (e) {
            if (this.debug.enable && this.debug.printCrawlInfo) {
                fmlog('error_msg', ['PTT-CRAWLER', 'crawl error', String(e)]);
            }
            throw e;
        }
    }
    /**
     * Scrapes a single page of posts. This method is executed in the browser context
     * (via page.evaluate), so it must stay self-contained and must not touch `this`.
     * It robustly parses each post as a unit (.r-ent).
     * If skipBPosts is true, it stops collecting when it encounters the separator for pinned posts.
     * @private
     * @param {boolean} [skipBPosts=true] - Whether to skip bottom pinned posts.
     * @returns {CrawlerOnePage} The scraped data from one page.
     */
    _scrapingOnePage(skipBPosts = true /* 濾掉置底文 */) {
        const aryTitle = [];
        const aryHref = [];
        const aryRate = [];
        const aryAuthor = [];
        const aryDate = [];
        const aryMark = [];
        const container = document.querySelector('#main-container > div.r-list-container.action-bar-margin.bbs-screen');
        if (!container) {
            return { aryTitle, aryHref, aryRate, aryAuthor, aryDate, aryMark };
        }
        const children = Array.from(container.children);
        for (const child of children) {
            if (child.classList.contains('r-list-sep')) {
                if (skipBPosts) {
                    // Found separator; stop collecting further .r-ent (these are 置底文)
                    break;
                }
                else {
                    // If not skipping bottom posts, continue to collect subsequent .r-ent as well
                    continue;
                }
            }
            if (!child.classList.contains('r-ent')) {
                continue;
            }
            // child is .r-ent
            const ent = child;
            const titleEl = ent.querySelector('div.title > a');
            if (!titleEl) {
                // deleted post or no link; push placeholders to keep alignment if desired
                // We'll skip deleted posts to keep arrays consistent with visible posts.
                continue;
            }
            const title = titleEl.innerText.trim();
            const href = titleEl.href;
            const rateEl = ent.querySelector('div.nrec');
            const rate = rateEl ? rateEl.innerText.trim() : '';
            const authorEl = ent.querySelector('div.meta div.author');
            const author = authorEl ? authorEl.innerText.trim() : '';
            const dateEl = ent.querySelector('div.meta div.date');
            const date = dateEl ? dateEl.innerText.trim() : '';
            const markEl = ent.querySelector('div.meta div.mark');
            const mark = markEl ? markEl.innerText.trim() : '';
            aryTitle.push(title);
            aryHref.push(href);
            aryRate.push(rate);
            aryAuthor.push(author);
            aryDate.push(date);
            aryMark.push(mark);
        }
        return { aryTitle, aryHref, aryRate, aryAuthor, aryDate, aryMark };
    }
    /**
     * Merges data from multiple pages, ensuring the correct chronological order (newest first).
     * Each page's arrays are pushed in reversed order because PTT lists posts oldest-first
     * within a page while crawl() visits pages newest-first.
     * @private
     * @param {CrawlerOnePage[]} pages - An array of scraped page data.
     * @returns {MergedPages} The merged data.
     */
    _mergePages(pages) {
        const aryAllPagesTitle = [];
        const aryAllPagesUrl = [];
        const aryAllPagesRate = [];
        const aryAllPagesAuthor = [];
        const aryAllPagesDate = [];
        const aryAllPagesMark = [];
        for (let i = 0; i < pages.length; i++) {
            const page = pages[i];
            const titles = page.aryTitle ?? [];
            // push items in reversed order (to keep overall newest -> oldest)
            for (let j = titles.length - 1; j >= 0; j--) {
                aryAllPagesTitle.push(page.aryTitle ? page.aryTitle[j] : '');
                aryAllPagesUrl.push(page.aryHref ? page.aryHref[j] : '');
                aryAllPagesRate.push(page.aryRate ? page.aryRate[j] : '');
                aryAllPagesAuthor.push(page.aryAuthor ? page.aryAuthor[j] : '');
                aryAllPagesDate.push(page.aryDate ? page.aryDate[j] : '');
                aryAllPagesMark.push(page.aryMark ? page.aryMark[j] : '');
            }
        }
        const titles = aryAllPagesTitle;
        const urls = aryAllPagesUrl;
        const rates = aryAllPagesRate;
        const authors = aryAllPagesAuthor;
        const dates = aryAllPagesDate;
        const marks = aryAllPagesMark;
        return { titles, urls, rates, authors, dates, marks };
    }
    /**
     * Scrapes the content of all posts concurrently.
     * Uses the page pool created in init(); each worker pops URLs off a shared stack.
     * @private
     * @param {string[]} aryHref - An array of post URLs.
     * @param {function} [onProgress] - Optional progress callback.
     * @returns {Promise<string[]>} A promise that resolves to an array of post contents,
     *          index-aligned with aryHref; failed fetches contain an error message string.
     */
    async _scrapingAllContents(aryHref, onProgress) {
        if (!this.browser) {
            throw new Error('Crawler is not initialized. Please call init() first.');
        }
        const results = new Array(aryHref.length).fill('');
        const total = aryHref.length;
        // [index, url] tuples so each worker can write its result to the right slot.
        const aryTuppleHref = [...aryHref.entries()];
        const worker = async (page, stackHref, idxPage) => {
            let aHref;
            while ((aHref = stackHref.pop()) !== undefined) {
                if (onProgress) {
                    const completedCount = total - stackHref.length;
                    onProgress({
                        type: 'fetching_contents',
                        message: `Fetching content ${completedCount} of ${total}...`,
                        current: completedCount,
                        total: total,
                        percent: Math.round((completedCount / total) * 100),
                    });
                }
                const idx = aHref[0];
                const url = aHref[1];
                try {
                    await page.bringToFront();
                    await retry(async (context) => {
                        if (this.debug.enable && this.debug.printRetryInfo) {
                            fmlog('event_msg', [`RETRY`, `attemptNum: ${context.attemptNum}`, '']);
                        }
                        await page.goto(url, {
                            waitUntil: 'domcontentloaded',
                            timeout: this.puppteerTimeout,
                        });
                    }, this.retryOpt);
                    const content = await page.evaluate(() => {
                        const contentSelector = '#main-content';
                        const el = document.querySelector(contentSelector);
                        if (!el)
                            return '';
                        return (el.innerText || '').trim();
                    });
                    results[idx] = content;
                    if (this.debug.enable && this.debug.printWorkersInfo) {
                        fmlog('event_msg', [`WORKER-${idxPage}`, `idx: ${idx}`, content.split('\n')[0]]);
                    }
                }
                catch (e) {
                    if (this.debug.enable && this.debug.printCrawlInfo) {
                        fmlog('error_msg', [
                            'PTT-CRAWLER',
                            `_scrapingAllContents error for ${url}`,
                            String(e),
                        ]);
                    }
                    // 這裏沒有 throw e,主要是思考到如果中間有一個網頁出狀況,整個抓的結果就白費了
                    // 因此,真的有特定網頁抓不到,就塞個 Error fetching訊息,讓要用的人自己判斷與斟酌是否重抓
                    results[idx] = `Error fetching content for ${url}: ${String(e)}`;
                    // throw e;
                }
            }
        };
        const workers = [];
        for (const [idx, page] of this.pages.entries()) {
            workers.push(worker(page.p, aryTuppleHref, idx));
        }
        await Promise.all(workers);
        if (this.debug.enable && this.debug.saveResultToFiles) {
            this._saveObjToFile(results, `results-${total}-${this.scrapingBoard}.json`);
        }
        return results;
    }
    /**
     * Saves an object to a file as a beautified JSON string.
     * Failures are logged, not thrown (best-effort debug output).
     * @private
     * @param {object} obj - The object to be saved.
     * @param {string} fileWithPath - The full path and filename for the output file.
     * @returns {Promise<void>} A promise that resolves once the file is written.
     */
    async _saveObjToFile(obj, fileWithPath) {
        try {
            const jsonString = JSON.stringify(obj, null, 2);
            await fs.writeFile(fileWithPath, jsonString, 'utf-8');
            console.log(`Save the file successfully: ${fileWithPath}`);
        }
        catch (e) {
            console.error('Fail to save: ', e);
        }
    }
    /**
     * Closes the browser instance. Safe to call multiple times.
     */
    async close() {
        if (this.browser) {
            await this.browser.close();
            this.browser = undefined;
        }
    }
    /**
     * Transforms the crawled data from a struct of arrays to an array of post objects.
     * @param {MergedPages} results The MergedPages object from the crawl() method.
     * @returns {Post[]} An array of Post objects.
     */
    resultsToObjects(results) {
        const posts = [];
        const postCount = results.titles.length;
        if (postCount === 0)
            return [];
        for (let i = 0; i < postCount; i++) {
            const post = {
                title: results.titles[i],
                url: results.urls[i],
                rate: results.rates[i],
                author: results.authors[i],
                date: results.dates[i],
                mark: results.marks[i],
            };
            // 如果有內文,也一併加入
            if (results.contents && results.contents[i]) {
                post.content = results.contents[i];
            }
            posts.push(post);
        }
        if (this.debug.enable && this.debug.saveResultToFiles) {
            this._saveObjToFile(posts, `results-${posts.length}-resultToObjects.json`);
        }
        return posts;
    }
    /**
     * Get hot boards of Ptt. (Here is a local json file which
     * may become outdated and will need to be updated manually from time to time.)
     * @returns {object[]} The hot-board records loaded from hot.json.
     */
    getHotBoards() {
        return hot;
    }
}
// Module-level singleton backing the deprecated functional API below.
let _ptt_crawler = undefined;
/**
 * @deprecated The function is deprecated, use PttCrawler class instead
 */
async function _initialize(options = {}) {
    if (_ptt_crawler !== undefined) {
        return;
    }
    _ptt_crawler = new PttCrawler(options);
    await _ptt_crawler.init();
}
/**
 * @deprecated The function is deprecated, use PttCrawler class instead
 */
async function _getResults(options = {}) {
    if (_ptt_crawler === undefined) {
        throw new Error('Crawler is not initialized. Please call init() first.');
    }
    return _ptt_crawler.crawl(options);
}
/**
 * @deprecated The function is deprecated, use PttCrawler class instead
 */
async function _close() {
    if (_ptt_crawler !== undefined) {
        await _ptt_crawler.close();
        _ptt_crawler = undefined;
    }
}
export { _initialize as initialize, _getResults as getResults, _close as close };
//# sourceMappingURL=ptt_crawler.js.map