UNPKG

@waynechang65/ptt-crawler

Version:

A web crawler module designed to scrape data from Ptt.

449 lines 19.2 kB
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import os from 'os';
import { log as fmlog } from '@waynechang65/fml-consolelog';
import isInsideDocker from 'is-docker';
import * as fs from 'fs/promises';
import { retry } from '@lifeomic/attempt';
import hot from './hot.json' with { type: 'json' };

puppeteer.use(StealthPlugin());

/**
 * A class to crawl posts from a PTT board.
 */
export class PttCrawler {
    /**
     * Creates an instance of PttCrawler.
     * @param {LaunchOptions} [options={}] - Puppeteer launch options; merged over
     *   the OS-dependent defaults chosen in init().
     */
    constructor(options = {}) {
        this.options = options;
        // Selector that signals a board index page has finished rendering.
        this.stopSelector = '#main-container > div.r-list-container.action-bar-margin.bbs-screen';
        this.puppteerTimeout = 5000; // 5s — navigation / waitForSelector timeout
        this.pages = []; // pool of { p: Page } used as concurrent workers
        this.scrapingBoard = '';
        this.scrapingPages = 1;
        this.skipBottomPosts = true; // skip pinned (置底) posts by default
        this.this_os = '';
        this.getContents = false;
        this.concurrency = 5;
        this.debug = {
            enable: false,
            saveResultToFiles: false,
            printRetryInfo: false,
            printWorkersInfo: false,
            printCrawlInfo: false,
        };
        this.retryOpt = {
            delay: 2000,
            maxAttempts: 10,
        };
    }

    /**
     * Initializes the crawler, launching a browser instance and a pool of pages.
     * This must be called before any other methods. Calling it twice is a no-op.
     * @param {object} [initOption] - { concurrency, debug, retry } overrides.
     * @throws Rethrows any launch/page-setup error after optional debug logging.
     */
    async init(initOption = { concurrency: 5, debug: undefined, retry: undefined }) {
        if (this.browser) return;
        try {
            const insideDocker = isInsideDocker();
            const chromiumExecutablePath = insideDocker
                ? '/usr/bin/chromium'
                : '/usr/bin/chromium-browser';
            this.this_os = os.platform();
            if (initOption.debug) {
                this.debug = { ...this.debug, ...initOption.debug };
            }
            if (initOption.retry) {
                this.retryOpt = { ...this.retryOpt, ...initOption.retry };
            }
            if (this.debug.enable && this.debug.printCrawlInfo) {
                fmlog('event_msg', [
                    'PTT-CRAWLER',
                    'The OS is ' + this.this_os,
                    insideDocker ? '[ Inside a container ]' : '[ Not inside a container ]',
                ]);
            }
            const defaultLaunchOpts =
                this.this_os === 'linux'
                    ? {
                          headless: true,
                          executablePath: chromiumExecutablePath,
                          args: ['--no-sandbox', '--disable-setuid-sandbox'],
                      }
                    : {
                          headless: false,
                      };
            this.browser = await puppeteer.launch(Object.assign(defaultLaunchOpts, this.options));
            // BUGFIX: fall back to the current default when `concurrency` is not
            // supplied (e.g. init({ debug: {...} })). The parameter default only
            // applies when `initOption` itself is omitted, so this used to become
            // undefined, the loop below never ran, and crawl() crashed on
            // this.pages[0].p.
            this.concurrency = initOption.concurrency ?? this.concurrency;
            for (let i = 0; i < this.concurrency; i++) {
                const page = await this.browser.newPage();
                page.setDefaultNavigationTimeout(this.puppteerTimeout);
                // Block heavy resource types to speed up crawling.
                await page.setRequestInterception(true);
                page.on('request', (req) => {
                    const blocked = ['image', 'font', 'media'];
                    if (blocked.includes(req.resourceType())) req.abort();
                    else req.continue();
                });
                this.pages.push({ p: page });
            }
        } catch (e) {
            if (this.debug.enable && this.debug.printCrawlInfo) {
                fmlog('error_msg', ['PTT-CRAWLER', 'init error', String(e)]);
            }
            throw e;
        }
    }

    /**
     * Starts the crawling process.
     * @param {CrawlerOptions} [options={}] - Options for the crawl
     *   (board, pages, skipPBs, getContents, onProgress).
     * @returns {Promise<MergedPages>} A promise that resolves to the crawled data.
     * @throws If init() has not been called, or navigation keeps failing after retries.
     */
    async crawl(options = {}) {
        if (!this.browser) {
            throw new Error('Crawler is not initialized. Please call init() first.');
        }
        const data_pages = [];
        const pages =
            typeof options.pages === 'number' && options.pages > 0
                ? Math.floor(options.pages)
                : 1;
        const onProgress = options.onProgress;
        this.scrapingBoard = options.board || 'Tos';
        this.scrapingPages = pages;
        this.skipBottomPosts =
            typeof options.skipPBs === 'boolean' ? options.skipPBs : this.skipBottomPosts;
        this.getContents =
            typeof options.getContents === 'boolean' ? options.getContents : this.getContents;

        /***** Navigate to the target board and scrape the newest index page *****/
        const page = this.pages[0].p;
        const pttUrl = 'https://www.ptt.cc/bbs/' + this.scrapingBoard + '/index.html';
        try {
            if (onProgress) {
                onProgress({
                    type: 'crawling_pages',
                    message: `Crawling page 1 of ${this.scrapingPages}...`,
                    current: 1,
                    total: this.scrapingPages,
                    percent: Math.round((1 / this.scrapingPages) * 100),
                });
            }
            await page.bringToFront();
            await retry(async (context) => {
                if (this.debug.enable && this.debug.printRetryInfo) {
                    fmlog('event_msg', [`RETRY`, `attemptNum: ${context.attemptNum}`, '']);
                }
                await page.goto(pttUrl, {
                    waitUntil: 'domcontentloaded',
                    timeout: this.puppteerTimeout,
                });
                // Click through the age-verification wall when present.
                const over18Button = await page.$('.over18-button-container');
                if (over18Button) {
                    await Promise.all([
                        over18Button.click(),
                        page.waitForNavigation({
                            waitUntil: 'domcontentloaded',
                        }),
                    ]);
                }
                await page.waitForSelector(this.stopSelector, { timeout: this.puppteerTimeout });
            }, this.retryOpt);
            data_pages.push(await page.evaluate(this._scrapingOnePage, this.skipBottomPosts));
            for (let i = 1; i < this.scrapingPages; i++) {
                if (onProgress) {
                    onProgress({
                        type: 'crawling_pages',
                        message: `Crawling page ${i + 1} of ${this.scrapingPages}...`,
                        current: i + 1,
                        total: this.scrapingPages,
                        // BUGFIX: was (i + 1 / this.scrapingPages) * 100, which
                        // computed i + (1/pages) and reported percentages > 100.
                        percent: Math.round(((i + 1) / this.scrapingPages) * 100),
                    });
                }
                /***** Click "previous page" to reach the next-older index page *****/
                await page.evaluate(() => {
                    const buttonPrePage = document.querySelector(
                        '#action-bar-container > div > div.btn-group.btn-group-paging > a:nth-child(2)'
                    );
                    buttonPrePage?.click();
                });
                await retry(async (context) => {
                    if (this.debug.enable && this.debug.printRetryInfo) {
                        fmlog('event_msg', [`RETRY`, `attemptNum: ${context.attemptNum}`, '']);
                    }
                    await page.waitForSelector(this.stopSelector, {
                        timeout: this.puppteerTimeout,
                    });
                }, this.retryOpt);
                /***** Scrape the (older) page just reached *****/
                data_pages.push(await page.evaluate(this._scrapingOnePage, this.skipBottomPosts));
            }
            /***** Merge all pages into one object, newest post first *****/
            const retObj = this._mergePages(data_pages);
            /***** Optionally fetch the full content of every post *****/
            if (this.getContents) {
                retObj.contents = await this._scrapingAllContents(retObj.urls, onProgress);
            }
            return retObj;
        } catch (e) {
            if (this.debug.enable && this.debug.printCrawlInfo) {
                fmlog('error_msg', ['PTT-CRAWLER', 'crawl error', String(e)]);
            }
            throw e;
        }
    }

    /**
     * Scrapes a single page of posts. This method is executed in the browser
     * context (via page.evaluate), so it must not reference any outer scope.
     * It robustly parses each post as a unit (.r-ent).
     * If skipBPosts is true, it stops collecting at the separator that marks
     * the pinned-post section (置底文).
     * @private
     * @param {boolean} [skipBPosts=true] - Whether to skip bottom pinned posts.
     * @returns {CrawlerOnePage} The scraped data from one page.
     */
    _scrapingOnePage(skipBPosts = true /* skip pinned posts */) {
        const aryTitle = [];
        const aryHref = [];
        const aryRate = [];
        const aryAuthor = [];
        const aryDate = [];
        const aryMark = [];
        const container = document.querySelector(
            '#main-container > div.r-list-container.action-bar-margin.bbs-screen'
        );
        if (!container) {
            return { aryTitle, aryHref, aryRate, aryAuthor, aryDate, aryMark };
        }
        const children = Array.from(container.children);
        for (const child of children) {
            if (child.classList.contains('r-list-sep')) {
                if (skipBPosts) {
                    // Found separator; stop collecting further .r-ent (pinned posts follow).
                    break;
                } else {
                    // Not skipping pinned posts: keep collecting subsequent .r-ent too.
                    continue;
                }
            }
            if (!child.classList.contains('r-ent')) {
                continue;
            }
            // child is .r-ent
            const ent = child;
            const titleEl = ent.querySelector('div.title > a');
            if (!titleEl) {
                // Deleted post or no link; skip it to keep all arrays aligned
                // with the posts that are actually visible.
                continue;
            }
            const title = titleEl.innerText.trim();
            const href = titleEl.href;
            const rateEl = ent.querySelector('div.nrec');
            const rate = rateEl ? rateEl.innerText.trim() : '';
            const authorEl = ent.querySelector('div.meta div.author');
            const author = authorEl ? authorEl.innerText.trim() : '';
            const dateEl = ent.querySelector('div.meta div.date');
            const date = dateEl ? dateEl.innerText.trim() : '';
            const markEl = ent.querySelector('div.meta div.mark');
            const mark = markEl ? markEl.innerText.trim() : '';
            aryTitle.push(title);
            aryHref.push(href);
            aryRate.push(rate);
            aryAuthor.push(author);
            aryDate.push(date);
            aryMark.push(mark);
        }
        return { aryTitle, aryHref, aryRate, aryAuthor, aryDate, aryMark };
    }

    /**
     * Merges data from multiple pages, ensuring the correct chronological order
     * (newest first): within each scraped page the posts are reversed before
     * being appended.
     * @private
     * @param {CrawlerOnePage[]} pages - An array of scraped page data.
     * @returns {MergedPages} The merged data.
     */
    _mergePages(pages) {
        const aryAllPagesTitle = [];
        const aryAllPagesUrl = [];
        const aryAllPagesRate = [];
        const aryAllPagesAuthor = [];
        const aryAllPagesDate = [];
        const aryAllPagesMark = [];
        for (let i = 0; i < pages.length; i++) {
            const page = pages[i];
            const titles = page.aryTitle ?? [];
            // Push items in reversed order (to keep overall newest -> oldest).
            for (let j = titles.length - 1; j >= 0; j--) {
                aryAllPagesTitle.push(page.aryTitle ? page.aryTitle[j] : '');
                aryAllPagesUrl.push(page.aryHref ? page.aryHref[j] : '');
                aryAllPagesRate.push(page.aryRate ? page.aryRate[j] : '');
                aryAllPagesAuthor.push(page.aryAuthor ? page.aryAuthor[j] : '');
                aryAllPagesDate.push(page.aryDate ? page.aryDate[j] : '');
                aryAllPagesMark.push(page.aryMark ? page.aryMark[j] : '');
            }
        }
        const titles = aryAllPagesTitle;
        const urls = aryAllPagesUrl;
        const rates = aryAllPagesRate;
        const authors = aryAllPagesAuthor;
        const dates = aryAllPagesDate;
        const marks = aryAllPagesMark;
        return { titles, urls, rates, authors, dates, marks };
    }

    /**
     * Scrapes the content of all posts concurrently.
     * Uses the pool of pages created in init() as workers pulling URLs from a
     * shared stack; results are written back by original index so order is kept.
     * @private
     * @param {string[]} aryHref - An array of post URLs.
     * @param {Function} [onProgress] - Optional progress callback.
     * @returns {Promise<string[]>} A promise that resolves to an array of post contents.
     */
    async _scrapingAllContents(aryHref, onProgress) {
        if (!this.browser) {
            throw new Error('Crawler is not initialized. Please call init() first.');
        }
        const results = new Array(aryHref.length).fill('');
        const total = aryHref.length;
        const aryTuppleHref = [...aryHref.entries()];
        const worker = async (page, stackHref, idxPage) => {
            let aHref;
            while ((aHref = stackHref.pop()) !== undefined) {
                if (onProgress) {
                    const completedCount = total - stackHref.length;
                    onProgress({
                        type: 'fetching_contents',
                        message: `Fetching content ${completedCount} of ${total}...`,
                        current: completedCount,
                        total: total,
                        percent: Math.round((completedCount / total) * 100),
                    });
                }
                const idx = aHref[0];
                const url = aHref[1];
                try {
                    await page.bringToFront();
                    await retry(async (context) => {
                        if (this.debug.enable && this.debug.printRetryInfo) {
                            fmlog('event_msg', [
                                `RETRY`,
                                `attemptNum: ${context.attemptNum}`,
                                '',
                            ]);
                        }
                        await page.goto(url, {
                            waitUntil: 'domcontentloaded',
                            timeout: this.puppteerTimeout,
                        });
                    }, this.retryOpt);
                    const content = await page.evaluate(() => {
                        const contentSelector = '#main-content';
                        const el = document.querySelector(contentSelector);
                        if (!el) return '';
                        return (el.innerText || '').trim();
                    });
                    results[idx] = content;
                    if (this.debug.enable && this.debug.printWorkersInfo) {
                        fmlog('event_msg', [
                            `WORKER-${idxPage}`,
                            `idx: ${idx}`,
                            content.split('\n')[0],
                        ]);
                    }
                } catch (e) {
                    if (this.debug.enable && this.debug.printCrawlInfo) {
                        fmlog('error_msg', [
                            'PTT-CRAWLER',
                            `_scrapingAllContents error for ${url}`,
                            String(e),
                        ]);
                    }
                    // Deliberately NOT rethrowing: one failing page should not
                    // discard the whole batch. Store an error marker instead so
                    // the caller can decide whether to re-fetch that post.
                    results[idx] = `Error fetching content for ${url}: ${String(e)}`;
                }
            }
        };
        const workers = [];
        for (const [idx, page] of this.pages.entries()) {
            workers.push(worker(page.p, aryTuppleHref, idx));
        }
        await Promise.all(workers);
        if (this.debug.enable && this.debug.saveResultToFiles) {
            await this._saveObjToFile(results, `results-${total}-${this.scrapingBoard}.json`);
        }
        return results;
    }

    /**
     * Saves an object to a file as a beautified JSON string.
     * Errors are logged, not thrown (best-effort debug helper).
     * @private
     * @param {object} obj - The object to be saved.
     * @param {string} fileWithPath - The full path and filename for the output file.
     * @returns {Promise<void>} A promise that resolves once the file is written.
     */
    async _saveObjToFile(obj, fileWithPath) {
        try {
            const jsonString = JSON.stringify(obj, null, 2);
            await fs.writeFile(fileWithPath, jsonString, 'utf-8');
            console.log(`Save the file successfully: ${fileWithPath}`);
        } catch (e) {
            console.error('Fail to save: ', e);
        }
    }

    /**
     * Closes the browser instance. Safe to call multiple times.
     */
    async close() {
        if (this.browser) {
            await this.browser.close();
            this.browser = undefined;
        }
    }

    /**
     * Transforms the crawled data from a struct of arrays to an array of post objects.
     * @param {MergedPages} results The MergedPages object from the crawl() method.
     * @returns {Post[]} An array of Post objects.
     */
    resultsToObjects(results) {
        const posts = [];
        const postCount = results.titles.length;
        if (postCount === 0) return [];
        for (let i = 0; i < postCount; i++) {
            const post = {
                title: results.titles[i],
                url: results.urls[i],
                rate: results.rates[i],
                author: results.authors[i],
                date: results.dates[i],
                mark: results.marks[i],
            };
            // Attach the post body when contents were crawled as well.
            if (results.contents && results.contents[i]) {
                post.content = results.contents[i];
            }
            posts.push(post);
        }
        if (this.debug.enable && this.debug.saveResultToFiles) {
            // Fire-and-forget: this method is synchronous and the helper logs
            // its own failures, so the promise is intentionally not awaited.
            void this._saveObjToFile(posts, `results-${posts.length}-resultToObjects.json`);
        }
        return posts;
    }

    /**
     * Get hot boards of Ptt. (Backed by a bundled local json file which
     * may become outdated and will need to be updated manually from time to time.)
     * @returns {Post[]} An array of Post objects.
     */
    getHotBoards() {
        return hot;
    }
}

let _ptt_crawler = undefined;

/**
 * @deprecated The function is deprecated, use PttCrawler class instead
 */
const _initialize = async (options = {}) => {
    if (_ptt_crawler) return;
    _ptt_crawler = new PttCrawler(options);
    await _ptt_crawler.init();
};

/**
 * @deprecated The function is deprecated, use PttCrawler class instead
 */
const _getResults = async (options = {}) => {
    if (!_ptt_crawler) {
        throw new Error('Crawler is not initialized. Please call init() first.');
    }
    return await _ptt_crawler.crawl(options);
};

/**
 * @deprecated The function is deprecated, use PttCrawler class instead
 */
const _close = async () => {
    if (_ptt_crawler) {
        await _ptt_crawler.close();
        _ptt_crawler = undefined;
    }
};

export { _initialize as initialize, _getResults as getResults, _close as close };
//# sourceMappingURL=ptt_crawler.js.map