UNPKG

@waynechang65/ptt-crawler

Version:

A web crawler module designed to scrape data from PTT.

203 lines (202 loc) 6.82 kB
import { type LaunchOptions } from 'puppeteer';
import { AttemptOptions } from '@lifeomic/attempt';

/**
 * Debug settings for the PTT crawler.
 */
export interface DebugOptions {
    /** Whether to enable debug mode. */
    enable?: boolean;
    /** Whether to save the result to files. */
    saveResultToFiles?: boolean;
    /** Whether to print retry information. */
    printRetryInfo?: boolean;
    /** Whether to print workers information. */
    printWorkersInfo?: boolean;
    /** Whether to print crawl information. */
    printCrawlInfo?: boolean;
}

/**
 * Options for initializing the PTT crawler (passed to `PttCrawler.init`).
 */
export interface InitOptions {
    /** The number of concurrent requests. */
    concurrency?: number;
    /** The debug options. */
    debug?: DebugOptions;
    /** The retry options, in the `AttemptOptions` shape from `@lifeomic/attempt`. */
    retry?: AttemptOptions<void>;
}

/**
 * Options for a single crawl (passed to `PttCrawler.crawl`).
 */
export interface CrawlerOptions {
    /** The number of pages to crawl. */
    pages?: number;
    /** The name of the board to crawl. */
    board?: string;
    /** Whether to skip pinned posts (the posts pinned to the bottom of a board). */
    skipPBs?: undefined | boolean;
    /** Whether to fetch the content of each post. */
    getContents?: undefined | boolean;
    /** A callback function to receive progress updates. */
    onProgress?: (progress: Progress) => void;
}

/**
 * Represents the progress of the crawler, as delivered to `CrawlerOptions.onProgress`.
 */
export interface Progress {
    /** The type of the current operation. */
    type: 'crawling_pages' | 'fetching_contents';
    /** A human-readable message describing the current status. */
    message: string;
    /** The number of items completed so far. */
    current: number;
    /** The total number of items to process. */
    total: number;
    /** The completion percentage (0-100). */
    percent: number;
}

/**
 * Represents the merged data from multiple crawled pages, stored as a
 * struct of arrays (one array per post field).
 */
export interface MergedPages {
    /** Array of post titles. */
    titles: string[];
    /** Array of post URLs. */
    urls: string[];
    /** Array of post recommendation counts (rates). */
    rates: string[];
    /** Array of post authors. */
    authors: string[];
    /** Array of post dates. */
    dates: string[];
    /** Array of post marks (e.g., 'M', 'S'). */
    marks: string[];
    /** Array of post contents. Only available if `getContents` is true. */
    contents?: string[];
}

/**
 * Represents a single PTT post.
 */
export interface Post {
    /** The title of the post. */
    title: string;
    /** The URL of the post. */
    url: string;
    /** The recommendation count (push count), kept as a string. */
    rate: string;
    /** The author of the post. */
    author: string;
    /** The date of the post. */
    date: string;
    /** The mark of the post (e.g., 'M', 'S'). */
    mark: string;
    /** The full content of the post, including comments. Only available if `getContents` is true. */
    content?: string;
}

/**
 * Represents a hot board on PTT.
 */
export interface HotBoard {
    /** The name of the board (e.g., 'Gossiping'). */
    name: string;
    /** The classification of the board (e.g., '綜合' (general), '學術' (academic)). */
    class: string;
    /** The title of the board (e.g., '[八卦]' (gossip), '[股票]' (stocks)). */
    title: string;
}

/**
 * A class to crawl posts from a PTT board.
 */
export declare class PttCrawler {
    // Private state. The declaration output omits the types of private
    // members, so refer to the implementation for their exact shapes.
    private options;
    private readonly stopSelector;
    private readonly puppteerTimeout;
    private browser;
    private pages;
    private scrapingBoard;
    private scrapingPages;
    private skipBottomPosts;
    private this_os;
    private getContents;
    private concurrency;
    private debug;
    private retryOpt;
    /**
     * Creates an instance of PttCrawler.
     * @param {LaunchOptions} [options={}] - Puppeteer launch options.
     */
    constructor(options?: LaunchOptions);
    /**
     * Initializes the crawler, launching a browser instance.
     * This must be called before any other methods.
     * @param {InitOptions} [initOption] - Concurrency, debug, and retry settings.
     * @returns {Promise<void>} A promise that resolves once initialization is done.
     */
    init(initOption?: InitOptions): Promise<void>;
    /**
     * Starts the crawling process.
     * @param {CrawlerOptions} [options={}] - Options for the crawl.
     * @returns {Promise<MergedPages>} A promise that resolves to the crawled data.
     */
    crawl(options?: CrawlerOptions): Promise<MergedPages>;
    /**
     * Scrapes a single page of posts. This method is executed in the browser context.
     * It robustly parses each post as a unit (.r-ent).
     * If skipBPosts is true, it stops collecting when it encounters the separator for pinned posts.
     * @private
     * @param {boolean} [skipBPosts=true] - Whether to skip bottom pinned posts.
     * @returns {CrawlerOnePage} The scraped data from one page.
     */
    private _scrapingOnePage;
    /**
     * Merges data from multiple pages, ensuring the correct chronological order (newest first).
     * @private
     * @param {CrawlerOnePage[]} pages - An array of scraped page data.
     * @returns {MergedPages} The merged data.
     */
    private _mergePages;
    /**
     * Scrapes the content of all posts concurrently.
     * Uses multiple pages for speed and blocks unnecessary resources on each page.
     * @private
     * @param {string[]} aryHref - An array of post URLs.
     * @returns {Promise<string[]>} A promise that resolves to an array of post contents.
     */
    private _scrapingAllContents;
    /**
     * Saves an object to a file as a beautified JSON string.
     * @private
     * @param {object} obj - The object to be saved.
     * @param {string} fileWithPath - The full path and filename for the output file.
     * @returns {Promise<void>} A promise that resolves once the file is written, or rejects on error.
     */
    private _saveObjToFile;
    /**
     * Closes the browser instance.
     * @returns {Promise<void>} A promise that resolves once the browser is closed.
     */
    close(): Promise<void>;
    /**
     * Transforms the crawled data from a struct of arrays to an array of post objects.
     * @param {MergedPages} results The MergedPages object from the crawl() method.
     * @returns {Post[]} An array of Post objects.
     */
    resultsToObjects(results: MergedPages): Post[];
    /**
     * Gets the hot boards of PTT. (The data comes from a local JSON file, which
     * may become outdated and will need to be updated manually from time to time.)
     * @returns {HotBoard[]} An array of HotBoard objects.
     */
    getHotBoards(): HotBoard[];
}

/**
 * Legacy function-style entry point, exported as `initialize`.
 * @deprecated This function is deprecated; use the PttCrawler class instead.
 */
declare const _initialize: (options?: LaunchOptions) => Promise<void>;
/**
 * Legacy function-style entry point, exported as `getResults`.
 * @deprecated This function is deprecated; use the PttCrawler class instead.
 */
declare const _getResults: (options?: CrawlerOptions) => Promise<MergedPages>;
/**
 * Legacy function-style entry point, exported as `close`.
 * @deprecated This function is deprecated; use the PttCrawler class instead.
 */
declare const _close: () => Promise<void>;
// Re-export the deprecated helpers under their public (underscore-free) names.
export { _initialize as initialize, _getResults as getResults, _close as close };