@waynechang65/ptt-crawler
Version:
A web crawler module designed to scrape data from Ptt.
203 lines (202 loc) • 6.82 kB
TypeScript
import { type LaunchOptions } from 'puppeteer';
import { AttemptOptions } from '@lifeomic/attempt';
/**
* Debug settings for the PTT crawler.
* Supplied through the `debug` field of `InitOptions`; every flag is optional.
*/
export interface DebugOptions {
/** Whether to enable debug mode. */
enable?: boolean;
/** Whether to save the crawl result to files. */
saveResultToFiles?: boolean;
/** Whether to print retry information. */
printRetryInfo?: boolean;
/** Whether to print worker information. */
printWorkersInfo?: boolean;
/** Whether to print crawl information. */
printCrawlInfo?: boolean;
}
/**
* Options for initializing the PTT crawler, accepted by `PttCrawler.init()`.
* Every field is optional.
*/
export interface InitOptions {
/** The number of concurrent requests. */
concurrency?: number;
/** The debug options (see `DebugOptions`). */
debug?: DebugOptions;
/** The retry options, typed as `AttemptOptions<void>` from `@lifeomic/attempt`. */
retry?: AttemptOptions<void>;
}
/**
* Options for one crawl of the PTT crawler, accepted by `PttCrawler.crawl()`.
* Every field is optional.
*/
export interface CrawlerOptions {
/** The number of pages to crawl. */
pages?: number;
/** The name of the board to crawl. */
board?: string;
/** Whether to skip pinned posts (posts pinned to the bottom of a board). */
skipPBs?: undefined | boolean;
/** Whether to fetch the content of each post. */
getContents?: undefined | boolean;
/** A callback that receives progress updates as `Progress` objects. */
onProgress?: (progress: Progress) => void;
}
/**
* Represents the progress of the crawler.
* This is the argument type of the `onProgress` callback in `CrawlerOptions`.
*/
export interface Progress {
/** The type of the current operation: crawling pages or fetching contents. */
type: 'crawling_pages' | 'fetching_contents';
/** A human-readable message describing the current status. */
message: string;
/** The number of items completed so far. */
current: number;
/** The total number of items to process. */
total: number;
/** The completion percentage (0-100). */
percent: number;
}
/**
* Represents the merged data from multiple crawled pages.
* The data is laid out as a struct of arrays (one array per post attribute);
* `PttCrawler.resultsToObjects()` converts it into an array of `Post` objects.
*/
export interface MergedPages {
/** Array of post titles. */
titles: string[];
/** Array of post URLs. */
urls: string[];
/** Array of post recommendation counts (rates), kept as strings. */
rates: string[];
/** Array of post authors. */
authors: string[];
/** Array of post dates, kept as strings. */
dates: string[];
/** Array of post marks (e.g., 'M', 'S'). */
marks: string[];
/** Array of post contents. Only available if `getContents` is true. */
contents?: string[];
}
/**
* Represents a single PTT post.
* This is the element type returned by `PttCrawler.resultsToObjects()`.
*/
export interface Post {
/** The title of the post. */
title: string;
/** The URL of the post. */
url: string;
/** The recommendation count of the post, kept as a string. */
rate: string;
/** The author of the post. */
author: string;
/** The date of the post, kept as a string. */
date: string;
/** The mark of the post (e.g., 'M', 'S'). */
mark: string;
/** The full content of the post, including comments. Only available if `getContents` is true. */
content?: string;
}
/**
* Represents a hot board on PTT.
* This is the element type returned by `PttCrawler.getHotBoards()`.
*/
export interface HotBoard {
/** The name of the board (e.g., 'Gossiping'). */
name: string;
/** The classification of the board (e.g., '綜合' = general, '學術' = academic). */
class: string;
/** The title of the board (e.g., '[八卦]' = gossip, '[股票]' = stocks). */
title: string;
}
/**
* A class to crawl posts from a PTT board.
*
* Per the member documentation below: construct an instance, call `init()`
* before any other method, use `crawl()` to collect data, and call `close()`
* to shut down the browser instance.
*/
export declare class PttCrawler {
// Private members: this declaration exposes only their names, not their types.
private options;
private readonly stopSelector;
private readonly puppteerTimeout;
private browser;
private pages;
private scrapingBoard;
private scrapingPages;
private skipBottomPosts;
private this_os;
private getContents;
private concurrency;
private debug;
private retryOpt;
/**
* Creates an instance of PttCrawler.
* @param {LaunchOptions} [options={}] - Puppeteer launch options.
*/
constructor(options?: LaunchOptions);
/**
* Initializes the crawler, launching a browser instance.
* This must be called before any other methods.
* @param {InitOptions} [initOption] - Optional concurrency, debug, and retry settings.
* @returns {Promise<void>} A promise that settles once initialization has finished.
*/
init(initOption?: InitOptions): Promise<void>;
/**
* Starts the crawling process.
* @param {CrawlerOptions} [options={}] - Options for the crawl.
* @returns {Promise<MergedPages>} A promise that resolves to the crawled data.
*/
crawl(options?: CrawlerOptions): Promise<MergedPages>;
/**
* Scrapes a single page of posts. This method is executed in the browser context.
* It robustly parses each post as a unit (.r-ent).
* If skipBPosts is true, it stops collecting when it encounters the separator for pinned posts.
* @private
* @param {boolean} [skipBPosts=true] - Whether to skip bottom pinned posts.
* @returns {CrawlerOnePage} The scraped data from one page.
*/
private _scrapingOnePage;
/**
* Merges data from multiple pages, ensuring the correct chronological order (newest first).
* @private
* @param {CrawlerOnePage[]} pages - An array of scraped page data.
* @returns {MergedPages} The merged data.
*/
private _mergePages;
/**
* Scrapes the content of all posts concurrently.
* Uses multiple pages for speed and blocks unnecessary resources on each page.
* @private
* @param {string[]} aryHref - An array of post URLs.
* @returns {Promise<string[]>} A promise that resolves to an array of post contents.
*/
private _scrapingAllContents;
/**
* Saves an object to a file as a beautified JSON string.
* @private
* @param {object} obj - The object to be saved.
* @param {string} fileWithPath - The full path and filename for the output file.
* @returns {Promise<void>} A promise that resolves once the file is written, or rejects on error.
*/
private _saveObjToFile;
/**
* Closes the browser instance.
* @returns {Promise<void>} A promise that settles once closing has finished.
*/
close(): Promise<void>;
/**
* Transforms the crawled data from a struct of arrays to an array of post objects.
* @param {MergedPages} results - The MergedPages object from the crawl() method.
* @returns {Post[]} An array of Post objects.
*/
resultsToObjects(results: MergedPages): Post[];
/**
* Get hot boards of Ptt. (The data comes from a local JSON file which
* may become outdated and will need to be updated manually from time to time.)
* @returns {HotBoard[]} An array of HotBoard objects.
*/
getHotBoards(): HotBoard[];
}
/**
* Legacy function-style API, exported under the name `initialize`.
* Accepts optional Puppeteer `LaunchOptions` and returns a promise with no value.
* @deprecated The function is deprecated, use the PttCrawler class instead.
*/
declare const _initialize: (options?: LaunchOptions) => Promise<void>;
/**
* Legacy function-style API, exported under the name `getResults`.
* Has the same parameter and return types as `PttCrawler.crawl()`.
* @deprecated The function is deprecated, use the PttCrawler class instead.
*/
declare const _getResults: (options?: CrawlerOptions) => Promise<MergedPages>;
/**
* Legacy function-style API, exported under the name `close`.
* Takes no arguments and returns a promise with no value.
* @deprecated The function is deprecated, use the PttCrawler class instead.
*/
declare const _close: () => Promise<void>;
// Public names of the deprecated function-style API: each `_`-prefixed declaration is exported without the prefix.
export { _initialize as initialize, _getResults as getResults, _close as close };