defuddle
Version:
Extract article content and metadata from web pages.
32 lines (31 loc) • 1.19 kB
TypeScript
import { DebugRemoval } from './types';
export interface ContentScore {
score: number;
element: Element;
}
export declare class ContentScorer {
private doc;
private debug;
constructor(doc: Document, debug?: boolean);
static scoreElement(element: Element): number;
static findBestElement(elements: Element[], minScore?: number): Element | null;
/**
* Scores blocks based on their content and structure
* and removes those that are likely not content.
*/
static scoreAndRemove(doc: Document, debug?: boolean, debugRemovals?: DebugRemoval[], mainContent?: Element | null): void;
/**
* Determines if an element is likely to be content based on its structure and attributes.
*/
private static isLikelyContent;
/**
* Scores a block element based on various criteria to determine if it's likely not content.
* Returns a negative score if the element is likely not content, a positive score if it is.
*/
private static scoreNonContentBlock;
/**
* Detects article card grids: blocks with 3+ headings and 2+ images
* but very little prose per heading.
*/
private static isCardGrid;
}