/*
 * defuddle
 * Version: (not specified)
 * Extract article content and metadata from web pages.
 * 130 lines (129 loc) • 4.51 kB
 * TypeScript
 */
import { DefuddleOptions, DefuddleResponse } from './types';
/**
* Type declaration for the Defuddle class, which extracts article content
* and metadata from a web page document.
*
* The public surface declared here is the constructor, parse(),
* parseAsync() and fetchAsyncVariables(). Every other member is private
* and is listed by name only: this declaration carries no parameter or
* return types for private members, so their exact contracts must be
* checked against the implementation.
*/
export declare class Defuddle {
/** Read-only after construction. Presumably the Document passed to the constructor — confirm in the implementation. */
private readonly doc;
/** Presumably the DefuddleOptions passed to the constructor — confirm in the implementation. */
private options;
/** NOTE(review): name suggests a debug switch used together with _log; confirm. */
private debug;
/**
* NOTE(review): _schemaOrgData and _schemaOrgExtracted look like the cache
* (value plus "already extracted" marker) behind getSchemaOrgData; confirm.
*/
private _schemaOrgData;
private _schemaOrgExtracted;
/** NOTE(review): presumably cached results of meta tag collection and metadata extraction; confirm. */
private _metaTags;
private _metadata;
/** NOTE(review): presumably intermediate results consumed by applyMobileStyles and removeSmallImages; confirm. */
private _mobileStyles;
private _smallImages;
/**
* Create a new Defuddle instance
* @param doc - The document to parse
* @param options - Options for parsing (optional)
*/
constructor(doc: Document, options?: DefuddleOptions);
/**
* Lazily extract and cache schema.org data. Must be called before
* parse() strips script tags from the document.
*/
private getSchemaOrgData;
/**
* Parse the document and extract its main content.
* Other comments in this class note that parse() strips script tags from
* the document, so the document passed to the constructor is modified.
* @returns The extraction result as a DefuddleResponse
*/
parse(): DefuddleResponse;
/**
* Extract text content from schema.org data (e.g. SocialMediaPosting, Article)
*/
private _getSchemaText;
/**
* Remove dangerous elements and attributes from this.doc.
* Called after parseInternal so that extractors and schema extraction
* can still read script tags they depend on.
*/
private _stripUnsafeElements;
/**
* Find a DOM element whose text matches the schema.org text content.
* Used when the content scorer picked the wrong element from a feed page.
* Returns the element's inner HTML including sibling media (images, etc.)
*/
private _findContentBySchemaText;
/**
* Get the largest available src from an img element,
* checking srcset for higher-resolution versions.
*/
private _getLargestImageSrc;
/**
* Parse the document asynchronously. Checks for extractors that prefer
* async (e.g. YouTube transcripts) before sync, then falls back to async
* extractors if sync parse yields no content.
* @returns A promise that resolves to the extraction result
*/
parseAsync(): Promise<DefuddleResponse>;
/**
* Fetch only async variables (e.g. transcript) without re-parsing.
* Safe to call after parse() — uses cached schema.org data since
* parse() strips script tags from the document.
* @returns A promise that resolves to a string-keyed map of string values,
* or to null. NOTE(review): the conditions that produce null are not
* visible in this declaration; confirm in the implementation.
*/
fetchAsyncVariables(): Promise<{
[key: string]: string;
} | null>;
/** NOTE(review): undocumented; presumably the helper that runs an async extractor for parseAsync — confirm. */
private tryAsyncExtractor;
/**
* Internal parse method that does the actual work
*/
private parseInternal;
/**
* The private helpers below have no documentation or types in this
* declaration. Judging by name only, they cover word counting, logging,
* media-query and mobile-style handling, removal of images, hidden
* elements and selector matches, and main-content discovery — confirm
* each one against the implementation before relying on it.
*/
private countWords;
private _log;
private _evaluateMediaQueries;
private applyMobileStyles;
private removeImages;
private removeHiddenElements;
private removeBySelector;
private findSmallImages;
private removeSmallImages;
private getElementIdentifier;
private findMainContent;
private findTableBasedContent;
private findContentByScoring;
private getElementSelector;
private getComputedStyle;
/**
* Resolve relative URLs to absolute within a DOM element
*/
private resolveRelativeUrls;
/**
* Flatten shadow DOM content into a cloned document.
* Walks both trees in parallel so positional correspondence is exact.
*/
private flattenShadowRoots;
/**
* Resolve React streaming SSR suspense boundaries.
* React's streaming SSR places content in hidden divs (id="S:0") and
* template placeholders (id="B:0") with $RC scripts to swap them.
* Since we don't execute scripts, we perform the swap manually.
*/
private resolveStreamedContent;
/**
* Replace a shadow DOM host element with a div containing its shadow content.
* Custom elements (tag names with hyphens) would re-initialize when inserted
* into a live DOM, recreating their shadow roots and hiding the content.
*/
private replaceShadowHost;
/**
* Resolve relative URLs in an HTML string
*/
private resolveContentUrls;
/**
* NOTE(review): undocumented; by name these extract schema.org data,
* collect meta tags and decode HTML entities — confirm in the implementation.
*/
private _extractSchemaOrgData;
private _collectMetaTags;
private _decodeHTMLEntities;
/**
* Build a DefuddleResponse from an extractor result with metadata
*/
private buildExtractorResponse;
/**
* Filter extractor variables to only include custom ones
* (exclude standard fields that are already mapped to top-level properties)
*/
private getExtractorVariables;
/**
* Content-based pattern removal for elements that can't be detected by
* CSS selectors (e.g. Tailwind/CSS-in-JS sites with non-semantic class names).
*/
private removeByContentPattern;
/**
* Remove an element's following siblings, and optionally the element itself.
*/
private removeTrailingSiblings;
}