site-metadata-extractor
Version:
web(site) resource metadata extractor
138 lines (137 loc) • 3.98 kB
TypeScript
import type { Cheerio, CheerioAPI } from "cheerio";
import type { AnyNode } from "domhandler";
import { NewsArticle, Article } from "schema-dts";
import { LinkObj, VideoAttrs } from "./extractor";
/**
 * Eagerly evaluated page metadata. This is the return type of the
 * default-exported `siteMetadataExtractor`.
 *
 * Every property is required except `links`, `text` and `videos`, which may
 * be absent from the object.
 * NOTE(review): this declaration does not say under which conditions those
 * three properties are omitted — confirm against the implementation.
 */
export interface PageData {
author: string[];
canonicalLink: string;
copyright: string;
/** Plain string; NOTE(review): the date format is not specified by this declaration. */
date: string;
description: string;
favicon: string;
image: string;
/** Typed with the schema-dts `NewsArticle` / `Article` shapes; may be `null`, so guard before use. */
jsonld: NewsArticle | Article | null;
keywords: string;
lang: string;
/** Optional; elements use the `LinkObj` type imported from "./extractor". */
links?: LinkObj[];
locale: string;
origin: string;
publisher: string;
siteName: string;
softTitle: string;
tags: string[];
/** Optional. */
text?: string;
title: string;
type: string;
/** Optional; elements use the `VideoAttrs` type imported from "./extractor". */
videos?: VideoAttrs[];
}
/**
 * Accessor-style counterpart of `PageData`; this is the return type of the
 * exported `lazy` function.
 *
 * It exposes the same property names as `PageData`, but each member is a
 * zero-argument function returning that property's value type. Unlike
 * `PageData`, the `links`, `text` and `videos` members are required here.
 * NOTE(review): whether results are cached between calls is not visible in
 * this declaration — confirm before calling an accessor repeatedly.
 */
export interface LazyExtractor {
author: () => string[];
canonicalLink: () => string;
copyright: () => string;
date: () => string;
description: () => string;
favicon: () => string;
image: () => string;
/** May return `null`; guard before use. */
jsonld: () => NewsArticle | Article | null;
keywords: () => string;
lang: () => string;
links: () => LinkObj[];
locale: () => string;
origin: () => string;
publisher: () => string;
siteName: () => string;
softTitle: () => string;
tags: () => string[];
text: () => string;
title: () => string;
type: () => string;
videos: () => VideoAttrs[];
}
/**
 * Options bag accepted as the optional second argument of `extractFromHtml`,
 * `extractMetadataOnly` and `extractLazy`.
 *
 * All properties are optional.
 * NOTE(review): default values and the exact effect of each option are not
 * declared here. The `max*` names suggest upper bounds (string length,
 * number of candidates, readable-text length) — confirm against the
 * implementation.
 */
export interface ExtractOptions {
inputUrl?: string;
finalUrl?: string;
lang?: string;
maxStringLength?: number;
maxCandidates?: number;
maxReadableTextLength?: number;
}
/**
 * One candidate asset. `ExtractedResource` uses this type for
 * `faviconCandidates`, `imageCandidates` and `primaryImage`.
 *
 * Only `url` and `source` are required; the remaining properties are optional.
 * NOTE(review): the set of values `source` can take and the units of
 * `width` / `height` are not specified by this declaration.
 */
export interface AssetCandidate {
url: string;
source: string;
rel?: string;
type?: string;
sizes?: string;
width?: number;
height?: number;
alt?: string;
}
/**
 * One link entry. This is the element type of `ExtractedResource.links` and
 * of the array returned by `LazyExtractedResource.links()`.
 *
 * `url` and `text` are required; `rel` and `title` are optional.
 */
export interface ExtractedLink {
url: string;
text: string;
rel?: string;
title?: string;
}
/**
 * One video entry. This is the element type of `ExtractedResource.videos` and
 * of the array returned by `LazyExtractedResource.videos()`.
 *
 * `url` and `source` are required; `width`, `height` and `type` are optional.
 * NOTE(review): the set of values `source` can take is not specified by this
 * declaration.
 */
export interface ExtractedVideo {
url: string;
source: string;
width?: number;
height?: number;
type?: string;
}
/**
 * Text statistics; this is the type of `ExtractedResource.textStats`.
 *
 * Holds three numeric counts and a `truncated` flag, all required.
 * NOTE(review): presumably these describe `ExtractedResource.readableText`,
 * and `truncated` presumably relates to a length limit — neither is stated
 * by this declaration; confirm against the implementation.
 */
export interface TextStats {
charCount: number;
wordCount: number;
sentenceCount: number;
truncated: boolean;
}
/**
 * Information about an extraction run; this is the type of
 * `ExtractedResource.extraction`.
 *
 * All properties are required.
 * NOTE(review): the numeric range of `confidence` and the format of the two
 * version strings are not specified by this declaration.
 */
export interface ExtractionMetadata {
packageVersion: string;
strategyVersion: string;
warnings: string[];
confidence: number;
}
/**
 * Full extraction result. Returned by `extractFromHtml` and
 * `extractMetadataOnly`, and by the `metadata()` and `extract()` members of
 * `LazyExtractedResource`.
 *
 * Every property is required; there are no optional members.
 * NOTE(review): what value a string property holds when nothing was found
 * (for example an empty string) is not specified by this declaration.
 */
export interface ExtractedResource {
inputUrl: string;
finalUrl: string;
canonicalUrl: string;
normalizedUrl: string;
domain: string;
title: string;
softTitle: string;
description: string;
author: string[];
publisher: string;
siteName: string;
lang: string;
locale: string;
/** Plain string; NOTE(review): the date format is not specified by this declaration. */
publishedAt: string;
/** Plain string; NOTE(review): the date format is not specified by this declaration. */
modifiedAt: string;
faviconCandidates: AssetCandidate[];
imageCandidates: AssetCandidate[];
/** May be `null`; guard before use. */
primaryImage: AssetCandidate | null;
/** Elements are typed `unknown`, so callers must narrow each entry before reading it. */
jsonld: unknown[];
/** String keys mapped to arrays of string values. */
rawMeta: Record<string, string[]>;
links: ExtractedLink[];
videos: ExtractedVideo[];
readableText: string;
textStats: TextStats;
extraction: ExtractionMetadata;
}
/**
 * Accessor-style extraction handle; this is the return type of `extractLazy`.
 *
 * Each member is a zero-argument function.
 * NOTE(review): `metadata()` and `extract()` both return a full
 * `ExtractedResource`; how they differ, and whether any results are cached
 * between calls, is not visible in this declaration — confirm against the
 * implementation.
 */
export interface LazyExtractedResource {
metadata: () => ExtractedResource;
readableText: () => string;
links: () => ExtractedLink[];
videos: () => ExtractedVideo[];
extract: () => ExtractedResource;
}
/**
 * Produces an `ExtractedResource` from an HTML string.
 *
 * @param html - HTML source to process.
 * @param options - Optional `ExtractOptions`; may be omitted.
 * @returns The extraction result.
 */
export declare function extractFromHtml(html: string, options?: ExtractOptions): ExtractedResource;
/**
 * Produces an `ExtractedResource` from an HTML string; the signature and
 * return type are the same as those of `extractFromHtml`.
 *
 * NOTE(review): the name suggests part of the extraction work is skipped,
 * but this declaration does not show which `ExtractedResource` properties
 * are affected or what they contain — confirm against the implementation.
 *
 * @param html - HTML source to process.
 * @param options - Optional `ExtractOptions`; may be omitted.
 * @returns The extraction result.
 */
export declare function extractMetadataOnly(html: string, options?: ExtractOptions): ExtractedResource;
/**
 * Returns a `LazyExtractedResource` for an HTML string, i.e. an object of
 * zero-argument accessor functions rather than a finished result object.
 *
 * NOTE(review): how much work happens at call time versus on first use of an
 * accessor is not visible in this declaration.
 *
 * @param html - HTML source to process.
 * @param options - Optional `ExtractOptions`; may be omitted.
 * @returns The accessor object.
 */
export declare function extractLazy(html: string, options?: ExtractOptions): LazyExtractedResource;
/**
 * Default export of the module: takes markup and a resource URL and returns
 * an eagerly populated `PageData` object.
 *
 * @param markup - Markup string to process.
 * @param resourceUrl - URL string for the resource. NOTE(review): how the URL
 *   is used is not visible in this declaration.
 * @param lang - Optional language string; may be omitted.
 * @returns The extracted `PageData`.
 */
declare const siteMetadataExtractor: (markup: string, resourceUrl: string, lang?: string) => PageData;
export default siteMetadataExtractor;
/**
 * Accessor-style variant of the default export: takes the same kinds of
 * arguments but returns a `LazyExtractor`, whose members are zero-argument
 * functions, instead of a `PageData` object.
 *
 * @param html - HTML string to process.
 * @param resourceUrl - URL string for the resource. NOTE(review): how the URL
 *   is used is not visible in this declaration.
 * @param language - Optional language string; may be omitted.
 * @returns The accessor object.
 */
export declare const lazy: (html: string, resourceUrl: string, language?: string) => LazyExtractor;
/**
 * Builds a `CheerioAPI` document handle from an HTML string.
 *
 * NOTE(review): the name suggests the document is cleaned in some way, but
 * what is removed or rewritten is not visible in this declaration.
 *
 * @param html - HTML string to load.
 * @returns A `CheerioAPI` instance for the document.
 */
export declare function getCleanedDoc(html: string): CheerioAPI;
/**
 * Builds a `CheerioAPI` document handle from an HTML string.
 *
 * NOTE(review): the parser options used are not visible in this declaration.
 *
 * @param html - HTML string to load.
 * @returns A `CheerioAPI` instance for the document.
 */
export declare function getParsedDoc(html: string): CheerioAPI;
/**
 * Builds a `CheerioAPI` document handle from an HTML string; the signature is
 * the same as that of `getParsedDoc`.
 *
 * NOTE(review): the name suggests a parse tailored to text extraction, but
 * how it differs from `getParsedDoc` is not visible in this declaration.
 *
 * @param html - HTML string to load.
 * @returns A `CheerioAPI` instance for the document.
 */
export declare function getParsedDocForText(html: string): CheerioAPI;
/**
 * Returns a Cheerio selection (`Cheerio<AnyNode>`) taken from the given
 * document.
 *
 * NOTE(review): the name suggests this picks the document's main content
 * node; the selection criteria, and what is returned when nothing qualifies
 * (for example an empty selection), are not visible in this declaration.
 *
 * @param doc - A `CheerioAPI` document handle.
 * @param lang - Language string (required here). NOTE(review): the expected
 *   format and how it influences the result are not specified.
 * @returns The selected node(s) wrapped in a Cheerio selection.
 */
export declare function getTopNode(doc: CheerioAPI, lang: string): Cheerio<AnyNode>;