UNPKG

@llml-browser/types

Version:

TypeScript types and schemas for the @llml-browser API

913 lines (907 loc) 31.9 kB
/* Bundled type declarations: zod option schemas and request/response types for the links route, plus the Cheerio ScrapedData shape, all re-exported under short aliases at the end of the module. */ import { c as ExtractedLinks } from './index-ChkdX0Va.js'; import { c as PageMetadata } from './index-CJbaIceM.js'; import { z } from 'zod'; /** * Schema for content extraction options. * Defines options for extracting different types of content from a webpage. * * @property metadataOptions - Options for metadata extraction * @property linksOptions - Options for link extraction * @property cleanedHtmlOptions - Options for HTML cleaning */ declare const contentOptionsSchema: z.ZodObject<{ /** * Options for metadata extraction. * Controls how metadata like title, description, etc. are extracted. */ metadataOptions: z.ZodOptional<z.ZodObject<{ title: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; description: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; language: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; canonical: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; robots: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; author: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; keywords: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; favicon: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; openGraph: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; twitter: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; isIframeAllowed: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; }, "strip", z.ZodTypeAny, { title: boolean; description: boolean; language: boolean; canonical: boolean; robots: boolean; author: boolean; keywords: boolean; favicon: boolean; openGraph: boolean; twitter: boolean; isIframeAllowed: boolean; }, { title?: boolean | undefined; description?: boolean | undefined; language?: boolean | undefined; canonical?: boolean | undefined; robots?: boolean | undefined; author?: boolean | undefined; keywords?: boolean | undefined; favicon?: boolean | undefined; openGraph?: boolean | undefined; twitter?: boolean | undefined; isIframeAllowed?: boolean | undefined; }>>; /** * Options for link extraction. * Controls how links are extracted and categorized. 
*/ linksOptions: z.ZodOptional<z.ZodObject<{ includeExternal: z.ZodOptional<z.ZodBoolean>; includeMedia: z.ZodOptional<z.ZodBoolean>; excludePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>; removeQueryParams: z.ZodOptional<z.ZodBoolean>; }, "strict", z.ZodTypeAny, { includeExternal?: boolean | undefined; includeMedia?: boolean | undefined; excludePatterns?: string[] | undefined; removeQueryParams?: boolean | undefined; }, { includeExternal?: boolean | undefined; includeMedia?: boolean | undefined; excludePatterns?: string[] | undefined; removeQueryParams?: boolean | undefined; }>>; /** * Options for HTML cleaning. * Controls how HTML is sanitized and cleaned. */ cleanedHtmlOptions: z.ZodOptional<z.ZodObject<{ allowedHTMLTags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>; disallowedHTMLTags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>; extractMainContent: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; documentBaseUrl: z.ZodOptional<z.ZodString>; removeBase64Images: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; }, "strict", z.ZodTypeAny, { extractMainContent: boolean; removeBase64Images: boolean; allowedHTMLTags?: string[] | undefined; disallowedHTMLTags?: string[] | undefined; documentBaseUrl?: string | undefined; }, { allowedHTMLTags?: string[] | undefined; disallowedHTMLTags?: string[] | undefined; extractMainContent?: boolean | undefined; documentBaseUrl?: string | undefined; removeBase64Images?: boolean | undefined; }>>; }, "strip", z.ZodTypeAny, { metadataOptions?: { title: boolean; description: boolean; language: boolean; canonical: boolean; robots: boolean; author: boolean; keywords: boolean; favicon: boolean; openGraph: boolean; twitter: boolean; isIframeAllowed: boolean; } | undefined; linksOptions?: { includeExternal?: boolean | undefined; includeMedia?: boolean | undefined; excludePatterns?: string[] | undefined; removeQueryParams?: boolean | undefined; } | undefined; cleanedHtmlOptions?: { extractMainContent: boolean; removeBase64Images: boolean; 
allowedHTMLTags?: string[] | undefined; disallowedHTMLTags?: string[] | undefined; documentBaseUrl?: string | undefined; } | undefined; }, { metadataOptions?: { title?: boolean | undefined; description?: boolean | undefined; language?: boolean | undefined; canonical?: boolean | undefined; robots?: boolean | undefined; author?: boolean | undefined; keywords?: boolean | undefined; favicon?: boolean | undefined; openGraph?: boolean | undefined; twitter?: boolean | undefined; isIframeAllowed?: boolean | undefined; } | undefined; linksOptions?: { includeExternal?: boolean | undefined; includeMedia?: boolean | undefined; excludePatterns?: string[] | undefined; removeQueryParams?: boolean | undefined; } | undefined; cleanedHtmlOptions?: { allowedHTMLTags?: string[] | undefined; disallowedHTMLTags?: string[] | undefined; extractMainContent?: boolean | undefined; documentBaseUrl?: string | undefined; removeBase64Images?: boolean | undefined; } | undefined; }>; /** * Schema for tree options. * Defines options for building a site map tree. * * @property folderFirst - Whether to place folders before leaf nodes in the tree * @property linksOrder - How to order links within each folder * @property extractedLinks - Whether to include extracted links for each node in the tree * @property subdomainAsRootUrl - Whether to keep the subdomain as the root URL */ declare const treeOptionsSchema: z.ZodObject<{ /** * Whether to place folders before leaf nodes in the tree. * Default: true */ folderFirst: z.ZodEffects<z.ZodOptional<z.ZodBoolean>, boolean | undefined, unknown>; /** * How to order links within each folder: * - 'page' preserve the original document order * - 'alphabetical' sort A→Z by URL * Default: 'page' */ linksOrder: z.ZodOptional<z.ZodEnum<["page", "alphabetical"]>>; /** * Whether to include extracted links for each node in the tree. * Default: true */ extractedLinks: z.ZodEffects<z.ZodOptional<z.ZodBoolean>, boolean | undefined, unknown>; /** * Whether to keep the subdomain as the root URL. 
* Default: true * e.g., if false: rootUrl: https://swr.vercel.app -> https://vercel.app */ subdomainAsRootUrl: z.ZodEffects<z.ZodOptional<z.ZodBoolean>, boolean | undefined, unknown>; }, "strip", z.ZodTypeAny, { folderFirst?: boolean | undefined; linksOrder?: "page" | "alphabetical" | undefined; extractedLinks?: boolean | undefined; subdomainAsRootUrl?: boolean | undefined; }, { folderFirst?: unknown; linksOrder?: "page" | "alphabetical" | undefined; extractedLinks?: unknown; subdomainAsRootUrl?: unknown; }>; /** * Schema for links route options. * Defines the configuration for a links operation. * * @property url - The URL to scrape * @property tree - Whether to build a site map tree * @property metadata - Whether to extract metadata from the page * @property cleanedHtml - Whether to return cleaned HTML * @property robots - Whether to fetch and parse robots.txt * @property sitemapXML - Whether to fetch and parse sitemap.xml * @property folderFirst - Whether to place folders before leaf nodes in the tree * @property linksOrder - How to order links within each folder * @property extractedLinks - Whether to include extracted links for each node in the tree * @property metadataOptions - Options for metadata extraction * @property linksOptions - Options for link extraction * @property cleanedHtmlOptions - Options for HTML cleaning * @property subdomainAsRootUrl - Whether to keep the subdomain as the root URL * * @example * ```typescript * const options = { * url: "https://example.com", * tree: true, * metadata: true, * cleanedHtml: false, * }; * ``` */ declare const linksOptionsSchema: z.ZodObject<{ /** * Options for metadata extraction. * Controls how metadata like title, description, etc. are extracted. 
*/ metadataOptions: z.ZodOptional<z.ZodObject<{ title: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; description: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; language: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; canonical: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; robots: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; author: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; keywords: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; favicon: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; openGraph: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; twitter: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; isIframeAllowed: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; }, "strip", z.ZodTypeAny, { title: boolean; description: boolean; language: boolean; canonical: boolean; robots: boolean; author: boolean; keywords: boolean; favicon: boolean; openGraph: boolean; twitter: boolean; isIframeAllowed: boolean; }, { title?: boolean | undefined; description?: boolean | undefined; language?: boolean | undefined; canonical?: boolean | undefined; robots?: boolean | undefined; author?: boolean | undefined; keywords?: boolean | undefined; favicon?: boolean | undefined; openGraph?: boolean | undefined; twitter?: boolean | undefined; isIframeAllowed?: boolean | undefined; }>>; /** * Options for link extraction. * Controls how links are extracted and categorized. */ linksOptions: z.ZodOptional<z.ZodObject<{ includeExternal: z.ZodOptional<z.ZodBoolean>; includeMedia: z.ZodOptional<z.ZodBoolean>; excludePatterns: z.ZodOptional<z.ZodArray<z.ZodString, "many">>; removeQueryParams: z.ZodOptional<z.ZodBoolean>; }, "strict", z.ZodTypeAny, { includeExternal?: boolean | undefined; includeMedia?: boolean | undefined; excludePatterns?: string[] | undefined; removeQueryParams?: boolean | undefined; }, { includeExternal?: boolean | undefined; includeMedia?: boolean | undefined; excludePatterns?: string[] | undefined; removeQueryParams?: boolean | undefined; }>>; /** * Options for HTML cleaning. 
* Controls how HTML is sanitized and cleaned. */ cleanedHtmlOptions: z.ZodOptional<z.ZodObject<{ allowedHTMLTags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>; disallowedHTMLTags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>; extractMainContent: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; documentBaseUrl: z.ZodOptional<z.ZodString>; removeBase64Images: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>; }, "strict", z.ZodTypeAny, { extractMainContent: boolean; removeBase64Images: boolean; allowedHTMLTags?: string[] | undefined; disallowedHTMLTags?: string[] | undefined; documentBaseUrl?: string | undefined; }, { allowedHTMLTags?: string[] | undefined; disallowedHTMLTags?: string[] | undefined; extractMainContent?: boolean | undefined; documentBaseUrl?: string | undefined; removeBase64Images?: boolean | undefined; }>>; /** * Whether to place folders before leaf nodes in the tree. * Default: true */ folderFirst: z.ZodEffects<z.ZodOptional<z.ZodBoolean>, boolean | undefined, unknown>; /** * How to order links within each folder: * - 'page' preserve the original document order * - 'alphabetical' sort A→Z by URL * Default: 'page' */ linksOrder: z.ZodOptional<z.ZodEnum<["page", "alphabetical"]>>; /** * Whether to include extracted links for each node in the tree. * Default: true */ extractedLinks: z.ZodEffects<z.ZodOptional<z.ZodBoolean>, boolean | undefined, unknown>; /** * Whether to keep the subdomain as the root URL. * Default: true * e.g., if false: rootUrl: https://swr.vercel.app -> https://vercel.app */ subdomainAsRootUrl: z.ZodEffects<z.ZodOptional<z.ZodBoolean>, boolean | undefined, unknown>; /** * The URL to scrape. * Must be a valid URL string. */ url: z.ZodString; /** * Whether to build a site map tree. * Default: true */ tree: z.ZodEffects<z.ZodOptional<z.ZodBoolean>, boolean | undefined, unknown>; /** * Whether to extract metadata from the page. 
* Default: true */ metadata: z.ZodEffects<z.ZodOptional<z.ZodBoolean>, boolean | undefined, unknown>; /** * Whether to return cleaned HTML. * Default: false */ cleanedHtml: z.ZodEffects<z.ZodOptional<z.ZodBoolean>, boolean | undefined, unknown>; /** * Whether to fetch and parse robots.txt. * Default: false */ robots: z.ZodEffects<z.ZodOptional<z.ZodBoolean>, boolean | undefined, unknown>; /** * Whether to fetch and parse sitemap.xml. * Default: false */ sitemapXML: z.ZodEffects<z.ZodOptional<z.ZodBoolean>, boolean | undefined, unknown>; }, "strip", z.ZodTypeAny, { url: string; cleanedHtml?: boolean | undefined; robots?: boolean | undefined; metadataOptions?: { title: boolean; description: boolean; language: boolean; canonical: boolean; robots: boolean; author: boolean; keywords: boolean; favicon: boolean; openGraph: boolean; twitter: boolean; isIframeAllowed: boolean; } | undefined; linksOptions?: { includeExternal?: boolean | undefined; includeMedia?: boolean | undefined; excludePatterns?: string[] | undefined; removeQueryParams?: boolean | undefined; } | undefined; cleanedHtmlOptions?: { extractMainContent: boolean; removeBase64Images: boolean; allowedHTMLTags?: string[] | undefined; disallowedHTMLTags?: string[] | undefined; documentBaseUrl?: string | undefined; } | undefined; folderFirst?: boolean | undefined; linksOrder?: "page" | "alphabetical" | undefined; extractedLinks?: boolean | undefined; subdomainAsRootUrl?: boolean | undefined; tree?: boolean | undefined; metadata?: boolean | undefined; sitemapXML?: boolean | undefined; }, { url: string; cleanedHtml?: unknown; robots?: unknown; metadataOptions?: { title?: boolean | undefined; description?: boolean | undefined; language?: boolean | undefined; canonical?: boolean | undefined; robots?: boolean | undefined; author?: boolean | undefined; keywords?: boolean | undefined; favicon?: boolean | undefined; openGraph?: boolean | undefined; twitter?: boolean | undefined; isIframeAllowed?: boolean | undefined; } | 
undefined; linksOptions?: { includeExternal?: boolean | undefined; includeMedia?: boolean | undefined; excludePatterns?: string[] | undefined; removeQueryParams?: boolean | undefined; } | undefined; cleanedHtmlOptions?: { allowedHTMLTags?: string[] | undefined; disallowedHTMLTags?: string[] | undefined; extractMainContent?: boolean | undefined; documentBaseUrl?: string | undefined; removeBase64Images?: boolean | undefined; } | undefined; folderFirst?: unknown; linksOrder?: "page" | "alphabetical" | undefined; extractedLinks?: unknown; subdomainAsRootUrl?: unknown; tree?: unknown; metadata?: unknown; sitemapXML?: unknown; }>; /** * Type representing options for link scraping operations. * Derived from the linksOptionsSchema. */ type LinksOptions = z.infer<typeof linksOptionsSchema>; /** * @name can be imported as LinksTree or Tree * @description Represents a node in the site map tree. * Each node contains information about a URL and its child pages. * * @property url - The URL of this node * @property rootUrl - The root URL of the website * @property name - The name of this node * @property totalUrls - Total number of URLs in the tree * @property executionTime - Execution time of the request in milliseconds * @property lastUpdated - ISO timestamp when this node was last updated * @property lastVisited - ISO timestamp when this URL was last visited * @property children - Child pages of this URL * @property error - Error message if there was an issue processing this URL * @property metadata - Metadata extracted from the page * @property cleanedHtml - Cleaned HTML content of the page * @property extractedLinks - Extracted links from the page * @property skippedUrls - URLs that were skipped during processing * * @example * ```typescript * const treeNode: LinksTree = { * url: "https://example.com", * rootUrl: "https://example.com", * name: "example", * totalUrls: 10, * executionTime: "1234ms", * lastUpdated: "2025-04-02T14:28:23.000Z", * lastVisited: 
"2025-04-02T14:28:23.000Z", * children: [ * { * url: "https://example.com/about", * name: "about", * lastUpdated: "2025-04-01T10:15:30.000Z", * lastVisited: "2025-04-02T14:28:25.000Z" * } * ], * metadata: { * title: "Example Website", * description: "This is an example website" * }, * extractedLinks: { * internal: [ * 'https://example.com/about', * 'https://example.com/contact' * ], * external: [ * 'https://othersite.com/reference', * 'https://api.example.org/data' * ], * media: { * images: [ * 'https://example.com/images/logo.png', * 'https://example.com/images/banner.jpg' * ], * videos: [ * 'https://example.com/videos/intro.mp4' * ], * documents: [ * 'https://example.com/docs/whitepaper.pdf' * ] * } * }, * skippedUrls: { * internal: [ * { url: "https://example.com/private", reason: "Blocked by robots.txt" } * ], * external: [ * { url: "https://othersite.com", reason: "External domain" } * ] * } * }; * ``` */ interface LinksTree { /** * The URL of this node. */ url: string; /** * The root URL of the website. * This is the domain root, not necessarily the targetUrl. */ rootUrl?: string; /** * The name of this node. */ name?: string; /** * Total number of URLs in the tree. */ totalUrls?: number; /** * Execution time of the request in milliseconds. * Format: string with "ms" suffix (e.g., "1234ms"). */ executionTime?: string; /** * ISO timestamp when this node was last updated. * Format: ISO 8601 string. */ lastUpdated: string; /** * ISO timestamp when this URL was last visited. * Format: ISO 8601 string or null if never visited. */ lastVisited?: string | null; /** * Child pages of this URL. * Each child is another LinksTree node. */ children?: LinksTree[]; /** * Error message if there was an issue processing this URL. */ error?: string; /** * Metadata extracted from the page. * Contains information like title, description, etc. */ metadata?: PageMetadata; /** * Cleaned HTML content of the page. * Contains sanitized HTML with unnecessary elements removed. 
*/ cleanedHtml?: string; /** * Extracted links from the page. * Contains information about the current url's extracted links. */ extractedLinks?: ExtractedLinks; /** * Skipped URLs and their reasons. * Contains information about URLs that were not processed. */ skippedUrls?: SkippedLinks; } /** * Represents a URL that has been visited. * Used to track when URLs were last accessed. * * @property url - The URL that was visited * @property lastVisited - ISO timestamp when this URL was last visited */ interface Visited { /** * The URL that was visited. */ url: string; /** * ISO timestamp when this URL was last visited. * Format: ISO 8601 string or null if never visited. */ lastVisited?: string | null; } /** * Represents a URL that was skipped during scraping. * Includes the reason why it was not processed. * * @property url - The URL that was skipped * @property reason - The reason why this URL was skipped * * @example * ```typescript * const skippedUrl: SkippedUrl = { * url: "https://example.com/private", * reason: "Blocked by robots.txt" * }; * ``` */ interface SkippedUrl { /** * The URL that was skipped. */ url: string; /** * The reason why this URL was skipped. * Examples: "Blocked by robots.txt", "HTTP error", etc. */ reason: string; } /** * Categorized collection of skipped URLs. * Follows the same structure as ExtractedLinks for consistency. * * @property internal - Internal links that were skipped * @property external - External links that were skipped * @property media - Media links that were skipped * @property other - Other links that don't fit into the above categories * * @example * ```typescript * const skippedLinks: SkippedLinks = { * internal: [ * { url: "https://example.com/private", reason: "Blocked by robots.txt" } * ], * external: [ * { url: "https://external.com", reason: "External domain" } * ] * }; * ``` */ interface SkippedLinks { /** * Internal links that were skipped. * These are links within the same domain. 
*/ internal?: SkippedUrl[]; /** * External links that were skipped. * These are links to other domains. */ external?: SkippedUrl[]; /** * Media links that were skipped. * Categorized by media type. */ media?: { /** * Image links that were skipped. */ images?: SkippedUrl[]; /** * Video links that were skipped. */ videos?: SkippedUrl[]; /** * Document links that were skipped. */ documents?: SkippedUrl[]; }; /** * Other links that don't fit into the above categories. */ other?: SkippedUrl[]; } /** * Contains robots.txt and sitemap.xml content. * * @property robots - Content of the robots.txt file * @property sitemapXML - Content of the sitemap.xml file * * @example * ```typescript * const metaFiles: MetaFiles = { * robots: "User-agent: *\nDisallow: /private/", * sitemapXML: "<?xml version=\"1.0\"?><urlset>...</urlset>" * }; * ``` */ interface MetaFiles { /** * Content of the robots.txt file. */ robots?: string; /** * Content of the sitemap.xml file. */ sitemapXML?: string; } /** * Base interface for links POST route responses. * Contains common properties shared by both success and error responses. * * @property success - Whether the operation was successful * @property targetUrl - The URL that was requested to be scraped * @property timestamp - ISO timestamp when the request was processed */ interface LinksPostResponseBase { /** * Whether the operation was successful. * Narrowed to `true` by LinksPostSuccessResponse and to `false` by LinksPostErrorResponse. */ success: boolean; /** * The URL that was requested to be scraped. */ targetUrl: string; /** * ISO timestamp when the request was processed. * Format: ISO 8601 string. */ timestamp: string; } /** * Represents a successful links POST route response. * Contains the scraped data and related information. 
* * @property success - Whether the operation was successful * @property targetUrl - The URL that was requested to be scraped * @property timestamp - ISO timestamp when the request was processed * @property executionTime - Execution time of the request in milliseconds * @property ancestors - Array of parent URLs leading to this URL * @property skippedUrls - URLs that were skipped during processing * @property extractedLinks - Extracted links from the page * @property tree - Site map tree starting from the root URL * * @example * ```typescript * const successResponse: LinksPostSuccessResponse = { * success: true, * targetUrl: "https://example.com", * timestamp: "2025-04-02T14:28:23.000Z", * executionTime: "1234ms", * ancestors: ["https://example.com", "https://example.com/about"], * skippedUrls: { * internal: [ * { url: "https://example.com/private", reason: "Blocked by robots.txt" } * ], * external: [ * { url: "https://othersite.com/reference", reason: "External domain" } * ], * media: { * images: [ * { url: "https://example.com/images/logo.png", reason: "HTTP error" } * ], * videos: [ * { url: "https://example.com/videos/intro.mp4", reason: "HTTP error" } * ], * documents: [ * { url: "https://example.com/docs/whitepaper.pdf", reason: "HTTP error" } * ] * } * }, * tree: { * url: "https://example.com", * lastUpdated: "2025-04-02T14:28:23.000Z", * children: [...] * } * }; * ``` */ interface LinksPostSuccessResponse extends LinksPostResponseBase, Omit<Partial<ScrapedData>, 'rawHtml'> { /** * Whether the operation was successful. * Will always be true for successful responses. */ success: true; /** * Execution time of the request in milliseconds. * Format: string with "ms" suffix (e.g., "1234ms"). */ executionTime?: string; /** * Array of parent URLs leading to this URL. * Represents the path in the site hierarchy. */ ancestors?: string[]; /** * URLs that were skipped during processing. * Includes reasons why they were skipped. 
*/ skippedUrls?: SkippedLinks; /** * Extracted links from the page. * Categorized by type (internal, external, media). */ extractedLinks?: ExtractedLinks; /** * Site map tree starting from the root URL. * Only included if tree generation was requested. 
*/ tree?: LinksTree | null; } /** * Represents an error response from a links POST route. * Contains information about what went wrong. * * @property success - Whether the operation was successful * @property targetUrl - The URL that was requested to be scraped * @property timestamp - ISO timestamp when the request was processed * @property error - Error message describing what went wrong * @property tree - Partial site map tree if available * * @example * ```typescript * const errorResponse: LinksPostErrorResponse = { * success: false, * targetUrl: "https://example.com", * timestamp: "2025-04-02T14:28:23.000Z", * error: "Failed to connect to the server" * }; * ``` */ interface LinksPostErrorResponse extends LinksPostResponseBase { /** * Whether the operation was successful. * Will always be false for error responses. */ success: false; /** * Error message describing what went wrong. * Provides details about the failure reason. */ error: string; /** * Partial site map tree if available. * May contain data collected before the error occurred. */ tree?: LinksTree | null; } /** * Union type representing either a successful or failed link scraping operation. * Uses a discriminated union pattern with the 'success' property as the discriminator. 
* * @example * ```typescript * function handleResponse(response: LinksPostResponse) { * if (response.success) { * // TypeScript knows this is a LinksPostSuccessResponse * console.log(response.metadata?.title); * } else { * // TypeScript knows this is a LinksPostErrorResponse * console.error(response.error); * } * } * ``` */ type LinksPostResponse = LinksPostSuccessResponse | LinksPostErrorResponse; type LinksRouterTypes_LinksOptions = LinksOptions; type LinksRouterTypes_LinksPostErrorResponse = LinksPostErrorResponse; type LinksRouterTypes_LinksPostResponse = LinksPostResponse; type LinksRouterTypes_LinksPostSuccessResponse = LinksPostSuccessResponse; type LinksRouterTypes_LinksTree = LinksTree; type LinksRouterTypes_MetaFiles = MetaFiles; type LinksRouterTypes_SkippedLinks = SkippedLinks; type LinksRouterTypes_SkippedUrl = SkippedUrl; type LinksRouterTypes_Visited = Visited; declare const LinksRouterTypes_contentOptionsSchema: typeof contentOptionsSchema; declare const LinksRouterTypes_linksOptionsSchema: typeof linksOptionsSchema; declare const LinksRouterTypes_treeOptionsSchema: typeof treeOptionsSchema; declare namespace LinksRouterTypes { export { type LinksRouterTypes_LinksOptions as LinksOptions, type LinksRouterTypes_LinksPostErrorResponse as LinksPostErrorResponse, type LinksRouterTypes_LinksPostResponse as LinksPostResponse, type LinksRouterTypes_LinksPostSuccessResponse as LinksPostSuccessResponse, type LinksRouterTypes_LinksTree as LinksTree, type LinksRouterTypes_MetaFiles as MetaFiles, type LinksRouterTypes_SkippedLinks as SkippedLinks, type LinksRouterTypes_SkippedUrl as SkippedUrl, type LinksRouterTypes_Visited as Visited, LinksRouterTypes_contentOptionsSchema as contentOptionsSchema, LinksRouterTypes_linksOptionsSchema as linksOptionsSchema, LinksRouterTypes_treeOptionsSchema as treeOptionsSchema }; } /** * Represents data scraped from a webpage using Cheerio. * Contains various extracted elements and metadata from the target page. 
* * @interface ScrapedData * * @property title - The title of the webpage extracted from the title tag * @property rawHtml - The original unmodified HTML content of the webpage * @property description - The meta description of the webpage * @property metadata - Optional structured metadata extracted from the page (OpenGraph, Twitter Cards, etc.) * @property cleanedHtml - Optional sanitized version of the HTML with unnecessary elements removed * @property metaFiles - Optional metadata files like robots.txt and sitemap.xml * * @example * ```typescript * const scrapedData: ScrapedData = { * title: "Example Website - Home Page", * rawHtml: "<html><head><title>Example Website - Home Page</title></head><body>...</body></html>", * description: "This is an example website demonstrating web scraping capabilities.", * metadata: { * title: "Example Website - Home Page", * description: "This is an example website demonstrating web scraping capabilities.", * ogTitle: "Example Website", * // other metadata properties * }, * cleanedHtml: "<div><h1>Example Website</h1><p>Main content...</p></div>", * metaFiles: { * robots: "User-agent: *\nDisallow: /admin/", * sitemapXML: "<?xml version=\"1.0\"?><urlset>...</urlset>" * } * }; * ``` */ interface ScrapedData { title: string; rawHtml: string; description: string; metadata?: PageMetadata; cleanedHtml?: string; metaFiles?: MetaFiles; } type CheerioTypes_ScrapedData = ScrapedData; declare namespace CheerioTypes { export type { CheerioTypes_ScrapedData as ScrapedData }; } export { CheerioTypes as C, LinksRouterTypes as L, type MetaFiles as M, type SkippedUrl as S, type Visited as V, type LinksTree as a, type LinksOptions as b, contentOptionsSchema as c, type SkippedLinks as d, type LinksPostSuccessResponse as e, type LinksPostErrorResponse as f, type LinksPostResponse as g, type ScrapedData as h, linksOptionsSchema as l, treeOptionsSchema as t };