@llml-browser/types
Version:
TypeScript types and schemas for the @llml-browser API
1 lines • 18.9 kB
Source Map (JSON)
{"version":3,"sources":["../src/routers/links/types.ts"],"sourcesContent":["import type { ScrapedData } from '@/services/cheerio/types';\nimport { HTMLCleaningOptionsSchema } from '@/services/html-cleaning/types';\nimport {\n type ExtractedLinks,\n LinkExtractionOptionsSchema,\n} from '@/services/link/types';\nimport {\n MetadataOptionsSchema,\n type PageMetadata,\n} from '@/services/metadata/types';\nimport { z } from 'zod';\n\n/**\n * Schema for content extraction options.\n * Defines options for extracting different types of content from a webpage.\n *\n * @property metadataOptions - Options for metadata extraction\n * @property linksOptions - Options for link extraction\n * @property cleanedHtmlOptions - Options for HTML cleaning\n */\nexport const contentOptionsSchema = z.object({\n /**\n * Options for metadata extraction.\n * Controls how metadata like title, description, etc. are extracted.\n */\n metadataOptions: MetadataOptionsSchema.optional(),\n\n /**\n * Options for link extraction.\n * Controls how links are extracted and categorized.\n */\n linksOptions: LinkExtractionOptionsSchema.optional(),\n\n /**\n * Options for HTML cleaning.\n * Controls how HTML is sanitized and cleaned.\n */\n cleanedHtmlOptions: HTMLCleaningOptionsSchema.optional(),\n});\n\n/**\n * Schema for tree options.\n * Defines options for building a site map tree.\n *\n * @property folderFirst - Whether to place folders before leaf nodes in the tree\n * @property linksOrder - How to order links within each folder\n */\nexport const treeOptionsSchema = z.object({\n /**\n * Whether to place folders before leaf nodes in the tree.\n * Default: true\n */\n folderFirst: z.preprocess(\n (val) => val !== 'false' && val !== false,\n z.boolean().optional(),\n ),\n /**\n * How to order links within each folder:\n * - 'page' preserve the original document order\n * - 'alphabetical' sort A→Z by URL\n * Default: 'page'\n */\n linksOrder: z.enum(['page', 'alphabetical']).optional(),\n\n /**\n * Whether to include extracted links for each node in the tree.\n * Default: true\n */\n extractedLinks: z.preprocess(\n (val) => val !== 'false' && val !== false,\n z.boolean().optional(),\n ),\n\n /**\n * Whether to exclude subdomain as root URL.\n * Default: true\n * e.g., if false: rootUrl: https://swr.vercel.app -> https://vercel.app\n */\n subdomainAsRootUrl: z.preprocess(\n (val) => val !== 'false' && val !== false,\n z.boolean().optional(),\n ),\n});\n\n/**\n * Schema for links route options.\n * Defines the configuration for a links operation.\n *\n * @property url - The URL to scrape\n * @property tree - Whether to build a site map tree\n * @property metadata - Whether to extract metadata from the page\n * @property cleanedHtml - Whether to return cleaned HTML\n * @property robots - Whether to fetch and parse robots.txt\n * @property sitemapXML - Whether to fetch and parse sitemap.xml\n * @property linksFromTarget - Whether to extract links from the target page\n * @property metadataOptions - Options for metadata extraction\n * @property linksOptions - Options for link extraction\n * @property cleanedHtmlOptions - Options for HTML cleaning\n * @property subdomainAsRootUrl - Whether to exclude subdomain as root URL\n *\n * @example\n * ```typescript\n * const options = {\n * url: \"https://example.com\",\n * tree: true,\n * metadata: true,\n * cleanedHtml: false,\n * };\n * ```\n */\nexport const linksOptionsSchema = z.object({\n /**\n * The URL to scrape.\n * Must be a valid URL string.\n */\n url: z.string(),\n\n /**\n * Whether to build a site map tree.\n * Default: true\n */\n // default true if not set\n tree: z.preprocess(\n (val) => val !== 'false' && val !== false,\n z.boolean().optional(),\n ),\n\n /**\n * Whether to extract metadata from the page.\n * Default: true\n */\n // default true if not set\n metadata: z.preprocess(\n (val) => val !== 'false' && val !== false,\n z.boolean().optional(),\n ),\n\n /**\n * Whether to return cleaned HTML.\n * Default: false\n */\n cleanedHtml: z.preprocess(\n (val) => val === 'true' || val === true,\n z.boolean().optional(),\n ),\n\n /**\n * Whether to fetch and parse robots.txt.\n * Default: false\n */\n robots: z.preprocess(\n (val) => val === 'true' || val === true,\n z.boolean().optional(),\n ),\n\n /**\n * Whether to fetch and parse sitemap.xml.\n * Default: false\n */\n sitemapXML: z.preprocess(\n (val) => val === 'true' || val === true,\n z.boolean().optional(),\n ),\n\n ...treeOptionsSchema.shape,\n\n ...contentOptionsSchema.shape,\n});\n\n/**\n * Type representing options for link scraping operations.\n * Derived from the linksOptionsSchema.\n */\nexport type LinksOptions = z.infer<typeof linksOptionsSchema>;\n\n/**\n * @name can be imported as LinksTree or Tree\n * @description Represents a node in the site map tree.\n * Each node contains information about a URL and its child pages.\n *\n * @property url - The URL of this node\n * @property rootUrl - The root URL of the website\n * @property name - The name of this node\n * @property totalUrls - Total number of URLs in the tree\n * @property executionTime - Execution time of the request in milliseconds\n * @property lastUpdated - ISO timestamp when this node was last updated\n * @property lastVisited - ISO timestamp when this URL was last visited\n * @property children - Child pages of this URL\n * @property error - Error message if there was an issue processing this URL\n * @property metadata - Metadata extracted from the page\n * @property cleanedHtml - Cleaned HTML content of the page\n * @property extractedLinks - Extracted links from the page\n * @property skippedUrls - URLs that were skipped during processing\n *\n * @example\n * ```typescript\n * const treeNode: LinksTree = {\n * url: \"https://example.com\",\n * rootUrl: \"https://example.com\",\n * name: \"example\",\n * totalUrls: 10,\n * executionTime: \"1234ms\",\n * lastUpdated: \"2025-04-02T14:28:23.000Z\",\n * lastVisited: \"2025-04-02T14:28:23.000Z\",\n * children: [\n * {\n * url: \"https://example.com/about\",\n * name: \"about\",\n * lastUpdated: \"2025-04-01T10:15:30.000Z\",\n * lastVisited: \"2025-04-02T14:28:25.000Z\"\n * }\n * ],\n * metadata: {\n * title: \"Example Website\",\n * description: \"This is an example website\"\n * },\n * extractedLinks: {\n * internal: [\n * 'https://example.com/about',\n * 'https://example.com/contact'\n * ],\n * external: [\n * 'https://othersite.com/reference',\n * 'https://api.example.org/data'\n * ],\n * media: {\n * images: [\n * 'https://example.com/images/logo.png',\n * 'https://example.com/images/banner.jpg'\n * ],\n * videos: [\n * 'https://example.com/videos/intro.mp4'\n * ],\n * documents: [\n * 'https://example.com/docs/whitepaper.pdf'\n * ]\n * },\n * skippedUrls: {\n * internal: [\n * { url: \"https://example.com/private\", reason: \"Blocked by robots.txt\" }\n * ],\n * external: [\n * { url: \"https://othersite.com\", reason: \"External domain\" }\n * ]\n * }\n * }\n * };\n * ```\n */\nexport interface LinksTree {\n /**\n * The URL of this node.\n */\n url: string;\n\n /**\n * The root URL of the website.\n * This is the domain root, not necessarily the targetUrl.\n */\n rootUrl?: string;\n\n /**\n * The name of this node.\n */\n name?: string;\n\n /**\n * Total number of URLs in the tree.\n */\n totalUrls?: number;\n\n /**\n * Execution time of the request in milliseconds.\n * Format: string with \"ms\" suffix (e.g., \"1234ms\").\n */\n executionTime?: string;\n\n /**\n * ISO timestamp when this node was last updated.\n * Format: ISO 8601 string.\n */\n lastUpdated: string;\n\n /**\n * ISO timestamp when this URL was last visited.\n * Format: ISO 8601 string or null if never visited.\n */\n lastVisited?: string | null;\n\n /**\n * Child pages of this URL.\n * Each child is another LinksTree node.\n */\n children?: LinksTree[];\n\n /**\n * Error message if there was an issue processing this URL.\n */\n error?: string;\n\n /**\n * Metadata extracted from the page.\n * Contains information like title, description, etc.\n */\n metadata?: PageMetadata;\n\n /**\n * Cleaned HTML content of the page.\n * Contains sanitized HTML with unnecessary elements removed.\n */\n cleanedHtml?: string;\n\n /**\n * Extracted links from the page.\n * Contains information about the current url's extracted links.\n */\n extractedLinks?: ExtractedLinks;\n\n /**\n * Skipped URLs and their reasons.\n * Contains information about URLs that were not processed.\n */\n skippedUrls?: SkippedLinks;\n}\n\n/**\n * Represents a URL that has been visited.\n * Used to track when URLs were last accessed.\n *\n * @property url - The URL that was visited\n * @property lastVisited - ISO timestamp when this URL was last visited\n */\nexport interface Visited {\n /**\n * The URL that was visited.\n */\n url: string;\n\n /**\n * ISO timestamp when this URL was last visited.\n * Format: ISO 8601 string or null if never visited.\n */\n lastVisited?: string | null;\n}\n\n/**\n * Represents a URL that was skipped during scraping.\n * Includes the reason why it was not processed.\n *\n * @property url - The URL that was skipped\n * @property reason - The reason why this URL was skipped\n *\n * @example\n * ```typescript\n * const skippedUrl: SkippedUrl = {\n * url: \"https://example.com/private\",\n * reason: \"Blocked by robots.txt\"\n * };\n * ```\n */\nexport interface SkippedUrl {\n /**\n * The URL that was skipped.\n */\n url: string;\n\n /**\n * The reason why this URL was skipped.\n * Examples: \"Blocked by robots.txt\", \"HTTP error\", etc.\n */\n reason: string;\n}\n\n/**\n * Categorized collection of skipped URLs.\n * Follows the same structure as ExtractedLinks for consistency.\n *\n * @property internal - Internal links that were skipped\n * @property external - External links that were skipped\n * @property media - Media links that were skipped\n * @property other - Other links that don't fit into the above categories\n *\n * @example\n * ```typescript\n * const skippedLinks: SkippedLinks = {\n * internal: [\n * { url: \"https://example.com/private\", reason: \"Blocked by robots.txt\" }\n * ],\n * external: [\n * { url: \"https://external.com\", reason: \"External domain\" }\n * ]\n * };\n * ```\n */\nexport interface SkippedLinks {\n /**\n * Internal links that were skipped.\n * These are links within the same domain.\n */\n internal?: SkippedUrl[];\n\n /**\n * External links that were skipped.\n * These are links to other domains.\n */\n external?: SkippedUrl[];\n\n /**\n * Media links that were skipped.\n * Categorized by media type.\n */\n media?: {\n /**\n * Image links that were skipped.\n */\n images?: SkippedUrl[];\n\n /**\n * Video links that were skipped.\n */\n videos?: SkippedUrl[];\n\n /**\n * Document links that were skipped.\n */\n documents?: SkippedUrl[];\n };\n\n /**\n * Other links that don't fit into the above categories.\n */\n other?: SkippedUrl[];\n}\n\n/**\n * Contains robots.txt and sitemap.xml content.\n *\n * @property robots - Content of the robots.txt file\n * @property sitemapXML - Content of the sitemap.xml file\n *\n * @example\n * ```typescript\n * const metaFiles: MetaFiles = {\n * robots: \"User-agent: *\\nDisallow: /private/\",\n * sitemapXML: \"<?xml version=\\\"1.0\\\"?><urlset>...</urlset>\"\n * };\n * ```\n */\nexport interface MetaFiles {\n /**\n * Content of the robots.txt file.\n */\n robots?: string;\n\n /**\n * Content of the sitemap.xml file.\n */\n sitemapXML?: string;\n}\n\n/**\n * Base interface for links POST route responses.\n * Contains common properties shared by both success and error responses.\n *\n * @property targetUrl - The URL that was requested to be scraped\n * @property timestamp - ISO timestamp when the request was processed\n */\ninterface LinksPostResponseBase {\n /**\n * Whether the operation was successful.\n * Will always be true for successful responses.\n */\n success: boolean;\n\n /**\n * The URL that was requested to be scraped.\n */\n targetUrl: string;\n\n /**\n * ISO timestamp when the request was processed.\n * Format: ISO 8601 string.\n */\n timestamp: string;\n}\n\n/**\n * Represents a successful links POST route response.\n * Contains the scraped data and related information.\n *\n * @property status - Status indicator for a successful response\n * @property targetUrl - The URL that was requested to be scraped\n * @property timestamp - ISO timestamp when the request was processed\n * @property executionTime - Execution time of the request in milliseconds\n * @property ancestors - Array of parent URLs leading to this URL\n * @property skippedUrls - URLs that were skipped during processing\n * @property tree - Site map tree starting from the root URL\n *\n * @example\n * ```typescript\n * const successResponse: LinksPostSuccessResponse = {\n * status: \"success\",\n * targetUrl: \"https://example.com\",\n * timestamp: \"2025-04-02T14:28:23.000Z\",\n * executionTime: \"1234ms\",\n * ancestors: [\"https://example.com\", \"https://example.com/about\"],\n * skippedUrls: {\n * internal: [\"https://example.com/private\"],\n * external: [\"https://othersite.com/reference\"],\n * media: {\n * images: [\"https://example.com/images/logo.png\"],\n * videos: [\"https://example.com/videos/intro.mp4\"],\n * documents: [\"https://example.com/docs/whitepaper.pdf\"]\n * }\n * },\n * tree: {\n * data: {\n * url: \"https://example.com\",\n * lastUpdated: \"2025-04-02T14:28:23.000Z\",\n * children: [...]\n * }\n * }\n * };\n * ```\n */\nexport interface LinksPostSuccessResponse\n extends LinksPostResponseBase,\n Omit<Partial<ScrapedData>, 'rawHtml'> {\n /**\n * Whether the operation was successful.\n * Will always be true for successful responses.\n */\n success: true;\n\n /**\n * @deprecated Use `success` instead.\n * Status indicator for a successful response.\n * Will always be 'success' for successful responses.\n */\n status?: 'success';\n\n /**\n * Execution time of the request in milliseconds.\n * Format: string with \"ms\" suffix (e.g., \"1234ms\").\n */\n executionTime?: string;\n\n /**\n * Array of parent URLs leading to this URL.\n * Represents the path in the site hierarchy.\n */\n ancestors?: string[];\n\n /**\n * URLs that were skipped during processing.\n * Includes reasons why they were skipped.\n */\n skippedUrls?: SkippedLinks;\n\n /**\n * Extracted links from the page.\n * Categorized by type (internal, external, media).\n */\n extractedLinks?: ExtractedLinks;\n\n /**\n * Site map tree starting from the root URL.\n * Only included if tree generation was requested.\n */\n tree?: LinksTree | null;\n}\n\n/**\n * Represents an error response from a links POST route.\n * Contains information about what went wrong.\n *\n * @property status - Status indicator for an error response\n * @property targetUrl - The URL that was requested to be scraped\n * @property timestamp - ISO timestamp when the request was processed\n * @property error - Error message describing what went wrong\n * @property tree - Partial site map tree if available\n *\n * @example\n * ```typescript\n * const errorResponse: LinksPostErrorResponse = {\n * status: \"error\",\n * targetUrl: \"https://example.com\",\n * timestamp: \"2025-04-02T14:28:23.000Z\",\n * error: \"Failed to connect to the server\"\n * };\n * ```\n */\nexport interface LinksPostErrorResponse extends LinksPostResponseBase {\n /**\n * Whether the operation was successful.\n * Will always be false for error responses.\n */\n success: false;\n\n /**\n * @deprecated Use `success` instead.\n * Status indicator for an error response.\n * Will always be 'error' for error responses.\n */\n status?: 'error';\n\n /**\n * Error message describing what went wrong.\n * Provides details about the failure reason.\n */\n error: string;\n\n /**\n * Partial site map tree if available.\n * May contain data collected before the error occurred.\n */\n tree?: LinksTree | null;\n}\n\n/**\n * Union type representing either a successful or failed link scraping operation.\n * Uses a discriminated union pattern with the 'status' property as the discriminator.\n *\n * @example\n * ```typescript\n * function handleResponse(response: LinksPostResponse) {\n * if (response.status === 'success') {\n * // TypeScript knows this is a LinksPostSuccessResponse\n * console.log(response.metadata?.title);\n * } else {\n * // TypeScript knows this is a LinksPostErrorResponse\n * console.error(response.error);\n * }\n * }\n * ```\n */\nexport type LinksPostResponse =\n | LinksPostSuccessResponse\n | LinksPostErrorResponse;\n"],"mappings":";;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAUA,SAAS,SAAS;AAUX,IAAM,uBAAuB,EAAE,OAAO;AAAA;AAAA;AAAA;AAAA;AAAA,EAK3C,iBAAiB,sBAAsB,SAAS;AAAA;AAAA;AAAA;AAAA;AAAA,EAMhD,cAAc,4BAA4B,SAAS;AAAA;AAAA;AAAA;AAAA;AAAA,EAMnD,oBAAoB,0BAA0B,SAAS;AACzD,CAAC;AASM,IAAM,oBAAoB,EAAE,OAAO;AAAA;AAAA;AAAA;AAAA;AAAA,EAKxC,aAAa,EAAE;AAAA,IACb,CAAC,QAAQ,QAAQ,WAAW,QAAQ;AAAA,IACpC,EAAE,QAAQ,EAAE,SAAS;AAAA,EACvB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAOA,YAAY,EAAE,KAAK,CAAC,QAAQ,cAAc,CAAC,EAAE,SAAS;AAAA;AAAA;AAAA;AAAA;AAAA,EAMtD,gBAAgB,EAAE;AAAA,IAChB,CAAC,QAAQ,QAAQ,WAAW,QAAQ;AAAA,IACpC,EAAE,QAAQ,EAAE,SAAS;AAAA,EACvB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAOA,oBAAoB,EAAE;AAAA,IACpB,CAAC,QAAQ,QAAQ,WAAW,QAAQ;AAAA,IACpC,EAAE,QAAQ,EAAE,SAAS;AAAA,EACvB;AACF,CAAC;AA4BM,IAAM,qBAAqB,EAAE,OAAO;AAAA;AAAA;AAAA;AAAA;AAAA,EAKzC,KAAK,EAAE,OAAO;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAOd,MAAM,EAAE;AAAA,IACN,CAAC,QAAQ,QAAQ,WAAW,QAAQ;AAAA,IACpC,EAAE,QAAQ,EAAE,SAAS;AAAA,EACvB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAOA,UAAU,EAAE;AAAA,IACV,CAAC,QAAQ,QAAQ,WAAW,QAAQ;AAAA,IACpC,EAAE,QAAQ,EAAE,SAAS;AAAA,EACvB;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,aAAa,EAAE;AAAA,IACb,CAAC,QAAQ,QAAQ,UAAU,QAAQ;AAAA,IACnC,EAAE,QAAQ,EAAE,SAAS;AAAA,EACvB;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,QAAQ,EAAE;AAAA,IACR,CAAC,QAAQ,QAAQ,UAAU,QAAQ;AAAA,IACnC,EAAE,QAAQ,EAAE,SAAS;AAAA,EACvB;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,YAAY,EAAE;AAAA,IACZ,CAAC,QAAQ,QAAQ,UAAU,QAAQ;AAAA,IACnC,EAAE,QAAQ,EAAE,SAAS;AAAA,EACvB;AAAA,EAEA,GAAG,kBAAkB;AAAA,EAErB,GAAG,qBAAqB;AAC1B,CAAC;","names":[]}