UNPKG

@llml-browser/types

Version:

TypeScript types and schemas for the @llml-browser API

455 lines (446 loc) 15.8 kB
"use strict"; var __create = Object.create; var __defProp = Object.defineProperty; var __getOwnPropDesc = Object.getOwnPropertyDescriptor; var __getOwnPropNames = Object.getOwnPropertyNames; var __getProtoOf = Object.getPrototypeOf; var __hasOwnProp = Object.prototype.hasOwnProperty; var __export = (target, all) => { for (var name in all) __defProp(target, name, { get: all[name], enumerable: true }); }; var __copyProps = (to, from, except, desc) => { if (from && typeof from === "object" || typeof from === "function") { for (let key of __getOwnPropNames(from)) if (!__hasOwnProp.call(to, key) && key !== except) __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable }); } return to; }; var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps( // If the importer is in node compatibility mode or this is not an ESM // file that has been converted to a CommonJS file using a Babel- // compatible transform (i.e. "__esModule" has not been set), then set // "default" to the CommonJS "module.exports" for node compatibility. isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target, mod )); var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod); // src/index.ts var src_exports = {}; __export(src_exports, { BrowseOptionsSchema: () => BrowseOptionsSchema, BrowseRouterTypes: () => types_exports5, CheerioTypes: () => types_exports, DataFormatsEnum: () => DataFormatsEnum, ElementPatternSchema: () => ElementPatternSchema, ExtractedLinksSchema: () => ExtractedLinksSchema, HTMLCleaningMetricsSchema: () => HTMLCleaningMetricsSchema, HTMLCleaningOptionsSchema: () => HTMLCleaningOptionsSchema, HTMLCleaningResultSchema: () => HTMLCleaningResultSchema, HTMLCleaningTypes: () => types_exports2, LinkExtractionOptionsSchema: () => LinkExtractionOptionsSchema, LinkServiceTypes: () => types_exports3, LinksRouterTypes: () => types_exports6, MetadataOptionsSchema: () => MetadataOptionsSchema, MetadataTypes: () => types_exports4, PageMetadataSchema: () => PageMetadataSchema, Routers: () => Routers, Services: () => Services, contentOptionsSchema: () => contentOptionsSchema, linksOptionsSchema: () => linksOptionsSchema, readOptionsSchema: () => readOptionsSchema, treeOptionsSchema: () => treeOptionsSchema }); module.exports = __toCommonJS(src_exports); // src/services/cheerio/types.ts var types_exports = {}; // src/services/html-cleaning/types.ts var types_exports2 = {}; __export(types_exports2, { ElementPatternSchema: () => ElementPatternSchema, HTMLCleaningMetricsSchema: () => HTMLCleaningMetricsSchema, HTMLCleaningOptionsSchema: () => HTMLCleaningOptionsSchema, HTMLCleaningResultSchema: () => HTMLCleaningResultSchema }); var import_zod = require("zod"); var HTMLCleaningOptionsSchema = import_zod.z.object({ allowedHTMLTags: import_zod.z.array(import_zod.z.string()).optional(), disallowedHTMLTags: import_zod.z.array(import_zod.z.string()).optional(), extractMainContent: import_zod.z.boolean().optional().default(true), /* Deprecated property, will be removed in future. add baseUrl to HTMLCleaning as a required parameter instead*/ documentBaseUrl: import_zod.z.string().optional(), removeBase64Images: import_zod.z.boolean().optional().default(true) }).strict(); var ElementPatternSchema = import_zod.z.object({ tag: import_zod.z.union([import_zod.z.string(), import_zod.z.instanceof(RegExp)]).optional(), attributes: import_zod.z.array( import_zod.z.object({ name: import_zod.z.union([import_zod.z.string(), import_zod.z.instanceof(RegExp)]), value: import_zod.z.union([import_zod.z.string(), import_zod.z.instanceof(RegExp)]).optional() }).strict() ).optional(), classNames: import_zod.z.array(import_zod.z.union([import_zod.z.string(), import_zod.z.instanceof(RegExp)])).optional(), ids: import_zod.z.array(import_zod.z.union([import_zod.z.string(), import_zod.z.instanceof(RegExp)])).optional() }).strict(); var HTMLCleaningMetricsSchema = import_zod.z.object({ inputSize: import_zod.z.number(), outputSize: import_zod.z.number(), compressionRatio: import_zod.z.number() }); var HTMLCleaningResultSchema = import_zod.z.object({ cleanedHtml: import_zod.z.string(), metrics: HTMLCleaningMetricsSchema.optional() }); // src/services/link/types.ts var types_exports3 = {}; __export(types_exports3, { ExtractedLinksSchema: () => ExtractedLinksSchema, LinkExtractionOptionsSchema: () => LinkExtractionOptionsSchema }); var import_zod2 = require("zod"); var LinkExtractionOptionsSchema = import_zod2.z.object({ includeExternal: import_zod2.z.boolean().optional(), // Include links from other domains includeMedia: import_zod2.z.boolean().optional(), // Include media files (images, videos, docs) excludePatterns: import_zod2.z.array(import_zod2.z.string()).optional(), // Regex patterns to exclude URLs removeQueryParams: import_zod2.z.boolean().optional() // Remove query parameters from URLs }).strict(); var ExtractedLinksSchema = import_zod2.z.object({ internal: import_zod2.z.array(import_zod2.z.string()).optional(), external: import_zod2.z.array(import_zod2.z.string()).optional(), media: import_zod2.z.object({ images: import_zod2.z.array(import_zod2.z.string()).optional(), videos: import_zod2.z.array(import_zod2.z.string()).optional(), documents: import_zod2.z.array(import_zod2.z.string()).optional() }).optional() }); // src/services/metadata/types.ts var types_exports4 = {}; __export(types_exports4, { MetadataOptionsSchema: () => MetadataOptionsSchema, PageMetadataSchema: () => PageMetadataSchema }); var import_zod3 = require("zod"); var MetadataOptionsSchema = import_zod3.z.object({ title: import_zod3.z.boolean().optional().default(true), description: import_zod3.z.boolean().optional().default(true), language: import_zod3.z.boolean().optional().default(true), canonical: import_zod3.z.boolean().optional().default(true), robots: import_zod3.z.boolean().optional().default(true), author: import_zod3.z.boolean().optional().default(true), keywords: import_zod3.z.boolean().optional().default(true), favicon: import_zod3.z.boolean().optional().default(true), openGraph: import_zod3.z.boolean().optional().default(true), twitter: import_zod3.z.boolean().optional().default(true), isIframeAllowed: import_zod3.z.boolean().optional().default(true) }); var PageMetadataSchema = import_zod3.z.object({ // Basic metadata title: import_zod3.z.string().optional(), description: import_zod3.z.string().optional(), language: import_zod3.z.string().optional(), canonical: import_zod3.z.string().url().optional(), robots: import_zod3.z.string().optional(), author: import_zod3.z.string().optional(), keywords: import_zod3.z.array(import_zod3.z.string()).optional(), lastModified: import_zod3.z.string().optional().nullable(), favicon: import_zod3.z.string().url().optional(), // OpenGraph metadata (flattened) ogTitle: import_zod3.z.string().optional(), ogDescription: import_zod3.z.string().optional(), ogImage: import_zod3.z.string().url().optional(), ogUrl: import_zod3.z.string().url().optional(), ogType: import_zod3.z.string().optional(), ogSiteName: import_zod3.z.string().optional(), // Twitter Card metadata (flattened) twitterCard: import_zod3.z.string().optional(), twitterSite: import_zod3.z.string().optional(), twitterCreator: import_zod3.z.string().optional(), twitterTitle: import_zod3.z.string().optional(), twitterDescription: import_zod3.z.string().optional(), twitterImage: import_zod3.z.string().url().optional(), // iframe allowed isIframeAllowed: import_zod3.z.boolean().optional() }); // src/routers/browse/types.ts var types_exports5 = {}; __export(types_exports5, { BrowseOptionsSchema: () => BrowseOptionsSchema, DataFormatsEnum: () => DataFormatsEnum }); var import_zod4 = require("zod"); var DataFormatsSchema = import_zod4.z.object({ /** * Markdown representation of the page content. * Contains the page content converted to Markdown format. */ markdown: import_zod4.z.string().optional(), /** * Raw HTML of the page as returned by the server. * Contains the unmodified HTML response from the target URL. */ rawHtml: import_zod4.z.string().optional(), /** * Cleaned HTML with unnecessary elements removed. * Contains a sanitized version of the HTML with ads, scripts, and other non-content elements removed. */ cleanedHtml: import_zod4.z.string().optional(), /** * Extracted links from the page. * Contains information about links found on the page. */ links: ExtractedLinksSchema.optional(), /** * Metadata extracted from the page. * Contains information like title, description, and other meta tags. */ metadata: PageMetadataSchema.optional() }); var DataFormatsEnum = import_zod4.z.enum([ "markdown", "rawHtml", "cleanedHtml", "links", "metadata" ]); var BrowseOptionsSchema = import_zod4.z.object({ /** * Array of data formats to include in the response. * If not specified, defaults to ['markdown', 'metadata']. * * @example * ```typescript * const options = { * formats: ['markdown', 'links', 'metadata'] * }; * ``` */ formats: import_zod4.z.array(DataFormatsEnum).optional().default(["markdown", "metadata"]), /** * Options for metadata extraction. * If not specified, defaults to the default metadata options. */ metadataOptions: MetadataOptionsSchema.optional(), /** * Options for link extraction. * If not specified, defaults to the default link extraction options. */ linksOptions: LinkExtractionOptionsSchema.optional(), /** * Options for HTML cleaning. * If not specified, defaults to the default HTML cleaning options. */ cleanedHtmlOptions: HTMLCleaningOptionsSchema.optional() }); // src/routers/links/types.ts var types_exports6 = {}; __export(types_exports6, { contentOptionsSchema: () => contentOptionsSchema, linksOptionsSchema: () => linksOptionsSchema, treeOptionsSchema: () => treeOptionsSchema }); var import_zod5 = require("zod"); var contentOptionsSchema = import_zod5.z.object({ /** * Options for metadata extraction. * Controls how metadata like title, description, etc. are extracted. */ metadataOptions: MetadataOptionsSchema.optional(), /** * Options for link extraction. * Controls how links are extracted and categorized. */ linksOptions: LinkExtractionOptionsSchema.optional(), /** * Options for HTML cleaning. * Controls how HTML is sanitized and cleaned. */ cleanedHtmlOptions: HTMLCleaningOptionsSchema.optional() }); var treeOptionsSchema = import_zod5.z.object({ /** * Whether to place folders before leaf nodes in the tree. * Default: true */ folderFirst: import_zod5.z.preprocess( (val) => val !== "false" && val !== false, import_zod5.z.boolean().optional() ), /** * How to order links within each folder: * - 'page' preserve the original document order * - 'alphabetical' sort A→Z by URL * Default: 'page' */ linksOrder: import_zod5.z.enum(["page", "alphabetical"]).optional(), /** * Whether to include extracted links for each node in the tree. * Default: true */ extractedLinks: import_zod5.z.preprocess( (val) => val !== "false" && val !== false, import_zod5.z.boolean().optional() ), /** * Whether to exclude subdomain as root URL. * Default: true * e.g., if false: rootUrl: https://swr.vercel.app -> https://vercel.app */ subdomainAsRootUrl: import_zod5.z.preprocess( (val) => val !== "false" && val !== false, import_zod5.z.boolean().optional() ) }); var linksOptionsSchema = import_zod5.z.object({ /** * The URL to scrape. * Must be a valid URL string. */ url: import_zod5.z.string(), /** * Whether to build a site map tree. * Default: true */ // default true if not set tree: import_zod5.z.preprocess( (val) => val !== "false" && val !== false, import_zod5.z.boolean().optional() ), /** * Whether to extract metadata from the page. * Default: true */ // default true if not set metadata: import_zod5.z.preprocess( (val) => val !== "false" && val !== false, import_zod5.z.boolean().optional() ), /** * Whether to return cleaned HTML. * Default: false */ cleanedHtml: import_zod5.z.preprocess( (val) => val === "true" || val === true, import_zod5.z.boolean().optional() ), /** * Whether to fetch and parse robots.txt. * Default: false */ robots: import_zod5.z.preprocess( (val) => val === "true" || val === true, import_zod5.z.boolean().optional() ), /** * Whether to fetch and parse sitemap.xml. * Default: false */ sitemapXML: import_zod5.z.preprocess( (val) => val === "true" || val === true, import_zod5.z.boolean().optional() ), ...treeOptionsSchema.shape, ...contentOptionsSchema.shape }); // src/routers/read/types.ts var import_zod6 = __toESM(require("zod")); var readOptionsSchema = import_zod6.default.object({ /** * The URL to scrape. * Must be a valid URL string. */ url: import_zod6.default.string(), /** * Whether to extract metadata from the page. * Default: true */ // default true if not set metadata: import_zod6.default.preprocess( (val) => val !== "false" && val !== false, import_zod6.default.boolean().optional() ), /** * Whether to extract markdown from the page. * Default: true */ // default true if not set markdown: import_zod6.default.preprocess( (val) => val !== "false" && val !== false, import_zod6.default.boolean().optional() ), /** * Whether to return cleaned HTML. * Default: true */ cleanedHtml: import_zod6.default.preprocess( (val) => val !== "false" && val !== false, import_zod6.default.boolean().optional() ), /** * Whether to fetch and parse robots.txt. * Default: false */ robots: import_zod6.default.preprocess( (val) => val === "true" || val === true, import_zod6.default.boolean().optional() ), /** * Whether to return raw HTML. * Default: false */ rawHtml: import_zod6.default.preprocess( (val) => val !== "false" && val !== false, import_zod6.default.boolean().optional() ), /** * Options for metadata extraction. * Controls how metadata like title, description, etc. are extracted. */ metadataOptions: MetadataOptionsSchema.optional() /** DEPRECATED: AS WE ARE NOT USING HTMLREWRITE FOR CLEANING THE HTML FOR NOW, MAY BE REUSED THIS IN THE FUTURE * Options for HTML cleaning. * Controls how HTML is sanitized and cleaned. */ // cleanedHtmlOptions: HTMLCleaningOptionsSchema.optional(), }); // src/index.ts var Services = { Cheerio: types_exports, HTMLCleaning: types_exports2, Link: types_exports3, Metadata: types_exports4 }; var Routers = { Links: types_exports6, Browse: types_exports5 }; // Annotate the CommonJS export names for ESM import in node: 0 && (module.exports = { BrowseOptionsSchema, BrowseRouterTypes, CheerioTypes, DataFormatsEnum, ElementPatternSchema, ExtractedLinksSchema, HTMLCleaningMetricsSchema, HTMLCleaningOptionsSchema, HTMLCleaningResultSchema, HTMLCleaningTypes, LinkExtractionOptionsSchema, LinkServiceTypes, LinksRouterTypes, MetadataOptionsSchema, MetadataTypes, PageMetadataSchema, Routers, Services, contentOptionsSchema, linksOptionsSchema, readOptionsSchema, treeOptionsSchema }); //# sourceMappingURL=index.js.map