/*
 * @llml-browser/types
 * Version: (not given in this listing)
 * TypeScript types and schemas for the @llml-browser API.
 * Listing details: JavaScript, 455 lines (446 loc), 15.8 kB.
 */
;
// Short aliases for the Object built-ins used by the module-interop helpers
// defined right after this block.
var {
  create: __create,
  defineProperty: __defProp,
  getOwnPropertyDescriptor: __getOwnPropDesc,
  getOwnPropertyNames: __getOwnPropNames,
  getPrototypeOf: __getProtoOf
} = Object;
var { hasOwnProperty: __hasOwnProp } = Object.prototype;
/**
 * Installs every enumerable key of `all` on `target` as an enumerable,
 * getter-only property.
 *
 * @param {object} target - Object that receives the properties.
 * @param {Object<string, Function>} all - Map from export name to a function
 *   returning the exported value; it is used directly as the property getter,
 *   so the value is looked up on each access rather than at registration time.
 */
var __export = (target, all) => {
  for (var exportName in all) {
    var descriptor = { get: all[exportName], enumerable: true };
    __defProp(target, exportName, descriptor);
  }
};
/**
 * Re-exposes the own properties of `from` on `to` through getters that read
 * from `from` on every access.
 *
 * Keys that `to` already owns, and the key named by `except`, are skipped.
 * Each new property copies the enumerability of the source property.
 *
 * @param {object} to - Destination object; also the return value.
 * @param {*} from - Source; anything other than a non-null object or a
 *   function is ignored.
 * @param {string} [except] - A single key to leave out.
 * @param {*} [desc] - Scratch slot for the current source descriptor.
 * @returns {object} `to`.
 */
var __copyProps = (to, from, except, desc) => {
  var hasProps = typeof from === "function" || (typeof from === "object" && from !== null);
  if (!hasProps) {
    return to;
  }
  for (let key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable
    });
  }
  return to;
};
/**
 * Wraps a required module in an ESM-style namespace object.
 *
 * @param {*} mod - The value returned by `require`.
 * @param {*} [isNodeMode] - Truthy when the importer is in node compatibility mode.
 * @param {*} [target] - Scratch slot for the namespace being built.
 * @returns {object} A new object whose prototype is `mod`'s prototype (or a
 *   plain object when `mod` is null/undefined) with `mod`'s own properties
 *   re-exposed through getters.
 */
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  // In node compatibility mode, or when the module was not produced by a
  // Babel-compatible ESM-to-CommonJS transform ("__esModule" is not set),
  // "default" is set to the CommonJS "module.exports" itself.
  if (isNodeMode || !mod || !mod.__esModule) {
    __defProp(target, "default", { value: mod, enumerable: true });
  }
  return __copyProps(target, mod);
};
/**
 * Builds the object published as `module.exports`: a fresh object carrying a
 * non-enumerable `__esModule: true` marker, with `mod`'s own properties
 * re-exposed through getters.
 *
 * @param {object} mod - The export namespace to publish.
 * @returns {object} The new CommonJS exports object.
 */
var __toCommonJS = (mod) => {
  var marked = __defProp({}, "__esModule", { value: true });
  return __copyProps(marked, mod);
};
// src/index.ts
// Export namespace for the package entry point. __export installs each entry
// below as an enumerable getter, so the schema variables (which are assigned
// further down in this file) are only read when a property is accessed.
var src_exports = {};
__export(src_exports, {
BrowseOptionsSchema: () => BrowseOptionsSchema,
BrowseRouterTypes: () => types_exports5,
CheerioTypes: () => types_exports,
DataFormatsEnum: () => DataFormatsEnum,
ElementPatternSchema: () => ElementPatternSchema,
ExtractedLinksSchema: () => ExtractedLinksSchema,
HTMLCleaningMetricsSchema: () => HTMLCleaningMetricsSchema,
HTMLCleaningOptionsSchema: () => HTMLCleaningOptionsSchema,
HTMLCleaningResultSchema: () => HTMLCleaningResultSchema,
HTMLCleaningTypes: () => types_exports2,
LinkExtractionOptionsSchema: () => LinkExtractionOptionsSchema,
LinkServiceTypes: () => types_exports3,
LinksRouterTypes: () => types_exports6,
MetadataOptionsSchema: () => MetadataOptionsSchema,
MetadataTypes: () => types_exports4,
PageMetadataSchema: () => PageMetadataSchema,
Routers: () => Routers,
Services: () => Services,
contentOptionsSchema: () => contentOptionsSchema,
linksOptionsSchema: () => linksOptionsSchema,
readOptionsSchema: () => readOptionsSchema,
treeOptionsSchema: () => treeOptionsSchema
});
// Publish the namespace as the CommonJS export. __toCommonJS returns a fresh
// object marked with __esModule whose properties forward to src_exports, so
// this assignment does not yet read any of the schema variables.
module.exports = __toCommonJS(src_exports);
// src/services/cheerio/types.ts
// Namespace object for the cheerio types module. Nothing is registered on it
// in this file, so it stays an empty object at runtime.
var types_exports = {};
// src/services/html-cleaning/types.ts
// Namespace object for the HTML-cleaning types module; each schema defined
// below is exposed on it through a lazy getter.
var types_exports2 = {};
__export(types_exports2, {
  ElementPatternSchema: () => ElementPatternSchema,
  HTMLCleaningMetricsSchema: () => HTMLCleaningMetricsSchema,
  HTMLCleaningOptionsSchema: () => HTMLCleaningOptionsSchema,
  HTMLCleaningResultSchema: () => HTMLCleaningResultSchema
});
var import_zod = require("zod");

// Builders for the sub-schemas that recur in this module. Every call returns
// a fresh schema instance, so no schema object is shared between fields.
var htmlCleaningTagList = () => import_zod.z.array(import_zod.z.string()).optional();
var htmlCleaningFlagOn = () => import_zod.z.boolean().optional().default(true);
var htmlCleaningMatcher = () => import_zod.z.union([
  import_zod.z.string(),
  import_zod.z.instanceof(RegExp)
]);

/** Options for HTML cleaning. Declared strict, so keys not listed here fail validation. */
var HTMLCleaningOptionsSchema = import_zod.z.object({
  allowedHTMLTags: htmlCleaningTagList(),
  disallowedHTMLTags: htmlCleaningTagList(),
  extractMainContent: htmlCleaningFlagOn(),
  // Deprecated; will be removed in the future. Pass baseUrl to HTMLCleaning
  // as a required parameter instead.
  documentBaseUrl: import_zod.z.string().optional(),
  removeBase64Images: htmlCleaningFlagOn()
}).strict();

/**
 * Pattern describing an element by tag, attributes, class names, or ids.
 * Each matcher is either a string or a RegExp instance.
 */
var ElementPatternSchema = import_zod.z.object({
  tag: htmlCleaningMatcher().optional(),
  attributes: import_zod.z.array(
    import_zod.z.object({
      name: htmlCleaningMatcher(),
      value: htmlCleaningMatcher().optional()
    }).strict()
  ).optional(),
  classNames: import_zod.z.array(htmlCleaningMatcher()).optional(),
  ids: import_zod.z.array(htmlCleaningMatcher()).optional()
}).strict();

/** Numeric metrics reported for a cleaning run. */
var HTMLCleaningMetricsSchema = import_zod.z.object({
  inputSize: import_zod.z.number(),
  outputSize: import_zod.z.number(),
  compressionRatio: import_zod.z.number()
});

/** Result of a cleaning run: the cleaned HTML plus optional metrics. */
var HTMLCleaningResultSchema = import_zod.z.object({
  cleanedHtml: import_zod.z.string(),
  metrics: HTMLCleaningMetricsSchema.optional()
});
// src/services/link/types.ts
// Namespace object for the link-service types module; the two schemas below
// are exposed on it through lazy getters.
var types_exports3 = {};
__export(types_exports3, {
  ExtractedLinksSchema: () => ExtractedLinksSchema,
  LinkExtractionOptionsSchema: () => LinkExtractionOptionsSchema
});
var import_zod2 = require("zod");

// Builders for the two field shapes used throughout this module; each call
// returns a fresh schema instance.
var linkOptionalFlag = () => import_zod2.z.boolean().optional();
var linkOptionalStrings = () => import_zod2.z.array(import_zod2.z.string()).optional();

/** Options for link extraction. Declared strict, so keys not listed here fail validation. */
var LinkExtractionOptionsSchema = import_zod2.z.object({
  // Include links from other domains.
  includeExternal: linkOptionalFlag(),
  // Include media files (images, videos, docs).
  includeMedia: linkOptionalFlag(),
  // Regex patterns to exclude URLs.
  excludePatterns: linkOptionalStrings(),
  // Remove query parameters from URLs.
  removeQueryParams: linkOptionalFlag()
}).strict();

/** Extracted links grouped as internal, external, and media; every group is optional. */
var ExtractedLinksSchema = import_zod2.z.object({
  internal: linkOptionalStrings(),
  external: linkOptionalStrings(),
  media: import_zod2.z.object({
    images: linkOptionalStrings(),
    videos: linkOptionalStrings(),
    documents: linkOptionalStrings()
  }).optional()
});
// src/services/metadata/types.ts
// Namespace object for the metadata types module; the two schemas below are
// exposed on it through lazy getters.
var types_exports4 = {};
__export(types_exports4, {
  MetadataOptionsSchema: () => MetadataOptionsSchema,
  PageMetadataSchema: () => PageMetadataSchema
});
var import_zod3 = require("zod");

// Builders for the field shapes repeated in this module; each call returns a
// fresh schema instance.
var metadataToggle = () => import_zod3.z.boolean().optional().default(true);
var metadataText = () => import_zod3.z.string().optional();
var metadataUrl = () => import_zod3.z.string().url().optional();

/** Per-field switches for metadata extraction; every switch defaults to true. */
var MetadataOptionsSchema = import_zod3.z.object({
  title: metadataToggle(),
  description: metadataToggle(),
  language: metadataToggle(),
  canonical: metadataToggle(),
  robots: metadataToggle(),
  author: metadataToggle(),
  keywords: metadataToggle(),
  favicon: metadataToggle(),
  openGraph: metadataToggle(),
  twitter: metadataToggle(),
  isIframeAllowed: metadataToggle()
});

/** Metadata extracted from a page. Every field is optional. */
var PageMetadataSchema = import_zod3.z.object({
  // Basic metadata
  title: metadataText(),
  description: metadataText(),
  language: metadataText(),
  canonical: metadataUrl(),
  robots: metadataText(),
  author: metadataText(),
  keywords: import_zod3.z.array(import_zod3.z.string()).optional(),
  // The only field that also accepts null.
  lastModified: import_zod3.z.string().optional().nullable(),
  favicon: metadataUrl(),
  // OpenGraph metadata (flattened)
  ogTitle: metadataText(),
  ogDescription: metadataText(),
  ogImage: metadataUrl(),
  ogUrl: metadataUrl(),
  ogType: metadataText(),
  ogSiteName: metadataText(),
  // Twitter Card metadata (flattened)
  twitterCard: metadataText(),
  twitterSite: metadataText(),
  twitterCreator: metadataText(),
  twitterTitle: metadataText(),
  twitterDescription: metadataText(),
  twitterImage: metadataUrl(),
  // iframe allowed
  isIframeAllowed: import_zod3.z.boolean().optional()
});
// src/routers/browse/types.ts
// Namespace object for the browse-router types module; the options schema and
// the formats enum are exposed on it through lazy getters.
var types_exports5 = {};
__export(types_exports5, {
  BrowseOptionsSchema: () => BrowseOptionsSchema,
  DataFormatsEnum: () => DataFormatsEnum
});
var import_zod4 = require("zod");

// Builder for the optional-string fields of DataFormatsSchema; each call
// returns a fresh schema instance.
var browseOptionalText = () => import_zod4.z.string().optional();

/** Names of the data formats a browse request can ask for. */
var DataFormatsEnum = import_zod4.z.enum([
  "markdown",
  "rawHtml",
  "cleanedHtml",
  "links",
  "metadata"
]);

/**
 * Payload shape with one optional entry per data format.
 * Not part of the module's exports.
 */
var DataFormatsSchema = import_zod4.z.object({
  /** The page content converted to Markdown. */
  markdown: browseOptionalText(),
  /** The unmodified HTML response from the target URL. */
  rawHtml: browseOptionalText(),
  /** Sanitized HTML with ads, scripts, and other non-content elements removed. */
  cleanedHtml: browseOptionalText(),
  /** Information about the links found on the page. */
  links: ExtractedLinksSchema.optional(),
  /** Title, description, and other meta-tag information from the page. */
  metadata: PageMetadataSchema.optional()
});

/** Options accepted by the browse router. */
var BrowseOptionsSchema = import_zod4.z.object({
  /**
   * Data formats to include in the response.
   * Falls back to ['markdown', 'metadata'] when omitted.
   *
   * @example
   * ```typescript
   * const options = {
   *   formats: ['markdown', 'links', 'metadata']
   * };
   * ```
   */
  formats: import_zod4.z.array(DataFormatsEnum).optional().default(["markdown", "metadata"]),
  /** Metadata extraction options; optional. */
  metadataOptions: MetadataOptionsSchema.optional(),
  /** Link extraction options; optional. */
  linksOptions: LinkExtractionOptionsSchema.optional(),
  /** HTML cleaning options; optional. */
  cleanedHtmlOptions: HTMLCleaningOptionsSchema.optional()
});
// src/routers/links/types.ts
// Namespace object for the links-router types module; the three schemas below
// are exposed on it through lazy getters.
var types_exports6 = {};
__export(types_exports6, {
  contentOptionsSchema: () => contentOptionsSchema,
  linksOptionsSchema: () => linksOptionsSchema,
  treeOptionsSchema: () => treeOptionsSchema
});
var import_zod5 = require("zod");

// Boolean flag that is true unless the raw value is exactly "false" or false.
// The coercion runs before validation, so an omitted value parses to true.
var linksFlagDefaultTrue = () => import_zod5.z.preprocess(
  (raw) => !(raw === "false" || raw === false),
  import_zod5.z.boolean().optional()
);
// Boolean flag that is true only when the raw value is exactly "true" or true.
// The coercion runs before validation, so an omitted value parses to false.
var linksFlagDefaultFalse = () => import_zod5.z.preprocess(
  (raw) => raw === true || raw === "true",
  import_zod5.z.boolean().optional()
);

/** Nested option groups forwarded to the metadata, link, and HTML-cleaning steps. */
var contentOptionsSchema = import_zod5.z.object({
  /** Controls how metadata such as title and description is extracted. */
  metadataOptions: MetadataOptionsSchema.optional(),
  /** Controls how links are extracted and categorized. */
  linksOptions: LinkExtractionOptionsSchema.optional(),
  /** Controls how HTML is sanitized and cleaned. */
  cleanedHtmlOptions: HTMLCleaningOptionsSchema.optional()
});

/** Options that shape the site-map tree. */
var treeOptionsSchema = import_zod5.z.object({
  /** Whether to place folders before leaf nodes in the tree. Default: true. */
  folderFirst: linksFlagDefaultTrue(),
  /**
   * How to order links within each folder:
   * - 'page' preserves the original document order
   * - 'alphabetical' sorts A→Z by URL
   * Default: 'page' (per the original note; this schema only marks the field optional).
   */
  linksOrder: import_zod5.z.enum(["page", "alphabetical"]).optional(),
  /** Whether to include extracted links for each node in the tree. Default: true. */
  extractedLinks: linksFlagDefaultTrue(),
  /**
   * Subdomain handling for the root URL. Default: true.
   * Per the original example, when false the root URL
   * https://swr.vercel.app becomes https://vercel.app.
   * NOTE(review): the original wording said "exclude subdomain as root URL",
   * which reads opposite to the field name; confirm the intended meaning.
   */
  subdomainAsRootUrl: linksFlagDefaultTrue()
});

/** Options accepted by the links router: the fields below plus the tree and content option groups. */
var linksOptionsSchema = import_zod5.z.object({
  /** The URL to scrape. Validated here only as a string. */
  url: import_zod5.z.string(),
  /** Whether to build a site map tree. Default: true. */
  tree: linksFlagDefaultTrue(),
  /** Whether to extract metadata from the page. Default: true. */
  metadata: linksFlagDefaultTrue(),
  /** Whether to return cleaned HTML. Default: false. */
  cleanedHtml: linksFlagDefaultFalse(),
  /** Whether to fetch and parse robots.txt. Default: false. */
  robots: linksFlagDefaultFalse(),
  /** Whether to fetch and parse sitemap.xml. Default: false. */
  sitemapXML: linksFlagDefaultFalse(),
  ...treeOptionsSchema.shape,
  ...contentOptionsSchema.shape
});
// src/routers/read/types.ts
var import_zod6 = __toESM(require("zod"));

// Boolean flag that is true unless the raw value is exactly "false" or false.
// The coercion runs before validation, so an omitted value parses to true.
var readFlagDefaultTrue = () => import_zod6.default.preprocess(
  (val) => val !== "false" && val !== false,
  import_zod6.default.boolean().optional()
);
// Boolean flag that is true only when the raw value is exactly "true" or true.
// The coercion runs before validation, so an omitted value parses to false.
var readFlagDefaultFalse = () => import_zod6.default.preprocess(
  (val) => val === "true" || val === true,
  import_zod6.default.boolean().optional()
);

/**
 * Options accepted by the read router.
 *
 * Each boolean flag accepts a real boolean or the strings "true"/"false" and
 * always parses to a boolean, falling back to its documented default.
 */
var readOptionsSchema = import_zod6.default.object({
  /** The URL to scrape. Validated here only as a string. */
  url: import_zod6.default.string(),
  /** Whether to extract metadata from the page. Default: true. */
  metadata: readFlagDefaultTrue(),
  /** Whether to extract markdown from the page. Default: true. */
  markdown: readFlagDefaultTrue(),
  /** Whether to return cleaned HTML. Default: true. */
  cleanedHtml: readFlagDefaultTrue(),
  /** Whether to fetch and parse robots.txt. Default: false. */
  robots: readFlagDefaultFalse(),
  /**
   * Whether to return raw HTML. Default: false.
   * This previously used the default-true coercion, so an omitted value
   * parsed to true despite the documented default.
   */
  rawHtml: readFlagDefaultFalse(),
  /** Controls how metadata such as title and description is extracted. */
  metadataOptions: MetadataOptionsSchema.optional()
  // DEPRECATED: a `cleanedHtmlOptions: HTMLCleaningOptionsSchema.optional()`
  // field (options controlling how HTML is sanitized and cleaned) is left out
  // because HTMLRewrite is not used for cleaning the HTML for now; it may be
  // reused in the future.
});
// src/index.ts
// Grouped view of the service type namespaces, keyed by service name.
var Services = {
Cheerio: types_exports,
HTMLCleaning: types_exports2,
Link: types_exports3,
Metadata: types_exports4
};
// Grouped view of the router type namespaces.
// NOTE(review): there is no entry for the read router; readOptionsSchema is
// only exported at the top level of the package. Confirm this is intended.
var Routers = {
Links: types_exports6,
Browse: types_exports5
};
// Annotate the CommonJS export names for ESM import in node:
// The `0 &&` guard short-circuits, so this assignment never runs and the real
// exports stay as assigned earlier in the file. The literal only spells out
// the export names in source form; keep it a plain list of identifiers.
0 && (module.exports = {
BrowseOptionsSchema,
BrowseRouterTypes,
CheerioTypes,
DataFormatsEnum,
ElementPatternSchema,
ExtractedLinksSchema,
HTMLCleaningMetricsSchema,
HTMLCleaningOptionsSchema,
HTMLCleaningResultSchema,
HTMLCleaningTypes,
LinkExtractionOptionsSchema,
LinkServiceTypes,
LinksRouterTypes,
MetadataOptionsSchema,
MetadataTypes,
PageMetadataSchema,
Routers,
Services,
contentOptionsSchema,
linksOptionsSchema,
readOptionsSchema,
treeOptionsSchema
});
//# sourceMappingURL=index.js.map