xscrape
Version:
A flexible and powerful library designed to extract and transform data from HTML documents using user-defined schemas
117 lines (111 loc) • 3.65 kB
JavaScript
// Short local aliases for the Object built-ins that the module-interop
// helpers in this bundle rely on.
const {
  create: __create,
  defineProperty: __defProp,
  getOwnPropertyDescriptor: __getOwnPropDesc,
  getOwnPropertyNames: __getOwnPropNames,
  getPrototypeOf: __getProtoOf,
} = Object;
const { hasOwnProperty: __hasOwnProp } = Object.prototype;
/**
 * Defines one enumerable getter on `target` for every enumerable key of `all`.
 *
 * @param {object} target - Object that receives the getter properties.
 * @param {object} all - Map of export name to a zero-argument getter function.
 */
function __export(target, all) {
  for (const exportName in all) {
    const getter = all[exportName];
    __defProp(target, exportName, { get: getter, enumerable: true });
  }
}
/**
 * Mirrors the own properties of `from` onto `to` as getters that read through
 * to `from` at access time.
 *
 * Keys that `to` already owns, and the single key named by `except`, are
 * skipped. A mirrored property is enumerable unless the source descriptor
 * exists and says otherwise.
 *
 * @param {object} to - Object receiving the getters; also the return value.
 * @param {*} from - Source; ignored unless it is a non-null object or a function.
 * @param {string} [except] - One key to leave out.
 * @param {*} [desc] - Scratch slot; any value passed in is overwritten.
 * @returns {object} `to`.
 */
function __copyProps(to, from, except, desc) {
  const isCopyable =
    typeof from === "function" || (typeof from === "object" && from !== null);
  if (!isCopyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable,
    });
  }
  return to;
}
/**
 * Wraps a required CommonJS module so it can be consumed like an ES module
 * namespace object.
 *
 * @param {*} mod - The value returned by `require`.
 * @param {*} [isNodeMode] - Truthy to always expose `mod` itself as `default`.
 * @param {*} [target] - Scratch slot; any value passed in is overwritten.
 * @returns {object} A new object whose prototype matches `mod`'s (or a plain
 *   object when `mod` is null/undefined), with `mod`'s own properties mirrored.
 */
function __toESM(mod, isNodeMode, target) {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  // When the importer is in node compatibility mode, or the module was not
  // produced from ESM by a Babel-compatible transform ("__esModule" is not
  // set), "default" is pointed at the CommonJS "module.exports" value itself.
  const needsDefault = isNodeMode || !mod || !mod.__esModule;
  if (needsDefault) {
    __defProp(target, "default", { value: mod, enumerable: true });
  }
  return __copyProps(target, mod);
}
/**
 * Builds the CommonJS exports object for an ESM-style exports record: a fresh
 * object flagged with a non-enumerable `__esModule: true`, onto which the
 * properties of `mod` are mirrored.
 *
 * @param {object} mod - The exports record to expose.
 * @returns {object} The new exports object.
 */
function __toCommonJS(mod) {
  const exportsObject = {};
  __defProp(exportsObject, "__esModule", { value: true });
  return __copyProps(exportsObject, mod);
}
// src/index.ts
// Export record for the package entry point. Each export is registered as a
// getter, so `defineScraper` is looked up when the property is read rather
// than when this statement runs (the function is defined further down).
var src_exports = {};
__export(src_exports, {
defineScraper: () => defineScraper
});
// Publish the record as this module's CommonJS exports; __toCommonJS wraps it
// in a new object that also carries the `__esModule` flag.
module.exports = __toCommonJS(src_exports);
// src/defineScraper.ts
var cheerio = __toESM(require("cheerio"), 1);
// src/validators.ts
var import_zod = require("zod");
/**
 * Pairs a schema with the function that knows how to check data against it,
 * and reports the outcome as a result object instead of throwing.
 */
class Validator {
  /**
   * @param {*} schema - Schema handed to `validateFunction` on every call.
   * @param {Function} validateFunction - Called as `(schema, data)`; returns
   *   the validated value or throws on failure.
   */
  constructor(schema, validateFunction) {
    this.schema = schema;
    this.validateFunction = validateFunction;
  }

  /**
   * Runs the validation function against `data`.
   *
   * @param {*} data - Value to validate.
   * @returns {{success: true, data: *} | {success: false, error: *}} The
   *   validated value, or whatever the validation function threw.
   */
  validate(data) {
    let validated;
    try {
      validated = this.validateFunction(this.schema, data);
    } catch (error) {
      return { success: false, error };
    }
    return { success: true, data: validated };
  }
}
/**
 * Returns the schema-building namespace for a validator type.
 *
 * @param {string} type - Validator type; only "zod" is recognised.
 * @returns {*} The `z` export of the zod module.
 * @throws {Error} If `type` is not a supported validator type.
 */
function getSchemaBuilder(type) {
  if (type === "zod") {
    return import_zod.z;
  }
  throw new Error(`Unsupported validator type: ${type}`);
}
/**
 * Builds a Validator for the given validator type.
 *
 * @param {string} type - Validator type; only "zod" is recognised.
 * @param {Function} schemaFn - Receives the builder returned by
 *   `getSchemaBuilder(type)` and returns the schema to validate against.
 * @returns {Validator} Validator whose check calls `schema.parse(data)`.
 * @throws {Error} If `type` is not a supported validator type.
 */
function createValidator(type, schemaFn) {
  const schema = schemaFn(getSchemaBuilder(type));
  if (type === "zod") {
    const parseWithSchema = (zodSchema, input) => zodSchema.parse(input);
    return new Validator(schema, parseWithSchema);
  }
  throw new Error(`Unsupported validator type: ${type}`);
}
// src/defineScraper.ts
/**
 * Creates a scraper function from a configuration object.
 *
 * The validator is built once, up front, from `config.validator` and
 * `config.schema`; an unsupported validator type therefore throws here rather
 * than when the scraper is run.
 *
 * @param {object} config
 * @param {string} config.validator - Validator type passed to `createValidator`.
 * @param {Function} config.schema - Schema factory passed to `createValidator`.
 * @param {*} config.extract - Extraction map passed to cheerio's `$.extract`.
 * @param {Function} [config.transform] - Optional sync or async post-processing
 *   step applied to the validated data.
 * @returns {(html: string) => Promise<{data: *} | {error: *}>} Scraper that
 *   resolves to `{ data }` on success or `{ error }` on failure; errors thrown
 *   while loading, extracting, validating or transforming are captured in the
 *   result instead of rejecting.
 */
function defineScraper(config) {
  const validator = createValidator(config.validator, config.schema);
  return async (html) => {
    try {
      const $ = cheerio.load(html);
      const extractedData = $.extract(config.extract);
      const validationResult = validator.validate(extractedData);
      if (!validationResult.success) {
        return { error: validationResult.error };
      }
      const { data } = validationResult;
      // Only a missing value means "no data". A truthiness test here would
      // wrongly reject valid falsy results such as 0, "" or false.
      if (data == null) {
        return {
          error: new Error("Validation succeeded but no data was returned")
        };
      }
      if (!config.transform) {
        return { data };
      }
      // `await` accepts both plain values and promises, so sync and async
      // transforms are handled alike; a throw/rejection lands in the catch.
      return { data: await config.transform(data) };
    } catch (error) {
      return { error };
    }
  };
}
// src/types/main.ts
var import_zod2 = require("zod");
// Annotate the CommonJS export names for ESM import in node:
// `0 &&` short-circuits, so the assignment below never executes at runtime;
// the statement is present only so the export name appears in the source text.
0 && (module.exports = {
defineScraper
});
;