/*
 * n8n-nodes-crawl4ai-dev
 * n8n nodes for Crawl4AI web crawler and data extraction.
 * Package-viewer header (version not captured): 295 lines, 11 kB, JavaScript.
 */
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.description = void 0;
exports.execute = execute;
const n8n_workflow_1 = require("n8n-workflow");
const utils_1 = require("../helpers/utils");
const formatters_1 = require("../../Crawl4aiBasicCrawler/helpers/formatters");
exports.description = [
{
displayName: 'URL',
name: 'url',
type: 'string',
required: true,
default: '',
placeholder: 'https://example.com',
description: 'The URL to extract content from',
displayOptions: {
show: {
operation: ['cssExtractor'],
},
},
},
{
displayName: 'Base Selector',
name: 'baseSelector',
type: 'string',
required: true,
default: '',
placeholder: 'div.product-item',
description: 'CSS selector for the repeating element (e.g., product items, article cards)',
displayOptions: {
show: {
operation: ['cssExtractor'],
},
},
},
{
displayName: 'Fields',
name: 'fields',
placeholder: 'Add Field',
type: 'fixedCollection',
typeOptions: {
multipleValues: true,
},
default: {},
displayOptions: {
show: {
operation: ['cssExtractor'],
},
},
options: [
{
name: 'fieldsValues',
displayName: 'Fields',
values: [
{
displayName: 'Field Name',
name: 'name',
type: 'string',
required: true,
default: '',
placeholder: 'title',
description: 'Name of the field to extract',
},
{
displayName: 'CSS Selector',
name: 'selector',
type: 'string',
required: true,
default: '',
placeholder: 'h3.title',
description: 'CSS selector relative to the base selector',
},
{
displayName: 'Field Type',
name: 'fieldType',
type: 'options',
options: [
{
name: 'Text',
value: 'text',
description: 'Extract text content',
},
{
name: 'HTML',
value: 'html',
description: 'Extract HTML content',
},
{
name: 'Attribute',
value: 'attribute',
description: 'Extract an attribute value',
},
],
default: 'text',
description: 'Type of data to extract',
},
{
displayName: 'Attribute Name',
name: 'attribute',
type: 'string',
displayOptions: {
show: {
fieldType: ['attribute'],
},
},
default: 'href',
placeholder: 'href',
description: 'Name of the attribute to extract',
},
],
},
],
},
{
displayName: 'Browser Options',
name: 'browserOptions',
type: 'collection',
placeholder: 'Add Option',
default: {},
displayOptions: {
show: {
operation: ['cssExtractor'],
},
},
options: [
{
displayName: 'Enable JavaScript',
name: 'javaScriptEnabled',
type: 'boolean',
default: true,
description: 'Whether to enable JavaScript execution',
},
{
displayName: 'Headless Mode',
name: 'headless',
type: 'boolean',
default: true,
description: 'Whether to run browser in headless mode',
},
{
displayName: 'JavaScript Code',
name: 'jsCode',
type: 'string',
typeOptions: {
rows: 4,
},
default: '',
placeholder: 'document.querySelector("button.load-more").click();',
description: 'JavaScript code to execute before extraction (e.g., to click buttons, scroll)',
},
{
displayName: 'Timeout (Ms)',
name: 'timeout',
type: 'number',
default: 30000,
description: 'Maximum time to wait for the browser to load the page',
},
{
displayName: 'Viewport Height',
name: 'viewportHeight',
type: 'number',
default: 800,
description: 'The height of the browser viewport',
},
{
displayName: 'Viewport Width',
name: 'viewportWidth',
type: 'number',
default: 1280,
description: 'The width of the browser viewport',
},
],
},
{
displayName: 'Options',
name: 'options',
type: 'collection',
placeholder: 'Add Option',
default: {},
displayOptions: {
show: {
operation: ['cssExtractor'],
},
},
options: [
{
displayName: 'Cache Mode',
name: 'cacheMode',
type: 'options',
options: [
{
name: 'Enabled (Read/Write)',
value: 'enabled',
description: 'Use cache if available, save new results to cache',
},
{
name: 'Bypass (Force Fresh)',
value: 'bypass',
description: 'Ignore cache, always fetch fresh content',
},
{
name: 'Only (Read Only)',
value: 'only',
description: 'Only use cache, do not make new requests',
},
],
default: 'enabled',
description: 'How to use the cache when crawling',
},
{
displayName: 'Include Original Text',
name: 'includeFullText',
type: 'boolean',
default: false,
description: 'Whether to include the original webpage text in output',
},
{
displayName: 'Clean Text',
name: 'cleanText',
type: 'boolean',
default: true,
description: 'Whether to clean and normalize extracted text (remove extra spaces, newlines)',
},
],
},
];
/**
 * Runs the `cssExtractor` operation once per input item.
 *
 * For each item it reads the node parameters, validates them, builds a CSS
 * extraction schema named `extracted_items`, crawls the URL through the
 * Crawl4AI client and pushes the formatted result with a `pairedItem` link
 * back to the input index.
 *
 * Must be called with the n8n execute-functions context as `this`; the code
 * uses `getNodeParameter`, `getNode` and `continueOnFail` from it.
 *
 * @param {Array<{json: object}>} items - Input items; one crawl is performed per item.
 * @param {object} nodeOptions - Not used by this operation; kept for the shared signature.
 * @returns {Promise<Array<object>>} One output entry per input item. When
 *   `continueOnFail()` is true, failed items yield `{ json, error, pairedItem }`
 *   carrying the original item JSON instead of aborting the run.
 * @throws {NodeOperationError} On an empty/invalid URL, an empty base selector
 *   or an empty field list (unless `continueOnFail()` is true). Errors thrown
 *   by the helpers or the crawler are rethrown unchanged.
 */
async function execute(items, nodeOptions) {
    const { NodeOperationError } = n8n_workflow_1;
    const {
        isValidUrl,
        createBrowserConfig,
        createCssSelectorExtractionStrategy,
        getCrawl4aiClient,
        cleanExtractedData,
    } = utils_1;
    const { parseExtractedJson, formatExtractionResult } = formatters_1;
    const allResults = [];
    for (let i = 0; i < items.length; i++) {
        try {
            const url = this.getNodeParameter('url', i, '');
            const baseSelector = this.getNodeParameter('baseSelector', i, '');
            const fieldsValues = this.getNodeParameter('fields.fieldsValues', i, []);
            const browserOptions = this.getNodeParameter('browserOptions', i, {});
            const options = this.getNodeParameter('options', i, {});
            // Builds a validation error tied to the item currently being processed.
            const invalid = (message) => new NodeOperationError(this.getNode(), message, { itemIndex: i });
            if (!url) {
                throw invalid('URL cannot be empty.');
            }
            if (!isValidUrl(url)) {
                throw invalid(`Invalid URL: ${url}`);
            }
            if (!baseSelector) {
                throw invalid('Base selector cannot be empty.');
            }
            if (!fieldsValues || fieldsValues.length === 0) {
                throw invalid('At least one field must be defined.');
            }
            const schema = {
                name: 'extracted_items',
                baseSelector,
                fields: fieldsValues.map((field) => ({
                    name: field.name,
                    selector: field.selector,
                    type: field.fieldType,
                    attribute: field.attribute,
                })),
            };
            const browserConfig = createBrowserConfig(browserOptions);
            const extractionStrategy = createCssSelectorExtractionStrategy(schema);
            const crawler = await getCrawl4aiClient(this);
            const result = await crawler.arun(url, {
                browserConfig,
                extractionStrategy,
                cacheMode: options.cacheMode || 'enabled',
                jsCode: browserOptions.jsCode,
            });
            const extractedData = parseExtractedJson(result);
            const formattedResult = formatExtractionResult(result, extractedData, options.includeFullText);
            // "Clean Text" is declared with `default: true`, but `options` only
            // carries the keys that were actually set on the node, so a missing
            // key must mean "use the declared default". Only an explicit
            // `false` opts out of cleaning.
            const shouldCleanText = options.cleanText !== false;
            if (shouldCleanText && extractedData) {
                formattedResult.data = cleanExtractedData(extractedData);
            }
            allResults.push({
                json: formattedResult,
                pairedItem: { item: i },
            });
        }
        catch (error) {
            if (this.continueOnFail()) {
                // Prefer the index recorded on the error, falling back to the loop index.
                const errorItemIndex = error.itemIndex != null ? error.itemIndex : i;
                allResults.push({
                    json: items[i].json,
                    error: new NodeOperationError(this.getNode(), error.message, { itemIndex: errorItemIndex }),
                    pairedItem: { item: i },
                });
                continue;
            }
            throw error;
        }
    }
    return allResults;
}
//# sourceMappingURL=cssExtractor.operation.js.map