UNPKG

n8n-nodes-crawl4ai-dev

Version:

n8n nodes for Crawl4AI web crawler and data extraction

295 lines 11 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.description = void 0; exports.execute = execute; const n8n_workflow_1 = require("n8n-workflow"); const utils_1 = require("../helpers/utils"); const formatters_1 = require("../../Crawl4aiBasicCrawler/helpers/formatters"); exports.description = [ { displayName: 'URL', name: 'url', type: 'string', required: true, default: '', placeholder: 'https://example.com', description: 'The URL to extract content from', displayOptions: { show: { operation: ['cssExtractor'], }, }, }, { displayName: 'Base Selector', name: 'baseSelector', type: 'string', required: true, default: '', placeholder: 'div.product-item', description: 'CSS selector for the repeating element (e.g., product items, article cards)', displayOptions: { show: { operation: ['cssExtractor'], }, }, }, { displayName: 'Fields', name: 'fields', placeholder: 'Add Field', type: 'fixedCollection', typeOptions: { multipleValues: true, }, default: {}, displayOptions: { show: { operation: ['cssExtractor'], }, }, options: [ { name: 'fieldsValues', displayName: 'Fields', values: [ { displayName: 'Field Name', name: 'name', type: 'string', required: true, default: '', placeholder: 'title', description: 'Name of the field to extract', }, { displayName: 'CSS Selector', name: 'selector', type: 'string', required: true, default: '', placeholder: 'h3.title', description: 'CSS selector relative to the base selector', }, { displayName: 'Field Type', name: 'fieldType', type: 'options', options: [ { name: 'Text', value: 'text', description: 'Extract text content', }, { name: 'HTML', value: 'html', description: 'Extract HTML content', }, { name: 'Attribute', value: 'attribute', description: 'Extract an attribute value', }, ], default: 'text', description: 'Type of data to extract', }, { displayName: 'Attribute Name', name: 'attribute', type: 'string', displayOptions: { show: { fieldType: ['attribute'], }, }, default: 'href', placeholder: 'href', description: 'Name of the attribute to extract', }, ], }, ], }, { displayName: 'Browser Options', name: 'browserOptions', type: 'collection', placeholder: 'Add Option', default: {}, displayOptions: { show: { operation: ['cssExtractor'], }, }, options: [ { displayName: 'Enable JavaScript', name: 'javaScriptEnabled', type: 'boolean', default: true, description: 'Whether to enable JavaScript execution', }, { displayName: 'Headless Mode', name: 'headless', type: 'boolean', default: true, description: 'Whether to run browser in headless mode', }, { displayName: 'JavaScript Code', name: 'jsCode', type: 'string', typeOptions: { rows: 4, }, default: '', placeholder: 'document.querySelector("button.load-more").click();', description: 'JavaScript code to execute before extraction (e.g., to click buttons, scroll)', }, { displayName: 'Timeout (Ms)', name: 'timeout', type: 'number', default: 30000, description: 'Maximum time to wait for the browser to load the page', }, { displayName: 'Viewport Height', name: 'viewportHeight', type: 'number', default: 800, description: 'The height of the browser viewport', }, { displayName: 'Viewport Width', name: 'viewportWidth', type: 'number', default: 1280, description: 'The width of the browser viewport', }, ], }, { displayName: 'Options', name: 'options', type: 'collection', placeholder: 'Add Option', default: {}, displayOptions: { show: { operation: ['cssExtractor'], }, }, options: [ { displayName: 'Cache Mode', name: 'cacheMode', type: 'options', options: [ { name: 'Enabled (Read/Write)', value: 'enabled', description: 'Use cache if available, save new results to cache', }, { name: 'Bypass (Force Fresh)', value: 'bypass', description: 'Ignore cache, always fetch fresh content', }, { name: 'Only (Read Only)', value: 'only', description: 'Only use cache, do not make new requests', }, ], default: 'enabled', description: 'How to use the cache when crawling', }, { displayName: 'Include Original Text', name: 'includeFullText', type: 'boolean', default: false, description: 'Whether to include the original webpage text in output', }, { displayName: 'Clean Text', name: 'cleanText', type: 'boolean', default: true, description: 'Whether to clean and normalize extracted text (remove extra spaces, newlines)', }, ], }, ]; async function execute(items, nodeOptions) { var _a; const allResults = []; for (let i = 0; i < items.length; i++) { try { const url = this.getNodeParameter('url', i, ''); const baseSelector = this.getNodeParameter('baseSelector', i, ''); const fieldsValues = this.getNodeParameter('fields.fieldsValues', i, []); const browserOptions = this.getNodeParameter('browserOptions', i, {}); const options = this.getNodeParameter('options', i, {}); if (!url) { throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'URL cannot be empty.', { itemIndex: i }); } if (!(0, utils_1.isValidUrl)(url)) { throw new n8n_workflow_1.NodeOperationError(this.getNode(), `Invalid URL: ${url}`, { itemIndex: i }); } if (!baseSelector) { throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'Base selector cannot be empty.', { itemIndex: i }); } if (!fieldsValues || fieldsValues.length === 0) { throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'At least one field must be defined.', { itemIndex: i }); } const schema = { name: 'extracted_items', baseSelector, fields: fieldsValues.map(field => ({ name: field.name, selector: field.selector, type: field.fieldType, attribute: field.attribute, })), }; const browserConfig = (0, utils_1.createBrowserConfig)(browserOptions); const extractionStrategy = (0, utils_1.createCssSelectorExtractionStrategy)(schema); const crawler = await (0, utils_1.getCrawl4aiClient)(this); const result = await crawler.arun(url, { browserConfig, extractionStrategy, cacheMode: options.cacheMode || 'enabled', jsCode: browserOptions.jsCode, }); const extractedData = (0, formatters_1.parseExtractedJson)(result); const formattedResult = (0, formatters_1.formatExtractionResult)(result, extractedData, options.includeFullText); if (options.cleanText === true && extractedData) { formattedResult.data = (0, utils_1.cleanExtractedData)(extractedData); } allResults.push({ json: formattedResult, pairedItem: { item: i }, }); } catch (error) { if (this.continueOnFail()) { const node = this.getNode(); const errorItemIndex = (_a = error.itemIndex) !== null && _a !== void 0 ? _a : i; allResults.push({ json: items[i].json, error: new n8n_workflow_1.NodeOperationError(node, error.message, { itemIndex: errorItemIndex }), pairedItem: { item: i }, }); continue; } throw error; } } return allResults; } //# sourceMappingURL=cssExtractor.operation.js.map