UNPKG

n8n-nodes-crawl4ai-dev

Version:

n8n nodes for Crawl4AI web crawler and data extraction

github.com/golfamigo/n8n-nodes-crawl4j

golfamigo/n8n-nodes-crawl4j

414 lines • 15.6 kB

JavaScript

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.description = void 0; exports.execute = execute; const n8n_workflow_1 = require("n8n-workflow"); const utils_1 = require("../helpers/utils"); const formatters_1 = require("../../Crawl4aiBasicCrawler/helpers/formatters"); exports.description = [ { displayName: 'URL', name: 'url', type: 'string', required: true, default: '', placeholder: 'https://example.com', description: 'The URL to extract content from', displayOptions: { show: { operation: ['llmExtractor'], }, }, }, { displayName: 'Extraction Instructions', name: 'instruction', type: 'string', typeOptions: { rows: 4, }, required: true, default: '', placeholder: 'Extract the product name, price, and description from this page.', description: 'Instructions for the LLM on what to extract from the page', displayOptions: { show: { operation: ['llmExtractor'], }, }, }, { displayName: 'Schema Fields', name: 'schemaFields', placeholder: 'Add Schema Field', type: 'fixedCollection', typeOptions: { multipleValues: true, }, default: {}, required: true, displayOptions: { show: { operation: ['llmExtractor'], }, }, options: [ { name: 'fieldsValues', displayName: 'Fields', values: [ { displayName: 'Field Name', name: 'name', type: 'string', required: true, default: '', placeholder: 'title', description: 'Name of the field to extract', }, { displayName: 'Field Type', name: 'fieldType', type: 'options', options: [ { name: 'String', value: 'string', description: 'Plain text string', }, { name: 'Number', value: 'number', description: 'Numeric value', }, { name: 'Boolean', value: 'boolean', description: 'True/false value', }, { name: 'Array', value: 'array', description: 'Array of values', }, ], default: 'string', description: 'Type of the field', }, { displayName: 'Description', name: 'description', type: 'string', default: '', placeholder: 'The main title of the product', description: 'Description of the field to help the LLM understand what to extract', }, { displayName: 'Required', name: 'required', type: 'boolean', default: true, description: 'Whether this field is required', }, ], }, ], }, { displayName: 'Browser Options', name: 'browserOptions', type: 'collection', placeholder: 'Add Option', default: {}, displayOptions: { show: { operation: ['llmExtractor'], }, }, options: [ { displayName: 'Enable JavaScript', name: 'javaScriptEnabled', type: 'boolean', default: true, description: 'Whether to enable JavaScript execution', }, { displayName: 'Headless Mode', name: 'headless', type: 'boolean', default: true, description: 'Whether to run browser in headless mode', }, { displayName: 'JavaScript Code', name: 'jsCode', type: 'string', typeOptions: { rows: 4, }, default: '', placeholder: 'document.querySelector("button.load-more").click();', description: 'JavaScript code to execute before extraction (e.g., to click buttons, scroll)', }, { displayName: 'Timeout (MS)', name: 'timeout', type: 'number', default: 60000, description: 'Maximum time to wait for the browser to load the page', }, { displayName: 'Viewport Height', name: 'viewportHeight', type: 'number', default: 800, description: 'The height of the browser viewport', }, { displayName: 'Viewport Width', name: 'viewportWidth', type: 'number', default: 1280, description: 'The width of the browser viewport', }, ], }, { displayName: 'LLM Options', name: 'llmOptions', type: 'collection', placeholder: 'Add Option', default: {}, displayOptions: { show: { operation: ['llmExtractor'], }, }, options: [ { displayName: 'LLM Provider', name: 'llmProvider', type: 'options', options: [ { name: 'Anthropic Claude 3 Sonnet', value: 'anthropic/claude-3-sonnet', }, { name: 'Groq Llama 3 70B', value: 'groq/llama3-70b-8192', }, { name: 'Ollama Llama 3', value: 'ollama/llama3', description: 'Ollama Llama 3 (Local)', }, { name: 'OpenAI GPT-3.5 Turbo', value: 'openai/gpt-3.5-turbo', }, { name: 'OpenAI GPT-4o', value: 'openai/gpt-4o', }, ], default: 'openai/gpt-4o', description: 'LLM provider to use for extraction', displayOptions: { show: { overrideProvider: [true], }, }, }, { displayName: 'Max Tokens', name: 'maxTokens', type: 'number', default: 2000, description: 'Maximum number of tokens for the LLM response', }, { displayName: 'Override LLM Provider', name: 'overrideProvider', type: 'boolean', default: false, description: 'Whether to override the LLM provider from credentials', }, { displayName: 'Provider API Key', name: 'apiKey', type: 'string', typeOptions: { password: true, }, default: '', description: 'API key for the LLM provider (leave empty to use API key from credentials)', displayOptions: { show: { overrideProvider: [true], }, }, }, { displayName: 'Temperature', name: 'temperature', type: 'number', typeOptions: { minValue: 0, maxValue: 1, numberPrecision: 1, }, default: 0, description: 'Controls randomness: 0 for deterministic results, higher for more creativity', }, ], }, { displayName: 'Options', name: 'options', type: 'collection', placeholder: 'Add Option', default: {}, displayOptions: { show: { operation: ['llmExtractor'], }, }, options: [ { displayName: 'Cache Mode', name: 'cacheMode', type: 'options', options: [ { name: 'Enabled (Read/Write)', value: 'enabled', description: 'Use cache if available, save new results to cache', }, { name: 'Bypass (Force Fresh)', value: 'bypass', description: 'Ignore cache, always fetch fresh content', }, { name: 'Only (Read Only)', value: 'only', description: 'Only use cache, do not make new requests', }, ], default: 'enabled', description: 'How to use the cache when crawling', }, { displayName: 'Include Original Text', name: 'includeFullText', type: 'boolean', default: false, description: 'Whether to include the original webpage text in output', }, { displayName: 'CSS Selector', name: 'cssSelector', type: 'string', default: '', placeholder: 'article.content', description: 'CSS selector to focus extraction on a specific part of the page (leave empty for full page)', }, ], }, ]; async function execute(items, nodeOptions) { var _a; const allResults = []; const credentials = (await this.getCredentials('crawl4aiApi')); if (!credentials.enableLlm) { throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'LLM features are not enabled in Crawl4AI credentials. Please enable them and configure an LLM provider.', { itemIndex: 0 }); } for (let i = 0; i < items.length; i++) { try { const url = this.getNodeParameter('url', i, ''); const instruction = this.getNodeParameter('instruction', i, ''); const schemaFieldsValues = this.getNodeParameter('schemaFields.fieldsValues', i, []); const browserOptions = this.getNodeParameter('browserOptions', i, {}); const llmOptions = this.getNodeParameter('llmOptions', i, {}); const options = this.getNodeParameter('options', i, {}); if (!url) { throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'URL cannot be empty.', { itemIndex: i }); } if (!(0, utils_1.isValidUrl)(url)) { throw new n8n_workflow_1.NodeOperationError(this.getNode(), `Invalid URL: ${url}`, { itemIndex: i }); } if (!instruction) { throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'Extraction instructions cannot be empty.', { itemIndex: i }); } if (!schemaFieldsValues || schemaFieldsValues.length === 0) { throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'At least one schema field must be defined.', { itemIndex: i }); } const schemaProperties = {}; const requiredFields = []; schemaFieldsValues.forEach(field => { const fieldName = field.name; schemaProperties[fieldName] = { name: fieldName, type: field.fieldType, description: field.description || undefined, }; if (field.required === true) { requiredFields.push(fieldName); } }); const schema = { title: 'ExtractedData', type: 'object', properties: schemaProperties, required: requiredFields.length > 0 ? requiredFields : undefined, }; let provider = credentials.llmProvider || 'openai/gpt-4o'; let apiKey = credentials.apiKey; if (llmOptions.overrideProvider === true) { provider = llmOptions.llmProvider || provider; apiKey = llmOptions.apiKey || apiKey; } const browserConfig = (0, utils_1.createBrowserConfig)(browserOptions); const extractionStrategy = (0, utils_1.createLlmExtractionStrategy)(schema, instruction, provider, apiKey); const crawler = await (0, utils_1.getCrawl4aiClient)(this); const extraArgs = {}; if (llmOptions.temperature !== undefined) { extraArgs.temperature = llmOptions.temperature; } if (llmOptions.maxTokens !== undefined) { extraArgs.max_tokens = llmOptions.maxTokens; } const result = await crawler.arun(url, { browserConfig, extractionStrategy, cacheMode: options.cacheMode || 'enabled', jsCode: browserOptions.jsCode, cssSelector: options.cssSelector, extraArgs, }); const extractedData = (0, formatters_1.parseExtractedJson)(result); const formattedResult = (0, formatters_1.formatExtractionResult)(result, extractedData, options.includeFullText); allResults.push({ json: formattedResult, pairedItem: { item: i }, }); } catch (error) { if (this.continueOnFail()) { const node = this.getNode(); const errorItemIndex = (_a = error.itemIndex) !== null && _a !== void 0 ? _a : i; allResults.push({ json: items[i].json, error: new n8n_workflow_1.NodeOperationError(node, error.message, { itemIndex: errorItemIndex }), pairedItem: { item: i }, }); continue; } throw error; } } return allResults; } //# sourceMappingURL=llmExtractor.operation.js.map