UNPKG

n8n

Version:

n8n Workflow Automation Tool

249 lines 9.13 kB
"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.fetchAndExtract = fetchAndExtract; const turndown_plugin_gfm_1 = require("@joplin/turndown-plugin-gfm"); const readability_1 = require("@mozilla/readability"); const linkedom_1 = require("linkedom"); const turndown_1 = __importDefault(require("turndown")); const undici_1 = require("undici"); const DEFAULT_TIMEOUT_MS = 30_000; const MAX_TIMEOUT_MS = 120_000; const MAX_RESPONSE_BYTES = 5 * 1024 * 1024; const DEFAULT_MAX_CONTENT_LENGTH = 30_000; const MAX_REDIRECTS = 10; async function fetchAndExtract(url, options) { const maxContentLength = options.maxContentLength ?? DEFAULT_MAX_CONTENT_LENGTH; const maxResponseBytes = options.maxResponseBytes ?? MAX_RESPONSE_BYTES; const timeoutMs = Math.min(options.timeoutMs ?? DEFAULT_TIMEOUT_MS, MAX_TIMEOUT_MS); const { authorizeUrl, ssrf } = options; let currentUrl = url; let response; let redirectCount = 0; while (redirectCount <= MAX_REDIRECTS) { const validation = await ssrf.validateUrl(currentUrl); if (!validation.ok) throw validation.error; const dispatcher = new undici_1.Agent({ connect: { lookup: ssrf.createSecureLookup() } }); const controller = new AbortController(); const timeout = setTimeout(() => controller.abort(), timeoutMs); try { response = await fetch(currentUrl, { signal: controller.signal, headers: { 'User-Agent': 'n8n-instance-ai/1.0 (content extraction)', Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,text/plain;q=0.8,application/pdf;q=0.7,*/*;q=0.5', }, redirect: 'manual', dispatcher, }); } finally { clearTimeout(timeout); void dispatcher.close().catch(() => { }); } if (response.status >= 300 && response.status < 400) { const location = response.headers.get('location'); if (!location) break; redirectCount++; if (redirectCount > MAX_REDIRECTS) { throw new Error(`Too many redirects (max ${MAX_REDIRECTS})`); } currentUrl = new URL(location, currentUrl).href; ssrf.validateRedirectSync(currentUrl); if (authorizeUrl) { await authorizeUrl(currentUrl); } continue; } break; } const finalUrl = currentUrl; if (!response.ok) { return { url, finalUrl, title: '', content: `HTTP ${response.status}: ${response.statusText}`, truncated: false, contentLength: 0, }; } const rawBody = await readLimitedBody(response, maxResponseBytes); const contentType = response.headers.get('content-type') ?? ''; if (contentType.includes('application/pdf')) { return await extractPdf(url, finalUrl, rawBody, maxContentLength); } if (contentType.includes('text/plain') || contentType.includes('text/markdown')) { return extractPlainText(url, finalUrl, rawBody, maxContentLength); } return extractHtml(url, finalUrl, rawBody, maxContentLength); } async function readLimitedBody(response, maxBytes) { const chunks = []; let totalBytes = 0; if (!response.body) { return Buffer.alloc(0); } const reader = response.body.getReader(); try { for (;;) { const { done, value } = await reader.read(); if (done) break; const chunk = Buffer.from(value); totalBytes += chunk.length; if (totalBytes > maxBytes) { chunks.push(chunk.subarray(0, maxBytes - (totalBytes - chunk.length))); break; } chunks.push(chunk); } } finally { reader.releaseLock(); } return Buffer.concat(chunks); } function extractHtml(url, finalUrl, body, maxContentLength) { const html = body.toString('utf-8'); const { document } = (0, linkedom_1.parseHTML)(html); const safetyFlags = detectSafetyFlags(html); const reader = new readability_1.Readability(document); const article = reader.parse(); if (!article) { const fallbackText = document.body?.textContent ?? ''; const truncated = fallbackText.length > maxContentLength; const content = truncated ? fallbackText.slice(0, maxContentLength) : fallbackText; return { url, finalUrl, title: document.title ?? '', content, truncated, contentLength: fallbackText.length, ...(hasSafetyFlags(safetyFlags) ? { safetyFlags } : {}), }; } const turndown = createTurndownService(); let markdown = turndown.turndown(article.content ?? ''); const truncated = markdown.length > maxContentLength; const contentLength = markdown.length; if (truncated) { markdown = markdown.slice(0, maxContentLength); } return { url, finalUrl, title: article.title ?? '', content: markdown, truncated, contentLength, ...(hasSafetyFlags(safetyFlags) ? { safetyFlags } : {}), }; } async function extractPdf(url, finalUrl, body, maxContentLength) { const { PDFParse } = await Promise.resolve().then(() => __importStar(require('pdf-parse'))); const parser = new PDFParse({ data: body }); let textResult; let title = ''; try { textResult = await parser.getText(); try { const infoResult = await parser.getInfo(); const titleField = infoResult.info?.Title; if (typeof titleField === 'string') title = titleField; } catch { } } finally { await parser.destroy(); } const truncated = textResult.text.length > maxContentLength; const content = truncated ? textResult.text.slice(0, maxContentLength) : textResult.text; return { url, finalUrl, title, content, truncated, contentLength: textResult.text.length, }; } function extractPlainText(url, finalUrl, body, maxContentLength) { const text = body.toString('utf-8'); const truncated = text.length > maxContentLength; const content = truncated ? text.slice(0, maxContentLength) : text; return { url, finalUrl, title: '', content, truncated, contentLength: text.length, }; } function createTurndownService() { const turndown = new turndown_1.default({ headingStyle: 'atx', codeBlockStyle: 'fenced', }); turndown.use(turndown_plugin_gfm_1.gfm); return turndown; } function detectSafetyFlags(html) { const flags = {}; const hasAppRoot = /<div\s+id=["'](?:app|root|__next|__nuxt)["']\s*>/i.test(html); const hasNoscript = /<noscript/i.test(html); if (hasAppRoot && hasNoscript) { flags.jsRenderingSuspected = true; } const hasLoginForm = /action=["'][^"']*login/i.test(html); const hasLoginRedirect = /meta[^>]+url=.*(?:login|signin|auth)/i.test(html); if (hasLoginForm || hasLoginRedirect) { flags.loginRequired = true; } return flags; } function hasSafetyFlags(flags) { return (flags !== undefined && (flags.jsRenderingSuspected === true || flags.loginRequired === true)); } //# sourceMappingURL=fetch-and-extract.js.map