// n8n
// Version:
// n8n Workflow Automation Tool
// 249 lines • 9.13 kB
// JavaScript
;
// TypeScript interop helper: re-exports property `key` of `source` on `target`
// under `alias` (or `key` when no alias is given). An existing descriptor is
// reused only when it is an accessor on an ES module, or a data property that
// is neither writable nor configurable; otherwise a live getter is installed.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function (target, source, key, alias) {
    var exportName = alias === undefined ? key : alias;
    var descriptor = Object.getOwnPropertyDescriptor(source, key);
    var needsGetter;
    if (!descriptor) {
        needsGetter = true;
    }
    else if ("get" in descriptor) {
        needsGetter = !source.__esModule;
    }
    else {
        needsGetter = descriptor.writable || descriptor.configurable;
    }
    if (needsGetter) {
        descriptor = { enumerable: true, get: function () { return source[key]; } };
    }
    Object.defineProperty(target, exportName, descriptor);
}) : (function (target, source, key, alias) {
    // Fallback for runtimes without Object.create: plain value copy.
    target[alias === undefined ? key : alias] = source[key];
}));
// TypeScript interop helper: stores `value` as the enumerable `default`
// property of `target`.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function (target, value) {
    var descriptor = { enumerable: true, value: value };
    Object.defineProperty(target, "default", descriptor);
}) : function (target, value) {
    // Fallback for runtimes without Object.create: plain assignment.
    target.default = value;
});
// TypeScript interop helper for `import * as ns`: ES modules are returned
// unchanged; any other value is wrapped in a fresh namespace object that
// re-exports every own property except "default" and exposes the original
// value as `default`.
var __importStar = (this && this.__importStar) || (function () {
    // Resolved on first use, then reused for every later call.
    var resolvedOwnKeys;
    function ownKeys(obj) {
        if (!resolvedOwnKeys) {
            resolvedOwnKeys = Object.getOwnPropertyNames || function (candidate) {
                var names = [];
                for (var name in candidate) {
                    if (Object.prototype.hasOwnProperty.call(candidate, name)) {
                        names.push(name);
                    }
                }
                return names;
            };
        }
        return resolvedOwnKeys(obj);
    }
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            var keys = ownKeys(mod);
            for (var idx = 0; idx < keys.length; idx++) {
                if (keys[idx] !== "default") {
                    __createBinding(namespace, mod, keys[idx]);
                }
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
// TypeScript interop helper for default imports: ES modules pass through
// unchanged, anything else is wrapped as `{ default: mod }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.fetchAndExtract = fetchAndExtract;
const turndown_plugin_gfm_1 = require("@joplin/turndown-plugin-gfm");
const readability_1 = require("@mozilla/readability");
const linkedom_1 = require("linkedom");
const turndown_1 = __importDefault(require("turndown"));
const undici_1 = require("undici");
// Request timeout (ms) used by fetchAndExtract when options.timeoutMs is not supplied.
const DEFAULT_TIMEOUT_MS = 30_000;
// Upper bound (ms) for the effective timeout; larger caller values are clamped to this.
const MAX_TIMEOUT_MS = 120_000;
// Byte budget for the response body (5 * 1024 * 1024) used when
// options.maxResponseBytes is not supplied. Despite the name it acts as a
// default: a caller-provided value is not clamped to it.
const MAX_RESPONSE_BYTES = 5 * 1024 * 1024;
// Limit on the extracted content, compared against String.prototype.length,
// used when options.maxContentLength is not supplied.
const DEFAULT_MAX_CONTENT_LENGTH = 30_000;
// Maximum number of redirects fetchAndExtract follows before throwing.
const MAX_REDIRECTS = 10;
/**
 * Fetches `url` and extracts its content as text.
 *
 * Redirects are followed manually (at most MAX_REDIRECTS hops) so that every
 * hop is checked: `ssrf.validateUrl` runs before each request, and each
 * redirect target goes through `ssrf.validateRedirectSync` and, when provided,
 * `options.authorizeUrl` before it is requested. Each request uses its own
 * undici Agent whose DNS lookup comes from `ssrf.createSecureLookup()`.
 *
 * The body is dispatched on the `content-type` header: `application/pdf` to
 * extractPdf, `text/plain` / `text/markdown` to extractPlainText, everything
 * else to extractHtml.
 *
 * @param {string} url - URL to fetch.
 * @param {object} options
 * @param {object} options.ssrf - Must provide `validateUrl`, `validateRedirectSync` and `createSecureLookup`.
 * @param {Function} [options.authorizeUrl] - Awaited with each redirect target before following it.
 * @param {number} [options.maxContentLength] - Defaults to DEFAULT_MAX_CONTENT_LENGTH.
 * @param {number} [options.maxResponseBytes] - Defaults to MAX_RESPONSE_BYTES.
 * @param {number} [options.timeoutMs] - Defaults to DEFAULT_TIMEOUT_MS, clamped to MAX_TIMEOUT_MS.
 * @returns {Promise<object>} `{ url, finalUrl, title, content, truncated, contentLength }`
 *   (the extractors may add further fields). For a non-2xx final response,
 *   `content` is `HTTP <status>: <statusText>` and `contentLength` is 0.
 * @throws `validation.error` when `ssrf.validateUrl` reports `ok: false`; an
 *   `Error` when more than MAX_REDIRECTS redirects are encountered; errors from
 *   `fetch`, `validateRedirectSync`, `authorizeUrl` and the extractors propagate.
 */
async function fetchAndExtract(url, options) {
    const maxContentLength = options.maxContentLength ?? DEFAULT_MAX_CONTENT_LENGTH;
    const maxResponseBytes = options.maxResponseBytes ?? MAX_RESPONSE_BYTES;
    const timeoutMs = Math.min(options.timeoutMs ?? DEFAULT_TIMEOUT_MS, MAX_TIMEOUT_MS);
    const { authorizeUrl, ssrf } = options;
    let currentUrl = url;
    let response;
    let redirectCount = 0;
    while (redirectCount <= MAX_REDIRECTS) {
        const validation = await ssrf.validateUrl(currentUrl);
        if (!validation.ok) {
            throw validation.error;
        }
        const dispatcher = new undici_1.Agent({ connect: { lookup: ssrf.createSecureLookup() } });
        const controller = new AbortController();
        const timeout = setTimeout(() => controller.abort(), timeoutMs);
        try {
            response = await fetch(currentUrl, {
                signal: controller.signal,
                headers: {
                    'User-Agent': 'n8n-instance-ai/1.0 (content extraction)',
                    Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,text/plain;q=0.8,application/pdf;q=0.7,*/*;q=0.5',
                },
                // Redirects are handled below so every hop can be validated.
                redirect: 'manual',
                dispatcher,
            });
        }
        finally {
            // NOTE: the timer is cleared as soon as fetch settles, so it does
            // not bound the time spent reading the body afterwards.
            clearTimeout(timeout);
            void dispatcher.close().catch(() => { });
        }
        if (response.status >= 300 && response.status < 400) {
            const location = response.headers.get('location');
            if (!location) {
                // A 3xx without a target is reported through the non-2xx path below.
                break;
            }
            // The redirect body is never read; cancel it so the connection is
            // released instead of waiting for garbage collection.
            discardResponseBody(response);
            redirectCount++;
            if (redirectCount > MAX_REDIRECTS) {
                throw new Error(`Too many redirects (max ${MAX_REDIRECTS})`);
            }
            currentUrl = new URL(location, currentUrl).href;
            ssrf.validateRedirectSync(currentUrl);
            if (authorizeUrl) {
                await authorizeUrl(currentUrl);
            }
            continue;
        }
        break;
    }
    const finalUrl = currentUrl;
    if (!response.ok) {
        // Only the status line is reported, so the body is not needed.
        discardResponseBody(response);
        return {
            url,
            finalUrl,
            title: '',
            content: `HTTP ${response.status}: ${response.statusText}`,
            truncated: false,
            contentLength: 0,
        };
    }
    const rawBody = await readLimitedBody(response, maxResponseBytes);
    const contentType = response.headers.get('content-type') ?? '';
    if (contentType.includes('application/pdf')) {
        return await extractPdf(url, finalUrl, rawBody, maxContentLength);
    }
    if (contentType.includes('text/plain') || contentType.includes('text/markdown')) {
        return extractPlainText(url, finalUrl, rawBody, maxContentLength);
    }
    return extractHtml(url, finalUrl, rawBody, maxContentLength);
}
/**
 * Best-effort cancellation of a response body that will not be read.
 * Does nothing when the response has no body; a failing cancel is ignored
 * because this is cleanup only and must not mask the caller's result.
 */
function discardResponseBody(response) {
    void response.body?.cancel().catch(() => { });
}
/**
 * Reads the response body into a Buffer, keeping at most `maxBytes` bytes.
 *
 * When the limit is exceeded the last chunk is cut so the result is exactly
 * `maxBytes` long, and the stream is cancelled so the remainder of the body is
 * not left pending. Truncation is not reported to the caller.
 *
 * @param {{ body: ReadableStream | null }} response - Object exposing a readable `body` stream.
 * @param {number} maxBytes - Maximum number of bytes to keep.
 * @returns {Promise<Buffer>} The (possibly truncated) body; empty when there is no body.
 * @throws Errors raised by `reader.read()` propagate; the reader lock is released either way.
 */
async function readLimitedBody(response, maxBytes) {
    if (!response.body) {
        return Buffer.alloc(0);
    }
    const chunks = [];
    let totalBytes = 0;
    let limitExceeded = false;
    const reader = response.body.getReader();
    try {
        for (;;) {
            const { done, value } = await reader.read();
            if (done) {
                break;
            }
            const chunk = Buffer.from(value);
            const remaining = maxBytes - totalBytes;
            totalBytes += chunk.length;
            if (chunk.length > remaining) {
                // Keep only the part of this chunk that still fits the budget.
                chunks.push(chunk.subarray(0, remaining));
                limitExceeded = true;
                break;
            }
            chunks.push(chunk);
        }
        if (limitExceeded) {
            // Stop the transfer instead of leaving the unread remainder open.
            // Cleanup only: the bytes already collected stay valid if it fails.
            await reader.cancel().catch(() => { });
        }
    }
    finally {
        reader.releaseLock();
    }
    return Buffer.concat(chunks);
}
/**
 * Extracts the main content of an HTML document as Markdown.
 *
 * The body is decoded as UTF-8, parsed with linkedom and passed to
 * Readability. When Readability yields an article, its HTML is converted with
 * Turndown; otherwise the text content of `<body>` is used as-is. The text is
 * cut to `maxContentLength` characters, while `contentLength` reports the
 * length before cutting. `safetyFlags` is attached only when
 * hasSafetyFlags() accepts the flags detected in the raw HTML.
 *
 * @param {string} url - Originally requested URL.
 * @param {string} finalUrl - URL after redirects.
 * @param {Buffer} body - Raw response body.
 * @param {number} maxContentLength - Maximum length of `content`.
 * @returns {object} `{ url, finalUrl, title, content, truncated, contentLength, safetyFlags? }`
 */
function extractHtml(url, finalUrl, body, maxContentLength) {
    const html = body.toString('utf-8');
    const parsed = (0, linkedom_1.parseHTML)(html);
    const doc = parsed.document;
    const safetyFlags = detectSafetyFlags(html);
    // Both outcomes share the same result shape and truncation rule.
    const toResult = (title, text) => {
        const contentLength = text.length;
        const truncated = contentLength > maxContentLength;
        const result = {
            url,
            finalUrl,
            title,
            content: truncated ? text.slice(0, maxContentLength) : text,
            truncated,
            contentLength,
        };
        if (hasSafetyFlags(safetyFlags)) {
            result.safetyFlags = safetyFlags;
        }
        return result;
    };
    const article = new readability_1.Readability(doc).parse();
    if (!article) {
        // Readability found no article: fall back to the plain body text.
        const fallbackText = doc.body?.textContent ?? '';
        return toResult(doc.title ?? '', fallbackText);
    }
    const markdown = createTurndownService().turndown(article.content ?? '');
    return toResult(article.title ?? '', markdown);
}
/**
 * Extracts the text of a PDF document using `pdf-parse`, which is loaded
 * lazily on first use.
 *
 * The document title is taken from the PDF info's `Title` field when it is a
 * string. The parser is destroyed whether or not text extraction succeeds.
 *
 * @param {string} url - Originally requested URL.
 * @param {string} finalUrl - URL after redirects.
 * @param {Buffer} body - Raw PDF bytes.
 * @param {number} maxContentLength - Maximum length of `content`.
 * @returns {Promise<object>} `{ url, finalUrl, title, content, truncated, contentLength }`,
 *   where `contentLength` is the text length before truncation.
 * @throws Errors from `parser.getText()` or `parser.destroy()` propagate.
 */
async function extractPdf(url, finalUrl, body, maxContentLength) {
    const pdfModule = await Promise.resolve().then(() => __importStar(require('pdf-parse')));
    const parser = new pdfModule.PDFParse({ data: body });
    let title = '';
    let textResult;
    try {
        textResult = await parser.getText();
        try {
            const metadata = await parser.getInfo();
            const candidate = metadata.info?.Title;
            if (typeof candidate === 'string') {
                title = candidate;
            }
        }
        catch {
            // Title lookup is best-effort: a getInfo() failure leaves title as ''.
        }
    }
    finally {
        await parser.destroy();
    }
    const fullText = textResult.text;
    const contentLength = fullText.length;
    const truncated = contentLength > maxContentLength;
    const content = truncated ? fullText.slice(0, maxContentLength) : fullText;
    return { url, finalUrl, title, content, truncated, contentLength };
}
/**
 * Wraps a plain-text (or Markdown) body in the extraction result shape.
 *
 * The body is decoded as UTF-8 and cut to `maxContentLength` characters;
 * `contentLength` reports the length before cutting. The title is always ''.
 *
 * @param {string} url - Originally requested URL.
 * @param {string} finalUrl - URL after redirects.
 * @param {Buffer} body - Raw response body.
 * @param {number} maxContentLength - Maximum length of `content`.
 * @returns {object} `{ url, finalUrl, title, content, truncated, contentLength }`
 */
function extractPlainText(url, finalUrl, body, maxContentLength) {
    const decoded = body.toString('utf-8');
    const contentLength = decoded.length;
    const truncated = contentLength > maxContentLength;
    let content = decoded;
    if (truncated) {
        content = decoded.slice(0, maxContentLength);
    }
    return { url, finalUrl, title: '', content, truncated, contentLength };
}
/**
 * Builds the Turndown converter used for HTML-to-Markdown conversion:
 * ATX headings, fenced code blocks, and the GFM plugin enabled.
 *
 * @returns {object} A new, configured Turndown service instance.
 */
function createTurndownService() {
    const options = { headingStyle: 'atx', codeBlockStyle: 'fenced' };
    const service = new turndown_1.default(options);
    service.use(turndown_plugin_gfm_1.gfm);
    return service;
}
/**
 * Scans raw HTML for hints that the extracted content may be unreliable.
 *
 * - `jsRenderingSuspected`: a bare app mount point
 *   (`<div id="app|root|__next|__nuxt">`) together with a `<noscript` tag.
 * - `loginRequired`: an `action="…login…"` attribute, or a `meta` tag whose
 *   `url=` value mentions login / signin / auth.
 *
 * These are regex heuristics and can produce false positives or negatives.
 *
 * @param {string} html - Raw HTML source.
 * @returns {{ jsRenderingSuspected?: true, loginRequired?: true }} Only the flags that were set.
 */
function detectSafetyFlags(html) {
    const flags = {};
    const hasAppRoot = /<div\s+id=["'](?:app|root|__next|__nuxt)["']\s*>/i.test(html);
    const hasNoscript = /<noscript/i.test(html);
    if (hasAppRoot && hasNoscript) {
        flags.jsRenderingSuspected = true;
    }
    const hasLoginForm = /action=["'][^"']*login/i.test(html);
    // The keyword must appear inside the same tag as `url=`. Matching with
    // `.*` instead would run to the end of the line, so an unrelated later
    // tag (e.g. <meta name="author">) would be reported as a login redirect.
    const hasLoginRedirect = /meta[^>]+url=[^>]*(?:login|signin|auth)/i.test(html);
    if (hasLoginForm || hasLoginRedirect) {
        flags.loginRequired = true;
    }
    return flags;
}
/**
 * Tells whether a safety-flags object carries at least one raised flag.
 *
 * @param {object | undefined} flags - Flags object, or undefined.
 * @returns {boolean} True when `jsRenderingSuspected` or `loginRequired` is exactly `true`.
 */
function hasSafetyFlags(flags) {
    if (flags === undefined) {
        return false;
    }
    if (flags.jsRenderingSuspected === true) {
        return true;
    }
    return flags.loginRequired === true;
}
//# sourceMappingURL=fetch-and-extract.js.map