UNPKG

open-graph-scraper

Version:

Node.js scraper module for Open Graph and Twitter Card info

110 lines (109 loc) 4.34 kB
"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.default = requestAndResultsFormatter; const undici_1 = require("undici"); const iconv_lite_1 = require("iconv-lite"); const cheerio_1 = require("cheerio"); const chardet_1 = __importDefault(require("chardet")); /** * checks if an element exists */ const doesElementExist = (selector, attribute, $) => ($(selector).attr(attribute) && ($(selector).attr(attribute)?.length ?? 0) > 0); /** * gets the charset of the html */ function getCharset(body, buffer, $) { if (doesElementExist('meta', 'charset', $)) { return $('meta').attr('charset'); } if (doesElementExist('head > meta[name="charset"]', 'content', $)) { return $('head > meta[name="charset"]').attr('content'); } if (doesElementExist('head > meta[http-equiv="content-type"]', 'content', $)) { const content = $('head > meta[http-equiv="content-type"]').attr('content') ?? ''; const charsetRegEx = /charset=([^()<>@,;:"/[\]?.=\s]*)/i; if (charsetRegEx.test(content)) { const charsetRegExExec = charsetRegEx.exec(content); if (charsetRegExExec?.[1]) return charsetRegExExec[1]; } } if (body) { return chardet_1.default.detect(Buffer.from(buffer)); } return 'utf-8'; } /** * performs the fetch request and formats the body for ogs * * @param {object} options - options for ogs * @return {object} formatted request body and response * */ async function requestAndResultsFormatter(options) { let body; let response; try { // eslint-disable-next-line no-control-regex const isLatin1 = /^[\u0000-\u00ff]{0,}$/; let url = options.url ?? ''; if (!isLatin1.test(url)) url = encodeURI(url); response = await (0, undici_1.fetch)(url ?? '', { signal: AbortSignal.timeout((options.timeout ?? 10) * 1000), ...options.fetchOptions, headers: { Origin: url ?? '', Accept: 'text/html', ...options.fetchOptions?.headers }, }); const bodyArrayBuffer = await response.arrayBuffer(); const bodyText = Buffer.from(bodyArrayBuffer).toString('utf-8'); const charset = getCharset(bodyText, bodyArrayBuffer, (0, cheerio_1.load)(bodyText)) ?? 'utf-8'; if (charset.toLowerCase() === 'utf-8') { body = bodyText; } else { body = (0, iconv_lite_1.decode)(Buffer.from(bodyArrayBuffer), charset); } const contentType = response?.headers?.get('content-type')?.toLowerCase(); if (contentType && !contentType.includes('text/')) { throw new Error('Page must return a header content-type with text/'); } if (response?.status && (response.status.toString().startsWith('4') || response.status.toString().startsWith('5'))) { switch (response.status) { case 400: throw new Error('400 Bad Request'); case 401: throw new Error('401 Unauthorized'); case 403: throw new Error('403 Forbidden'); case 404: throw new Error('404 Not Found'); case 408: throw new Error('408 Request Timeout'); case 410: throw new Error('410 Gone'); case 500: throw new Error('500 Internal Server Error'); case 502: throw new Error('502 Bad Gateway'); case 503: throw new Error('503 Service Unavailable'); case 504: throw new Error('504 Gateway Timeout'); default: throw new Error('Server has returned a 400/500 error code'); } } if (body === undefined || body === '') { throw new Error('Page not found'); } } catch (error) { if (error instanceof Error && error.message === 'fetch failed') throw error.cause; throw error; } return { body, response }; }