UNPKG

link-preview-js

Version:

JavaScript module to extract and fetch HTTP link information from blocks of text.

383 lines (382 loc) 16.1 kB
"use strict";
// TypeScript-emitted helper that drives a generator function as an async
// function: each yielded value is awaited and its result (or rejection) is fed
// back into the generator.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.getLinkPreview = getLinkPreview;
exports.getPreviewFromContent = getPreviewFromContent;
const cheerio_1 = require("cheerio");
const constants_1 = require("./constants");

/**
 * Throws when the given address matches CONSTANTS.REGEX_LOOPBACK.
 * @param address value returned by the caller-supplied resolveDNSHost option
 * @throws Error when the address matches the loopback pattern
 */
function throwOnLoopback(address) {
    if (constants_1.CONSTANTS.REGEX_LOOPBACK.test(address)) {
        throw new Error("SSRF request detected, trying to query host");
    }
}

/**
 * Selects all `<meta>` nodes whose attribute `attr` equals `type`.
 * @returns the cheerio selection, or null when nothing matched
 */
function metaTag(doc, type, attr) {
    const nodes = doc(`meta[${attr}='${type}']`);
    return nodes.length ? nodes : null;
}

/**
 * Reads the `content` attribute of the first `<meta>` node whose attribute
 * `attr` equals `type` (undefined when there is no such node/attribute).
 */
function metaTagContent(doc, type, attr) {
    return doc(`meta[${attr}='${type}']`).attr(`content`);
}

/** Returns the og:title meta content, falling back to the `<head><title>` text. */
function getTitle(doc) {
    const ogTitle = metaTagContent(doc, `og:title`, `property`) ||
        metaTagContent(doc, `og:title`, `name`);
    return ogTitle || doc(`head > title`).text();
}

/** Returns the og:site_name meta content, if any. */
function getSiteName(doc) {
    return (metaTagContent(doc, `og:site_name`, `property`) ||
        metaTagContent(doc, `og:site_name`, `name`));
}

/** Returns the `author` meta content, falling back to `article:author`. */
function getAuthor(doc) {
    return (metaTagContent(doc, `author`, `name`) ||
        metaTagContent(doc, `article:author`, `property`));
}

/**
 * Returns the page description: `description`, then `Description`, then
 * `og:description`, whichever is found first.
 */
function getDescription(doc) {
    return (metaTagContent(doc, `description`, `name`) ||
        metaTagContent(doc, `Description`, `name`) ||
        metaTagContent(doc, `og:description`, `property`));
}

/**
 * Returns the media type advertised by the page. A `medium` meta tag wins
 * (with `image` reported as `photo`); otherwise og:type is used.
 */
function getMediaType(doc) {
    const node = metaTag(doc, `medium`, `name`);
    if (node) {
        const content = node.attr(`content`);
        return content === `image` ? `photo` : content;
    }
    return (metaTagContent(doc, `og:type`, `property`) ||
        metaTagContent(doc, `og:type`, `name`));
}

/**
 * Collects absolute image URLs for the page.
 *
 * Looks at `<prefix>:image` meta tags first (prefix defaults to `og`). Only
 * when none were found AND no explicit prefix was requested does it fall back
 * to `link[rel=image_src]`, and then to every `<img>` on the page.
 * @param doc cheerio document
 * @param rootUrl base used to resolve relative URLs
 * @param imagesPropertyType optional meta prefix to use instead of `og`
 * @returns array of absolute URLs (may be empty)
 */
function getImages(doc, rootUrl, imagesPropertyType) {
    const imagePropertyType = imagesPropertyType !== null && imagesPropertyType !== void 0
        ? imagesPropertyType
        : `og`;
    let images = [];
    const metaNodes = metaTag(doc, `${imagePropertyType}:image`, `property`) ||
        metaTag(doc, `${imagePropertyType}:image`, `name`);
    if (metaNodes) {
        metaNodes.each((_, node) => {
            if (node.type !== `tag`) {
                return;
            }
            const content = node.attribs.content;
            if (content) {
                images.push(new URL(content, rootUrl).href);
            }
        });
    }
    if (images.length <= 0 && !imagesPropertyType) {
        const linkSrc = doc(`link[rel=image_src]`).attr(`href`);
        if (linkSrc) {
            images = [new URL(linkSrc, rootUrl).href];
        }
        else {
            const imgNodes = doc(`img`);
            if (imgNodes && imgNodes.length) {
                // A Set (rather than a plain object) so that src values such as
                // "constructor" are not mistaken for already-seen entries.
                const seen = new Set();
                images = [];
                imgNodes.each((_, node) => {
                    if (node.type !== `tag`) {
                        return;
                    }
                    // De-duplication is done on the raw attribute value.
                    const src = node.attribs.src;
                    if (src && !seen.has(src)) {
                        seen.add(src);
                        images.push(new URL(src, rootUrl).href);
                    }
                });
            }
        }
    }
    return images;
}

/**
 * Collects og:video entries. The i-th og:video tag is paired with the i-th
 * og:video:type and og:video:secure_url tags; width/height come from the first
 * og:video:width / og:video:height tag and are shared by every entry.
 * Entries whose type starts with `video/` are placed at the front.
 * @returns array of `{ url, secureUrl, type, width, height }`
 */
function getVideos(doc) {
    const videos = [];
    const nodes = metaTag(doc, `og:video`, `property`) || metaTag(doc, `og:video`, `name`);
    if (!nodes) {
        return videos;
    }
    const typeNodes = metaTag(doc, `og:video:type`, `property`) ||
        metaTag(doc, `og:video:type`, `name`);
    const secureUrlNodes = metaTag(doc, `og:video:secure_url`, `property`) ||
        metaTag(doc, `og:video:secure_url`, `name`);
    const width = metaTagContent(doc, `og:video:width`, `property`) ||
        metaTagContent(doc, `og:video:width`, `name`);
    const height = metaTagContent(doc, `og:video:height`, `property`) ||
        metaTagContent(doc, `og:video:height`, `name`);
    for (let index = 0; index < nodes.length; index += 1) {
        const node = nodes[index];
        const typeNode = typeNodes ? typeNodes[index] : undefined;
        const secureUrlNode = secureUrlNodes ? secureUrlNodes[index] : undefined;
        // Values are computed per entry so that a video without its own
        // type / secure_url tag does not inherit the previous video's values.
        const url = node.type === `tag` ? node.attribs.content : undefined;
        const type = typeNode && typeNode.type === `tag` ? typeNode.attribs.content : undefined;
        const secureUrl = secureUrlNode && secureUrlNode.type === `tag`
            ? secureUrlNode.attribs.content
            : undefined;
        const videoObj = { url, secureUrl, type, width, height };
        if (type && type.startsWith(`video/`)) {
            videos.unshift(videoObj);
        }
        else {
            videos.push(videoObj);
        }
    }
    return videos;
}

// returns default favicon (//hostname/favicon.ico) for a url
function getDefaultFavicon(rootUrl) {
    return new URL(`/favicon.ico`, rootUrl).href;
}

/**
 * Returns an array of absolute favicon URLs taken from `<link>` tags with
 * rel=icon, rel="shortcut icon" or rel=apple-touch-icon (in that order).
 * When none are found, the default `/favicon.ico` location is returned.
 */
function getFavicons(doc, rootUrl) {
    const images = [];
    const relSelectors = [`rel=icon`, `rel="shortcut icon"`, `rel=apple-touch-icon`];
    relSelectors.forEach((relSelector) => {
        doc(`link[${relSelector}]`).each((_, node) => {
            if (node.type !== `tag`) {
                return;
            }
            const href = node.attribs.href;
            if (href) {
                images.push(new URL(href, rootUrl).href);
            }
        });
    });
    // if no icon images, use default favicon location
    if (images.length <= 0) {
        images.push(getDefaultFavicon(rootUrl));
    }
    return images;
}

// Shared shape for responses whose body is not parsed (image/audio/video/application).
function parseMediaResponse(url, mediaType, contentType) {
    return {
        url,
        mediaType,
        contentType,
        favicons: [getDefaultFavicon(url)],
    };
}
function parseImageResponse(url, contentType) {
    return parseMediaResponse(url, `image`, contentType);
}
function parseAudioResponse(url, contentType) {
    return parseMediaResponse(url, `audio`, contentType);
}
function parseVideoResponse(url, contentType) {
    return parseMediaResponse(url, `video`, contentType);
}
function parseApplicationResponse(url, contentType) {
    return parseMediaResponse(url, `application`, contentType);
}

/**
 * Parses an HTML body and builds the preview object from its meta tags.
 * @param body HTML text
 * @param url URL the body was fetched from; base for relative URLs
 * @param options may carry `imagesPropertyType` and an `onResponse` hook
 * @param contentType content type to report in the result
 * @returns the preview object, or whatever `options.onResponse` returns
 * @throws Error when `options.onResponse` is set but is not a function
 */
function parseTextResponse(body, url, options = {}, contentType) {
    // The default parameter only covers `undefined`; treat `null` the same way.
    const opts = options == null ? {} : options;
    const doc = (0, cheerio_1.load)(body);
    let response = {
        url,
        title: getTitle(doc),
        siteName: getSiteName(doc),
        description: getDescription(doc),
        author: getAuthor(doc),
        mediaType: getMediaType(doc) || `website`,
        contentType,
        images: getImages(doc, url, opts.imagesPropertyType),
        videos: getVideos(doc),
        favicons: getFavicons(doc, url),
    };
    if (opts.onResponse && typeof opts.onResponse !== `function`) {
        throw new Error(`link-preview-js onResponse option must be a function`);
    }
    if (opts.onResponse) {
        // send in a cloned response (to avoid mutation of original response reference)
        const clonedResponse = structuredClone(response);
        const urlObject = new URL(url);
        response = opts.onResponse(clonedResponse, doc, urlObject);
    }
    return response;
}

/** Responses of unknown type are parsed as HTML text. */
function parseUnknownResponse(body, url, options = {}, contentType) {
    return parseTextResponse(body, url, options, contentType);
}

/**
 * Dispatches on the response's `content-type` header and builds the preview.
 * When the header carries parameters (`type; charset=...`), the charset is
 * extracted and added to the result; without a header the body is parsed as
 * HTML and no `charset` key is added.
 * @param response `{ url, headers, data }`
 * @param options preview options, forwarded to the text parser
 * @throws Error wrapping any failure (original error available as `cause`)
 */
function parseResponse(response, options) {
    try {
        let contentType = response.headers[`content-type`];
        let charset = null;
        if (!contentType) {
            return parseUnknownResponse(response.data, response.url, options);
        }
        if (contentType.includes(`;`)) {
            const contentTypeTokens = contentType.split(`;`);
            contentType = contentTypeTokens[0];
            for (const token of contentTypeTokens) {
                if (token.indexOf("charset=") !== -1) {
                    charset = token.split("=")[1];
                }
            }
        }
        // parse response depending on content type
        let parsed;
        if (constants_1.CONSTANTS.REGEX_CONTENT_TYPE_IMAGE.test(contentType)) {
            parsed = parseImageResponse(response.url, contentType);
        }
        else if (constants_1.CONSTANTS.REGEX_CONTENT_TYPE_AUDIO.test(contentType)) {
            parsed = parseAudioResponse(response.url, contentType);
        }
        else if (constants_1.CONSTANTS.REGEX_CONTENT_TYPE_VIDEO.test(contentType)) {
            parsed = parseVideoResponse(response.url, contentType);
        }
        else if (constants_1.CONSTANTS.REGEX_CONTENT_TYPE_TEXT.test(contentType)) {
            parsed = parseTextResponse(response.data, response.url, options, contentType);
        }
        else if (constants_1.CONSTANTS.REGEX_CONTENT_TYPE_APPLICATION.test(contentType)) {
            parsed = parseApplicationResponse(response.url, contentType);
        }
        else {
            parsed = parseUnknownResponse(response.data, response.url, options);
        }
        return Object.assign({}, parsed, { charset });
    }
    catch (e) {
        // Keep the historical message but preserve the original error for debugging.
        throw new Error(`link-preview-js could not fetch link information ${e.toString()}`, { cause: e });
    }
}

// Performs a fetch and reports an aborted request as a "Request timeout" error.
function fetchWithTimeoutError(url, fetchOptions) {
    return fetch(url, fetchOptions).catch((e) => {
        if (e.name === `AbortError`) {
            throw new Error(`Request timeout`);
        }
        throw e;
    });
}

/**
 * Parses the text, extracts the first link it finds and does a HTTP request
 * to fetch the website content, afterwards it tries to parse the internal HTML
 * and extract the information via meta tags
 *
 * The abort timer (default 3000 ms) covers the initial request and, when
 * redirects are handled manually, the follow-up request; it is always cleared
 * before the body is read, including when one of those steps fails.
 * @param text string, text to be parsed
 * @param options ILinkPreviewOptions
 * @throws Error when no URL is found, when `followRedirects` is `manual`
 *         without `handleRedirects`, when the resolved host matches the
 *         loopback pattern, when a redirect is rejected, or on timeout
 */
function getLinkPreview(text, options) {
    return __awaiter(this, void 0, void 0, function* () {
        if (!text || typeof text !== `string`) {
            throw new Error(`link-preview-js did not receive a valid url or text`);
        }
        const detectedUrl = text
            .replace(/\n/g, ` `)
            .split(` `)
            .find((token) => constants_1.CONSTANTS.REGEX_VALID_URL.test(token));
        if (!detectedUrl) {
            throw new Error(`link-preview-js did not receive a valid a url or text`);
        }
        const opts = options == null ? {} : options;
        if (opts.followRedirects === `manual` && !opts.handleRedirects) {
            throw new Error(`link-preview-js followRedirects is set to manual, but no handleRedirects function was provided`);
        }
        if (opts.resolveDNSHost) {
            const resolvedUrl = yield opts.resolveDNSHost(detectedUrl);
            throwOnLoopback(resolvedUrl);
        }
        else {
            console.error("[link-preview-js] You are not resolving DNS addresses (resolveDNSHost option) before fetching a link. This can cause loopback attacks. Always try to resolve DNS addresses");
        }
        const timeout = opts.timeout != null ? opts.timeout : 3000; // 3 second timeout default
        const controller = new AbortController();
        const timeoutCounter = setTimeout(() => controller.abort(), timeout);
        const fetchOptions = {
            headers: opts.headers != null ? opts.headers : {},
            redirect: opts.followRedirects != null ? opts.followRedirects : `error`,
            signal: controller.signal,
        };
        const fetchUrl = opts.proxyUrl ? opts.proxyUrl.concat(detectedUrl) : detectedUrl;
        let response;
        try {
            response = yield fetchWithTimeoutError(fetchUrl, fetchOptions);
            if (response.status > 300 &&
                response.status < 309 &&
                fetchOptions.redirect === `manual` &&
                opts.handleRedirects) {
                const locationHeader = response.headers.get(`location`) || ``;
                const isAbsoluteURI = locationHeader.startsWith("http://") ||
                    locationHeader.startsWith("https://");
                // Resolve the URL, handling both absolute and relative URLs
                const forwardedUrl = isAbsoluteURI
                    ? locationHeader
                    : new URL(locationHeader, fetchUrl).href;
                if (!opts.handleRedirects(fetchUrl, forwardedUrl)) {
                    throw new Error(`link-preview-js could not handle redirect`);
                }
                // The redirect target gets the same DNS / loopback check as the original URL.
                if (opts.resolveDNSHost) {
                    const resolvedUrl = yield opts.resolveDNSHost(forwardedUrl);
                    throwOnLoopback(resolvedUrl);
                }
                response = yield fetchWithTimeoutError(forwardedUrl, fetchOptions);
            }
        }
        finally {
            // Always release the timer, whether the requests succeeded or threw.
            clearTimeout(timeoutCounter);
        }
        const headers = {};
        response.headers.forEach((header, key) => {
            headers[key] = header;
        });
        const normalizedResponse = {
            // Strip the proxy prefix so the reported URL is the target's own.
            url: opts.proxyUrl ? response.url.replace(opts.proxyUrl, ``) : response.url,
            headers,
            data: yield response.text(),
        };
        return parseResponse(normalizedResponse, options);
    });
}

/**
 * Skip the library fetching the website for you, instead pass a response object
 * from whatever source you get and use the internal parsing of the HTML to return
 * the necessary information
 * @param response Preview Response; must be an object with a truthy `url`
 * @param options IPreviewLinkOptions
 * @throws Error when `response` is not an object or has no `url`
 */
function getPreviewFromContent(response, options) {
    return __awaiter(this, void 0, void 0, function* () {
        if (!response || typeof response !== `object`) {
            throw new Error(`link-preview-js did not receive a valid response object`);
        }
        if (!response.url) {
            throw new Error(`link-preview-js did not receive a valid response object`);
        }
        return parseResponse(response, options);
    });
}