link-preview-js
Version:
Javascript module to extract and fetch HTTP link information from blocks of text.
383 lines (382 loc) • 16.1 kB
JavaScript
;
// Helper that runs a generator function as a promise-returning routine; it has the
// shape of the helper the TypeScript compiler emits for down-levelled async functions.
// An existing `this.__awaiter` is reused when present; otherwise it is defined here.
//   thisArg / _arguments: applied to `generator` when it is started.
//   P: promise constructor to use; falls back to the global Promise when falsy.
//   generator: generator function whose yielded values are awaited.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wrap a value that is not already an instance of P in a P resolved with it, so `.then` is always available.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with a fulfilled value; anything it throws rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Throw a rejection reason into the generator so its own try/catch can observe it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Resolve once the generator is done, otherwise wait for the yielded value and continue.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// Instantiate the generator (rebinding `generator` to the iterator) and take the first step.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.getLinkPreview = getLinkPreview;
exports.getPreviewFromContent = getPreviewFromContent;
const cheerio_1 = require("cheerio");
const constants_1 = require("./constants");
/**
 * Rejects addresses that match the configured loopback pattern.
 * @param address value tested against CONSTANTS.REGEX_LOOPBACK
 * @throws Error when the pattern matches the address
 */
function throwOnLoopback(address) {
  const matchesLoopback = constants_1.CONSTANTS.REGEX_LOOPBACK.test(address);
  if (!matchesLoopback) {
    return;
  }
  throw new Error("SSRF request detected, trying to query host");
}
/**
 * Selects the <meta> elements whose attribute `attr` equals `type`.
 * @param doc selector function (called with a CSS selector string)
 * @param type attribute value to match, e.g. `og:image`
 * @param attr attribute name to match on, e.g. `property` or `name`
 * @returns the selection when it has a truthy length, otherwise null
 */
function metaTag(doc, type, attr) {
  const selector = `meta[${attr}='${type}']`;
  const matches = doc(selector);
  if (matches.length) {
    return matches;
  }
  return null;
}
/**
 * Reads the `content` attribute of the <meta> selection whose `attr` equals `type`.
 * @param doc selector function (called with a CSS selector string)
 * @param type attribute value to match
 * @param attr attribute name to match on
 * @returns whatever the selection's `attr("content")` call returns
 */
function metaTagContent(doc, type, attr) {
  const selector = `meta[${attr}='${type}']`;
  const selection = doc(selector);
  return selection.attr(`content`);
}
/**
 * Picks the page title: the og:title meta tag (by `property`, then by `name`),
 * falling back to the text of `head > title`.
 * @param doc selector function for the parsed document
 * @returns the first truthy og:title value, otherwise the <title> text
 */
function getTitle(doc) {
  const openGraphTitle = metaTagContent(doc, `og:title`, `property`) || metaTagContent(doc, `og:title`, `name`);
  if (openGraphTitle) {
    return openGraphTitle;
  }
  return doc(`head > title`).text();
}
/**
 * Reads the og:site_name meta tag, preferring the `property` form over the `name` form.
 * @param doc selector function for the parsed document
 * @returns the first truthy value, otherwise the result of the `name` lookup
 */
function getSiteName(doc) {
  const byProperty = metaTagContent(doc, `og:site_name`, `property`);
  if (byProperty) {
    return byProperty;
  }
  return metaTagContent(doc, `og:site_name`, `name`);
}
/**
 * Reads the author: `meta[name='author']` first, then `meta[property='article:author']`.
 * @param doc selector function for the parsed document
 * @returns the first truthy value, otherwise the result of the second lookup
 */
function getAuthor(doc) {
  const namedAuthor = metaTagContent(doc, `author`, `name`);
  if (namedAuthor) {
    return namedAuthor;
  }
  return metaTagContent(doc, `article:author`, `property`);
}
/**
 * Reads the page description, trying in order: `name=description`,
 * `name=Description`, then `property=og:description`.
 * @param doc selector function for the parsed document
 * @returns the first truthy value, otherwise the result of the last lookup
 */
function getDescription(doc) {
  const lookups = [
    [`description`, `name`],
    [`Description`, `name`],
    [`og:description`, `property`],
  ];
  let description;
  for (const [type, attr] of lookups) {
    description = metaTagContent(doc, type, attr);
    if (description) {
      break;
    }
  }
  return description;
}
/**
 * Determines the media type of the page.
 * A `meta[name='medium']` tag wins when present (its `image` value is reported
 * as `photo`); otherwise og:type is read by `property`, then by `name`.
 * @param doc selector function for the parsed document
 * @returns the medium/og:type value, or a falsy value when none is found
 */
function getMediaType(doc) {
  const mediumNode = metaTag(doc, `medium`, `name`);
  if (!mediumNode) {
    const typeByProperty = metaTagContent(doc, `og:type`, `property`);
    return typeByProperty || metaTagContent(doc, `og:type`, `name`);
  }
  const medium = mediumNode.attr(`content`);
  if (medium === `image`) {
    return `photo`;
  }
  return medium;
}
/**
 * Collects absolute image URLs for the page.
 *
 * Lookup order:
 *  1. `<prefix>:image` meta tags (by `property`, then by `name`), where the
 *     prefix is `imagesPropertyType` or `og` when that is null/undefined;
 *  2. only when nothing was found and no `imagesPropertyType` was requested:
 *     `link[rel=image_src]`, and failing that every distinct `<img src>`.
 *
 * @param doc selector function for the parsed document
 * @param rootUrl base URL used to resolve relative image URLs
 * @param imagesPropertyType optional meta-tag prefix that replaces `og`
 * @returns array of absolute image URLs (possibly empty)
 * @throws TypeError from `new URL` when a value cannot be resolved against rootUrl
 */
function getImages(doc, rootUrl, imagesPropertyType) {
  const propertyPrefix = imagesPropertyType !== null && imagesPropertyType !== undefined ? imagesPropertyType : `og`;
  const images = [];
  const metaNodes = metaTag(doc, `${propertyPrefix}:image`, `property`) ||
    metaTag(doc, `${propertyPrefix}:image`, `name`);
  if (metaNodes) {
    metaNodes.each((_, node) => {
      if (node.type === `tag` && node.attribs.content) {
        images.push(new URL(node.attribs.content, rootUrl).href);
      }
    });
  }
  // An explicit property type restricts the search to meta tags only.
  if (images.length > 0 || imagesPropertyType) {
    return images;
  }
  const linkedImage = doc(`link[rel=image_src]`).attr(`href`);
  if (linkedImage) {
    return [new URL(linkedImage, rootUrl).href];
  }
  const imgNodes = doc(`img`);
  if (!imgNodes || !imgNodes.length) {
    return images;
  }
  // A Set (rather than a plain object) keeps sources such as "constructor"
  // or "__proto__" from being mistaken for already-seen entries.
  const seenSources = new Set();
  imgNodes.each((_, node) => {
    // Scoped per node so a non-tag node never reuses the previous node's src.
    const src = node.type === `tag` ? node.attribs.src : undefined;
    if (src && !seenSources.has(src)) {
      seenSources.add(src);
      images.push(new URL(src, rootUrl).href);
    }
  });
  return images;
}
/**
 * Collects the videos advertised through og:video meta tags.
 *
 * The i-th og:video tag is paired with the i-th og:video:type and
 * og:video:secure_url tags (when present); width and height come from the
 * first og:video:width / og:video:height tag and are shared by every entry.
 * Entries whose type starts with `video/` are placed at the front of the list.
 *
 * @param doc selector function for the parsed document
 * @returns array of `{ url, secureUrl, type, width, height }` objects; a field is
 *          undefined when the matching tag is missing
 */
function getVideos(doc) {
  const videos = [];
  const nodes = metaTag(doc, `og:video`, `property`) || metaTag(doc, `og:video`, `name`);
  if (!nodes || !nodes.length) {
    return videos;
  }
  const typeNodes = metaTag(doc, `og:video:type`, `property`) || metaTag(doc, `og:video:type`, `name`);
  const secureUrlNodes = metaTag(doc, `og:video:secure_url`, `property`) ||
    metaTag(doc, `og:video:secure_url`, `name`);
  const width = metaTagContent(doc, `og:video:width`, `property`) ||
    metaTagContent(doc, `og:video:width`, `name`);
  const height = metaTagContent(doc, `og:video:height`, `property`) ||
    metaTagContent(doc, `og:video:height`, `name`);
  // Content of a tag node, or undefined when the node is missing or not a tag.
  const contentOf = (node) => (node && node.type === `tag` ? node.attribs.content : undefined);
  for (let index = 0; index < nodes.length; index += 1) {
    // Every value is derived per iteration so a video with no matching
    // type/secure_url tag does not inherit the previous video's values.
    const type = contentOf(typeNodes ? typeNodes[index] : undefined);
    const videoObj = {
      url: contentOf(nodes[index]),
      secureUrl: contentOf(secureUrlNodes ? secureUrlNodes[index] : undefined),
      type,
      width,
      height,
    };
    if (type && type.indexOf(`video/`) === 0) {
      videos.unshift(videoObj);
    }
    else {
      videos.push(videoObj);
    }
  }
  return videos;
}
/**
 * Builds the conventional favicon location for a URL.
 * @param rootUrl absolute URL whose origin is used
 * @returns `/favicon.ico` resolved against rootUrl
 */
function getDefaultFavicon(rootUrl) {
  const faviconUrl = new URL(`/favicon.ico`, rootUrl);
  return faviconUrl.href;
}
/**
 * Collects absolute favicon URLs from the document's icon <link> tags
 * (`rel=icon`, `rel="shortcut icon"`, `rel=apple-touch-icon`, in that order).
 * @param doc selector function for the parsed document
 * @param rootUrl base URL used to resolve relative hrefs
 * @returns array of favicon URLs; when no icon link carries an href, a single
 *          entry pointing at the default `/favicon.ico` location
 * @throws TypeError from `new URL` when an href cannot be resolved against rootUrl
 */
function getFavicons(doc, rootUrl) {
  const images = [];
  const relSelectors = [`rel=icon`, `rel="shortcut icon"`, `rel=apple-touch-icon`];
  relSelectors.forEach((relSelector) => {
    const nodes = doc(`link[${relSelector}]`);
    if (!nodes.length) {
      return;
    }
    nodes.each((_, node) => {
      // Scoped per node so a non-tag node never re-pushes the previous href.
      const href = node.type === `tag` ? node.attribs.href : undefined;
      if (href) {
        images.push(new URL(href, rootUrl).href);
      }
    });
  });
  // if no icon images, use default favicon location
  if (images.length === 0) {
    images.push(getDefaultFavicon(rootUrl));
  }
  return images;
}
/**
 * Builds the preview object for a response whose content type is an image.
 * @param url URL of the response
 * @param contentType media type reported for the response
 * @returns `{ url, mediaType: "image", contentType, favicons }` with the default favicon
 */
function parseImageResponse(url, contentType) {
  const favicons = [getDefaultFavicon(url)];
  return { url, mediaType: `image`, contentType, favicons };
}
/**
 * Builds the preview object for a response whose content type is audio.
 * @param url URL of the response
 * @param contentType media type reported for the response
 * @returns `{ url, mediaType: "audio", contentType, favicons }` with the default favicon
 */
function parseAudioResponse(url, contentType) {
  const favicons = [getDefaultFavicon(url)];
  return { url, mediaType: `audio`, contentType, favicons };
}
/**
 * Builds the preview object for a response whose content type is video.
 * @param url URL of the response
 * @param contentType media type reported for the response
 * @returns `{ url, mediaType: "video", contentType, favicons }` with the default favicon
 */
function parseVideoResponse(url, contentType) {
  const favicons = [getDefaultFavicon(url)];
  return { url, mediaType: `video`, contentType, favicons };
}
/**
 * Builds the preview object for a response whose content type is application/*.
 * @param url URL of the response
 * @param contentType media type reported for the response
 * @returns `{ url, mediaType: "application", contentType, favicons }` with the default favicon
 */
function parseApplicationResponse(url, contentType) {
  const favicons = [getDefaultFavicon(url)];
  return { url, mediaType: `application`, contentType, favicons };
}
/**
 * Parses an HTML body and builds the preview object from its meta tags.
 * @param body HTML text handed to cheerio's `load`
 * @param url URL of the response; also the base for resolving relative links
 * @param options optional settings; `imagesPropertyType` and `onResponse` are read here.
 *        Both `undefined` and `null` are treated as "no options".
 * @param contentType media type reported for the response
 * @returns the preview object, or whatever `options.onResponse` returns when it is provided
 * @throws Error when `options.onResponse` is set but is not a function
 */
function parseTextResponse(body, url, options = {}, contentType) {
  // The default parameter only covers `undefined`; callers may still pass `null`.
  const opts = options === null ? {} : options;
  // Validate up front so a bad option is reported before any parsing work.
  if (opts.onResponse && typeof opts.onResponse !== `function`) {
    throw new Error(`link-preview-js onResponse option must be a function`);
  }
  const doc = (0, cheerio_1.load)(body);
  const response = {
    url,
    title: getTitle(doc),
    siteName: getSiteName(doc),
    description: getDescription(doc),
    author: getAuthor(doc),
    mediaType: getMediaType(doc) || `website`,
    contentType,
    images: getImages(doc, url, opts.imagesPropertyType),
    videos: getVideos(doc),
    favicons: getFavicons(doc, url),
  };
  if (!opts.onResponse) {
    return response;
  }
  // send in a cloned response (to avoid mutation of original response reference)
  const clonedResponse = structuredClone(response);
  return opts.onResponse(clonedResponse, doc, new URL(url));
}
/**
 * Handles a response of unknown content type by parsing it the same way as text.
 * @param body response body
 * @param url URL of the response
 * @param options optional parsing options, forwarded unchanged
 * @param contentType media type reported for the response, forwarded unchanged
 * @returns the result of parseTextResponse for the same arguments
 */
function parseUnknownResponse(body, url, options = {}, contentType) {
  const preview = parseTextResponse(body, url, options, contentType);
  return preview;
}
/**
 * Dispatches a normalized response to the parser matching its content type.
 *
 * The `content-type` header is split into its media type and parameters. The
 * `charset` parameter (name matched case-insensitively, surrounding whitespace
 * and double quotes removed) is added to the result as `charset`, or null when
 * absent. Without a content-type header the body is parsed as unknown content
 * and no `charset` key is added.
 *
 * @param response object with `url`, `headers` (keyed by `content-type`) and `data`
 * @param options parsing options forwarded to the text/unknown parsers
 * @returns the preview object produced by the selected parser
 * @throws Error wrapping any failure; the original error is kept on `cause`
 */
function parseResponse(response, options) {
  try {
    const rawContentType = response.headers[`content-type`];
    if (!rawContentType) {
      return parseUnknownResponse(response.data, response.url, options);
    }
    const tokens = rawContentType.split(`;`);
    const contentType = tokens[0].trim();
    let charset = null;
    // Parameters follow the media type; the last charset parameter wins.
    for (const token of tokens.slice(1)) {
      const separator = token.indexOf(`=`);
      if (separator === -1) {
        continue;
      }
      const name = token.slice(0, separator).trim().toLowerCase();
      if (name === `charset`) {
        // Parameter values may be sent as quoted strings: charset="utf-8".
        charset = token.slice(separator + 1).trim().replace(/^"(.*)"$/, `$1`);
      }
    }
    // parse response depending on content type
    let parsed;
    if (constants_1.CONSTANTS.REGEX_CONTENT_TYPE_IMAGE.test(contentType)) {
      parsed = parseImageResponse(response.url, contentType);
    }
    else if (constants_1.CONSTANTS.REGEX_CONTENT_TYPE_AUDIO.test(contentType)) {
      parsed = parseAudioResponse(response.url, contentType);
    }
    else if (constants_1.CONSTANTS.REGEX_CONTENT_TYPE_VIDEO.test(contentType)) {
      parsed = parseVideoResponse(response.url, contentType);
    }
    else if (constants_1.CONSTANTS.REGEX_CONTENT_TYPE_TEXT.test(contentType)) {
      parsed = parseTextResponse(response.data, response.url, options, contentType);
    }
    else if (constants_1.CONSTANTS.REGEX_CONTENT_TYPE_APPLICATION.test(contentType)) {
      parsed = parseApplicationResponse(response.url, contentType);
    }
    else {
      parsed = parseUnknownResponse(response.data, response.url, options);
    }
    return Object.assign({}, parsed, { charset });
  }
  catch (e) {
    const wrapped = new Error(`link-preview-js could not fetch link information ${e.toString()}`);
    // Keep the original error reachable for callers that need the stack or type.
    wrapped.cause = e;
    throw wrapped;
  }
}
/**
 * Parses the text, extracts the first link it finds and does an HTTP request
 * to fetch the website content; afterwards it parses the returned HTML
 * and extracts the information via meta tags.
 *
 * The abort timer (`options.timeout`, default 3000 ms) covers the initial fetch
 * and, in manual-redirect mode, the follow-up fetch; it is always cleared once
 * that phase ends, whether it succeeded or threw.
 *
 * @param text string, text to be parsed
 * @param options ILinkPreviewOptions; fields read here: followRedirects,
 *        handleRedirects, resolveDNSHost, timeout, headers, proxyUrl
 * @returns promise of the parsed preview (result of parseResponse)
 * @throws Error when no URL is found, when manual redirects lack a handler,
 *         when the handler refuses a redirect, or `Request timeout` on abort
 */
function getLinkPreview(text, options) {
  return __awaiter(this, void 0, void 0, function* () {
    if (!text || typeof text !== `string`) {
      throw new Error(`link-preview-js did not receive a valid url or text`);
    }
    const detectedUrl = text
      .replace(/\n/g, ` `)
      .split(` `)
      .find((token) => constants_1.CONSTANTS.REGEX_VALID_URL.test(token));
    if (!detectedUrl) {
      throw new Error(`link-preview-js did not receive a valid a url or text`);
    }
    // Treat null/undefined options as "no options" for the reads below.
    const opts = options !== null && options !== undefined ? options : {};
    if (opts.followRedirects === `manual` && !opts.handleRedirects) {
      throw new Error(`link-preview-js followRedirects is set to manual, but no handleRedirects function was provided`);
    }
    if (opts.resolveDNSHost) {
      const resolvedUrl = yield opts.resolveDNSHost(detectedUrl);
      throwOnLoopback(resolvedUrl);
    }
    else {
      console.error("[link-preview-js] You are not resolving DNS addresses (resolveDNSHost option) before fetching a link. This can cause loopback attacks. Always try to resolve DNS addresses");
    }
    const timeout = opts.timeout !== null && opts.timeout !== undefined ? opts.timeout : 3000; // 3 second timeout default
    const controller = new AbortController();
    const timeoutCounter = setTimeout(() => controller.abort(), timeout);
    const fetchOptions = {
      headers: opts.headers !== null && opts.headers !== undefined ? opts.headers : {},
      redirect: opts.followRedirects !== null && opts.followRedirects !== undefined ? opts.followRedirects : `error`,
      signal: controller.signal,
    };
    const fetchUrl = opts.proxyUrl ? opts.proxyUrl.concat(detectedUrl) : detectedUrl;
    // Shared by both fetches so an abort is always reported as a timeout.
    const fetchWithTimeout = (url) => fetch(url, fetchOptions).catch((e) => {
      if (e.name === `AbortError`) {
        throw new Error(`Request timeout`);
      }
      throw e;
    });
    let response;
    try {
      response = yield fetchWithTimeout(fetchUrl);
      if (response.status > 300 &&
        response.status < 309 &&
        fetchOptions.redirect === `manual` &&
        opts.handleRedirects) {
        const locationHeader = response.headers.get(`location`) || ``;
        const isAbsoluteURI = locationHeader.startsWith("http://") || locationHeader.startsWith("https://");
        // Resolve the URL, handling both absolute and relative URLs
        const forwardedUrl = isAbsoluteURI ? locationHeader : new URL(locationHeader, fetchUrl).href;
        if (!opts.handleRedirects(fetchUrl, forwardedUrl)) {
          throw new Error(`link-preview-js could not handle redirect`);
        }
        if (opts.resolveDNSHost) {
          const resolvedUrl = yield opts.resolveDNSHost(forwardedUrl);
          throwOnLoopback(resolvedUrl);
        }
        response = yield fetchWithTimeout(forwardedUrl);
      }
    }
    finally {
      // Always release the abort timer, including on the error paths above.
      clearTimeout(timeoutCounter);
    }
    const headers = {};
    response.headers.forEach((header, key) => {
      headers[key] = header;
    });
    const normalizedResponse = {
      url: opts.proxyUrl ? response.url.replace(opts.proxyUrl, ``) : response.url,
      headers,
      data: yield response.text(),
    };
    return parseResponse(normalizedResponse, options);
  });
}
/**
 * Parses a response you fetched yourself instead of letting the library do
 * the request: the object is validated and handed to the internal parser.
 * @param response Preview Response; must be an object with a truthy `url`
 * @param options IPreviewLinkOptions, forwarded to the parser
 * @returns promise of the parsed preview
 * @throws Error when `response` is not an object or has no `url`
 */
function getPreviewFromContent(response, options) {
  return __awaiter(this, void 0, void 0, function* () {
    const isUsable = Boolean(response) && typeof response === `object` && Boolean(response.url);
    if (!isUsable) {
      throw new Error(`link-preview-js did not receive a valid response object`);
    }
    return parseResponse(response, options);
  });
}