cloudflare-workers-unfurl
Version:
Unfurl URLs in Cloudflare Workers using HTMLRewriter
253 lines (226 loc) • 6.63 kB
JavaScript
/**
* @template Value
* @typedef {Object} GoodResult
* @property {true} ok - The success status.
* @property {Value} value - The data extracted from the URL.
*/
/**
* @template Error
* @typedef {Object} BadResult
* @property {false} ok - The success status.
* @property {Error} error - The error describing why the operation failed.
*/
/**
* @template Value, Error
* @typedef {GoodResult<Value> | BadResult<Error>} Result
*/
/**
* @typedef {Object} UnfurledData
* @property {string} [title] - The title extracted from the URL.
* @property {string} [description] - The description extracted from the URL.
* @property {string} [image] - The image URL extracted from the URL.
* @property {string} [favicon] - The favicon URL extracted from the URL.
*/
/**
* @typedef {'bad-param' | 'failed-fetch'} UnfurlError
*/
/**
 * Content types this module is willing to process.
 *
 * `unfurl` sends each entry as an `accept` request header value, and
 * `isValidContentType` prefix-matches response content types against them.
 * Note that the literal `image/*` never prefix-matches a concrete type such as
 * `image/png`; `isValidContentType` accepts images through its own `image/`
 * prefix check.
 * @type {string[]}
 */
const validContentTypes = [
"text/html",
"application/xhtml+xml",
"application/xml",
"image/*",
];
/**
 * Checks whether a response `content-type` header value is one we can unfurl.
 *
 * Media types are case-insensitive, so the comparison runs on a trimmed,
 * lower-cased copy of the value. Matching is by prefix, which means parameters
 * such as `; charset=utf-8` are ignored.
 *
 * @param {string} contentType - The raw header value; may be empty when the
 * server did not send one.
 * @returns {boolean} `true` when the type is missing, is any `image/` type, or
 * starts with one of `validContentTypes`.
 */
function isValidContentType(contentType) {
  // allow unspecified, try to parse it anyway
  if (!contentType) {
    return true;
  }
  const normalized = contentType.trim().toLowerCase();
  return (
    !normalized ||
    normalized.startsWith("image/") ||
    validContentTypes.some((valid) => normalized.startsWith(valid))
  );
}
/**
 * Unfurls a URL: fetches it and extracts link-preview metadata (title,
 * description, image and favicon).
 *
 * Failures are reported through the returned result instead of a rejection:
 * - `bad-param`: `url` is not a string starting with `http://` or `https://`.
 * - `failed-fetch`: the request threw, the response status was not ok, the
 *   content type is not supported, or processing the HTML threw.
 *
 * When the response is itself an image, `image` is `url` and `title` is the
 * last path segment of `url`.
 *
 * @param {string} url - The URL to unfurl.
 * @returns {Promise<Result<UnfurledData, UnfurlError>>} - A promise that resolves to
 * `{ ok: true, value }` with the extracted metadata, or `{ ok: false, error }` on failure.
 */
export async function unfurl(url) {
  if (typeof url !== "string" || !url.match(/^https?:\/\//)) {
    return { ok: false, error: "bad-param" };
  }

  // cloudflare has a built-in HTML parser/rewriter called HTMLRewriter. in order to use it, we
  // need to define classes that act as event handlers for certain elements, attributes, etc.
  // see https://developers.cloudflare.com/workers/runtime-apis/html-rewriter/
  const meta$ = new MetaExtractor();
  const title$ = new TextExtractor();
  const icon$ = new IconExtractor();

  // Base used to resolve relative image/favicon URLs. Swapped for the final
  // (post-redirect) URL once the response is available.
  let baseUrl = url;

  try {
    const headers = new Headers();
    for (const contentType of validContentTypes) {
      headers.append("accept", contentType);
    }
    // Some sites block requests that don't have a user agent.
    headers.append(
      "user-agent",
      "tldraw-bot/0.0.8 (+https://github.com/tldraw/cloudflare-workers-unfurl)"
    );

    const res = await fetch(url, { headers });
    // Media types are case-insensitive, so compare a lower-cased copy.
    const contentType = (res.headers.get("content-type") ?? "").toLowerCase();
    if (!res.ok || !isValidContentType(contentType)) {
      return { ok: false, error: "failed-fetch" };
    }

    if (contentType.startsWith("image/")) {
      return {
        ok: true,
        value: {
          image: url,
          title: new URL(url).pathname.split("/").pop() || undefined,
        },
      };
    }

    // Relative URLs in the document are relative to where we ended up, not to
    // where we started (e.g. after following a short-link redirect).
    baseUrl = res.url || url;

    await new HTMLRewriter()
      .on("meta", meta$)
      .on("title", title$)
      .on("link", icon$)
      .transform(res)
      .blob();
  } catch {
    return { ok: false, error: "failed-fetch" };
  }

  // we don't know exactly what we'll end up with, so this is a best-effort extraction
  const { og, twitter } = meta$;
  // The extractor's string starts out as "", so an absent or blank <title>
  // has to be mapped to undefined explicitly.
  const pageTitle = title$.string.trim() || undefined;
  const title = og["og:title"] ?? twitter["twitter:title"] ?? pageTitle;
  const description =
    og["og:description"] ??
    twitter["twitter:description"] ??
    meta$.description ??
    undefined;
  const image = toAbsoluteUrl(
    og["og:image:secure_url"] ?? og["og:image"] ?? twitter["twitter:image"],
    baseUrl
  );
  const favicon = toAbsoluteUrl(icon$.appleIcon ?? icon$.icon, baseUrl);

  return {
    ok: true,
    value: {
      title,
      description,
      image,
      favicon,
    },
  };
}

/**
 * Resolves a possibly-relative URL found in a document against `base`.
 *
 * Values that already start with `http://` or `https://` are returned as-is.
 * Missing values become `undefined` (an empty string is passed through), and
 * values that cannot be parsed as a URL are dropped rather than thrown.
 *
 * @param {string|null|undefined} value - The URL as written in the document.
 * @param {string} base - The absolute URL to resolve against.
 * @returns {string|undefined} The absolute URL, or `undefined` if unavailable.
 */
function toAbsoluteUrl(value, base) {
  if (!value) {
    return value ?? undefined;
  }
  if (/^https?:\/\//i.test(value)) {
    return value;
  }
  try {
    return new URL(value, base).href;
  } catch {
    // Malformed value in the page; a missing field beats a rejected promise.
    return undefined;
  }
}
/**
 * Request handler that unfurls the address given in the `url` search param.
 *
 * e.g. GET /foo/bar?url=https://example.com
 *
 * Responses:
 * - 200 with a JSON body holding the unfurled data on success
 * - 400 when the `url` param is missing/empty or rejected as a bad param
 * - 422 for any other unfurl failure
 *
 * @param {Request} request
 * @returns {Promise<Response>}
 */
export async function handleUnfurlRequest(request) {
  const { searchParams } = new URL(request.url);
  const target = searchParams.get("url");
  if (!target) {
    return new Response("Missing URL query parameter.", { status: 400 });
  }

  const outcome = await unfurl(target);
  if (!outcome.ok) {
    return outcome.error === "bad-param"
      ? new Response("Bad URL query parameter.", { status: 400 })
      : new Response("Failed to fetch URL.", { status: 422 });
  }

  const body = JSON.stringify(outcome.value);
  return new Response(body, {
    headers: { "Content-Type": "application/json" },
  });
}
/**
 * Accumulates the text chunks passed to its `text` handler.
 */
class TextExtractor {
  /**
   * Everything received so far, concatenated in arrival order.
   * @type {string}
   */
  string = "";

  /**
   * Appends one incoming text chunk to the accumulated string.
   * @param {Object} chunk - The text chunk.
   * @param {string} chunk.text - The chunk's text.
   */
  text(chunk) {
    const { text: piece } = chunk;
    this.string = this.string + piece;
  }
}
/**
 * Extracts Open Graph, Twitter and description metadata from `<meta>` elements.
 *
 * Keys are stored lower-cased. For any given key the first element that
 * carries a `content` value wins; later duplicates are ignored, which follows
 * the Open Graph rule that the first tag takes preference on conflict.
 */
class MetaExtractor {
  /**
   * The Open Graph (og) metadata extracted from elements, keyed by the full
   * lower-cased property name (e.g. `og:title`).
   * @type {Object.<string, string|null>}
   */
  og = {};

  /**
   * The Twitter metadata extracted from elements, keyed by the full
   * lower-cased name (e.g. `twitter:title`).
   * @type {Object.<string, string|null>}
   */
  twitter = {};

  /**
   * The content of the first `<meta name="description">` that has one.
   * @type {string|null}
   */
  description = null;

  /**
   * Handles an incoming element.
   * @param {Element} element - The incoming element.
   */
  element(element) {
    // Metadata names are matched case-insensitively; pages in the wild use
    // both `Description` and `description`, `og:Title` and `og:title`.
    const property = (element.getAttribute("property") ?? "")
      .trim()
      .toLowerCase();
    const name = (element.getAttribute("name") ?? "").trim().toLowerCase();
    const content = element.getAttribute("content");

    // Open Graph canonically uses `property` and Twitter uses `name`, but
    // sites frequently swap them, so accept either attribute for both.
    const ogKey = [property, name].find((key) => key.startsWith("og:"));
    const twitterKey = [name, property].find((key) =>
      key.startsWith("twitter:")
    );

    // `??=` keeps the first non-null value and lets a later tag fill in for
    // an earlier one that had no `content` attribute.
    if (ogKey) {
      this.og[ogKey] ??= content;
    } else if (twitterKey) {
      this.twitter[twitterKey] ??= content;
    } else if (name === "description") {
      this.description ??= content;
    }
  }
}
/**
 * Extracts favicon URLs from `<link>` elements.
 *
 * `rel` is treated as a case-insensitive, whitespace-separated token list, so
 * `rel="shortcut icon"` and `rel="Icon"` are recognised as well as
 * `rel="icon"`. Links without an `href` are ignored. When several matching
 * links are present, the last one seen wins.
 */
class IconExtractor {
  /**
   * The Apple touch icon URL extracted from elements
   * (`apple-touch-icon` or `apple-touch-icon-precomposed`).
   * @type {string|null}
   */
  appleIcon = null;

  /**
   * The favicon URL extracted from elements.
   * @type {string|null}
   */
  icon = null;

  /**
   * Handles an incoming element.
   * @param {Element} element - The incoming element.
   */
  element(element) {
    const href = element.getAttribute("href");
    const rel = element.getAttribute("rel");
    // Nothing useful to record, and we must not wipe an icon found earlier.
    if (!href || !rel) {
      return;
    }

    const tokens = rel.toLowerCase().split(/\s+/).filter(Boolean);
    if (tokens.includes("icon")) {
      this.icon = href;
    } else if (
      tokens.includes("apple-touch-icon") ||
      tokens.includes("apple-touch-icon-precomposed")
    ) {
      this.appleIcon = href;
    }
  }
}