UNPKG

metafetch

Version:

Metafetch fetches a given URL's title, description, images, links etc.

1 lines 6.06 kB
import { parseHTML } from "linkedom";

/**
 * Resolves `href` against `base`, returning the absolute URL string or `null`
 * when the pair cannot be parsed (the `URL` constructor throws in that case).
 *
 * @param {string} href - Possibly relative URL taken from the document.
 * @param {string} base - URL to resolve against.
 * @returns {string|null}
 */
function toAbsoluteUrl(href, base) {
  try {
    return new URL(href, base).href;
  } catch {
    return null;
  }
}

/**
 * Returns the absolute base URL of `document`: its `<base href>` resolved
 * against `fallback`, or `fallback` itself when there is no usable `<base>`.
 *
 * `<base href>` may be relative (e.g. "/sub/"); it has to be made absolute
 * first because `new URL(x, base)` throws when `base` is not absolute.
 *
 * @param {Document} document - Parsed document.
 * @param {string} fallback - URL of the page itself.
 * @returns {string}
 */
function resolveBaseUrl(document, fallback) {
  const baseElement = document.querySelector("base");
  const baseHref = baseElement ? baseElement.getAttribute("href") : null;
  if (!baseHref) return fallback;
  return toAbsoluteUrl(baseHref, fallback) ?? fallback;
}

/**
 * Fetches a page (with the global `fetch`, or with Puppeteer when
 * `options.render` is set) and extracts title, meta tags, JSON-LD, canonical
 * URL, images, links, favicon, feeds, headers and language from it.
 */
export class Metafetch {
  /** User-Agent used when a call does not pass `options.userAgent`. */
  #userAgent;

  /**
   * @param {string} [userAgent] - Default User-Agent for requests made by this instance.
   */
  constructor(
    userAgent = "Mozilla/5.0 (X11; Linux i686; rv:141.0) Gecko/20100101 Firefox/141.0",
  ) {
    this.#userAgent = userAgent;
  }

  /**
   * Replaces the instance's default User-Agent.
   *
   * @param {string} userAgent - Non-empty string.
   * @throws {Error} When `userAgent` is not a string or is blank.
   */
  setUserAgent(userAgent) {
    if (typeof userAgent !== "string" || userAgent.trim() === "") {
      throw new Error("Invalid User Agent: Must be a non-empty string.");
    }
    this.#userAgent = userAgent;
  }

  /** @returns {string} The instance's default User-Agent. */
  get userAgent() {
    return this.#userAgent;
  }

  /**
   * Lazily imports the optional "puppeteer" package.
   *
   * @returns {Promise<object>} The puppeteer module namespace.
   */
  async _getPuppeteer() {
    return import("puppeteer");
  }

  /**
   * Fetches `url` and returns the extracted metadata.
   *
   * Failed attempts are retried up to `options.retries` times, waiting
   * `retryDelay * 2^attempt` ms plus up to 250 ms of random jitter between
   * attempts. The error of the last attempt is rethrown.
   *
   * @param {string} url - Page URL; any `#fragment` is dropped before the request.
   * @param {object} [options]
   * @param {number} [options.retries=0] - Extra attempts after the first failure.
   * @param {number} [options.retryDelay=1000] - Base back-off delay in milliseconds.
   * @param {boolean} [options.render] - Load the page with Puppeteer instead of `fetch`.
   * @param {string} [options.userAgent] - Overrides the instance User-Agent for this call.
   * @param {object} [options.fetch] - Extra init options merged into the `fetch` call.
   * @param {object} [options.flags] - Per-field switches (`title`, `meta`, `images`, ...); all default to true.
   * @returns {Promise<object>} Result object; always contains `originalURL`.
   * @throws {Error} When `url` is not a non-empty string, or when every attempt fails.
   */
  async fetch(url, options = {}) {
    if (typeof url !== "string" || !url) {
      throw new Error("Invalid URL: URL must be a non-empty string.");
    }
    // A default parameter only covers `undefined`; tolerate an explicit `null` too.
    const settings = options ?? {};
    const retries = settings.retries ?? 0;
    const retryDelay = settings.retryDelay ?? 1e3;

    for (let attempt = 0; attempt <= retries; attempt++) {
      try {
        return await this.#fetchOnce(url, settings);
      } catch (error) {
        // `attempt + 1 > retries` (rather than `attempt === retries`) also
        // surfaces the real error when `retries` is not an integer.
        if (attempt + 1 > retries) throw error;
        const delay = retryDelay * 2 ** attempt + 250 * Math.random();
        await new Promise((resolve) => setTimeout(resolve, delay));
      }
    }
    // Only reachable when the loop never ran (e.g. negative `retries`).
    throw new Error("Metafetch failed after all retry attempts.");
  }

  /**
   * Performs a single load-and-extract attempt.
   *
   * @param {string} url - URL as passed to `fetch`.
   * @param {object} options - Options as passed to `fetch` (never nullish).
   * @returns {Promise<object>} Result object.
   */
  async #fetchOnce(url, options) {
    const cleanUrl = url.split("#")[0];
    const flags = {
      title: true,
      description: true,
      type: true,
      url: true,
      siteName: true,
      charset: true,
      image: true,
      meta: true,
      images: true,
      links: true,
      headers: true,
      language: true,
      favicon: true,
      feeds: true,
      ...(options.flags || {}),
    };

    const { html, finalUrl, charset, headers } = options.render
      ? await this.#loadRendered(cleanUrl, options)
      : await this.#loadStatic(cleanUrl, options);

    const { document } = parseHTML(html);
    const result = { originalURL: cleanUrl };
    if (flags.charset) result.charset = charset;

    // Order matters: structured data and URLs read `result.meta`, and
    // assets/favicon/feeds read `result.url`.
    this._extractMeta(document, result, flags);
    this._extractStructuredData(document, result, flags);
    this._extractUrls(document, { url: finalUrl }, result, flags);
    this._extractAssets(document, result, flags);
    this._extractFavicon(document, result, flags);
    this._extractFeeds(document, result, flags);

    if (flags.headers) result.headers = headers;
    if (flags.language) {
      const language =
        document.documentElement?.lang ||
        headers["content-language"]?.split(",")[0].trim();
      // Keep only the primary subtag ("en-US" -> "en").
      if (language) result.language = language.split("-")[0];
    }
    return result;
  }

  /**
   * Loads `url` in headless Chromium through Puppeteer and returns the
   * rendered HTML.
   *
   * @param {string} url - URL to navigate to.
   * @param {object} options - Options as passed to `fetch`.
   * @returns {Promise<{html: string, finalUrl: string, charset: string, headers: object}>}
   * @throws {Error} When puppeteer is not installed, navigation yields no
   *   response, the status is not OK, or the body is empty.
   */
  async #loadRendered(url, options) {
    let puppeteer;
    try {
      puppeteer = (await this._getPuppeteer()).default;
    } catch (cause) {
      throw new Error(
        'The "render" option requires the "puppeteer" package. Please install it (`npm install puppeteer`) and try again.',
        { cause },
      );
    }

    const browser = await puppeteer.launch({
      headless: true,
      args: ["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"],
    });
    try {
      // newPage() is inside the try so the browser is closed even if it throws.
      const page = await browser.newPage();
      await page.setUserAgent(options.userAgent || this.#userAgent);
      const response = await page.goto(url, { waitUntil: "networkidle0" });
      if (!response) {
        throw new Error("Puppeteer navigation failed to return a response.");
      }
      if (!response.ok()) {
        throw new Error(
          `Request failed with status: ${response.status()} ${response.statusText()}`,
        );
      }
      const body = await response.buffer();
      if (body.byteLength === 0) {
        throw new Error("Received an empty response body.");
      }
      // Copy into a standalone ArrayBuffer for _detectCharset.
      const bytes = new Uint8Array(body).buffer;
      const headers = response.headers();
      const finalUrl = page.url();
      const charset = this._detectCharset(headers["content-type"], bytes);
      const html = await page.content();
      return { html, finalUrl, charset, headers };
    } finally {
      await browser.close();
    }
  }

  /**
   * Loads `url` with the global `fetch` and decodes the body using the
   * detected charset.
   *
   * @param {string} url - URL to request.
   * @param {object} options - Options as passed to `fetch`.
   * @returns {Promise<{html: string, finalUrl: string, charset: string, headers: object}>}
   * @throws {Error} When the status is not OK or the body is empty.
   */
  async #loadStatic(url, options) {
    const init = {
      method: "GET",
      redirect: "follow",
      ...options.fetch,
      headers: {
        Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "User-Agent": options.userAgent || this.#userAgent,
        ...options.fetch?.headers,
      },
    };
    const response = await fetch(url, init);
    if (!response.ok) {
      throw new Error(`Request failed with status: ${response.status} ${response.statusText}`);
    }
    const body = await response.arrayBuffer();
    if (body.byteLength === 0) {
      throw new Error("Received an empty response body.");
    }

    const headers = {};
    response.headers.forEach((value, name) => {
      headers[name] = value;
    });

    let charset = this._detectCharset(headers["content-type"], body);
    let decoder;
    try {
      decoder = new TextDecoder(charset);
    } catch (error) {
      // TextDecoder throws RangeError for a label it does not know; a page
      // declaring a bogus charset should still be parsed, so fall back.
      if (!(error instanceof RangeError)) throw error;
      charset = "utf-8";
      decoder = new TextDecoder(charset);
    }
    return { html: decoder.decode(body), finalUrl: response.url, charset, headers };
  }

  /**
   * Determines the character encoding of a response body. Checked in order:
   * byte-order mark, `charset` parameter of the Content-Type header, XML
   * declaration, `<meta ... charset=...>` in the first 1024 bytes; otherwise
   * "utf-8".
   *
   * @param {string|undefined} contentType - Value of the Content-Type header.
   * @param {ArrayBuffer} buffer - Raw response body.
   * @returns {string} Lower-cased encoding label.
   */
  _detectCharset(contentType, buffer) {
    const bytes = new Uint8Array(buffer);
    if (bytes.length >= 3 && bytes[0] === 0xef && bytes[1] === 0xbb && bytes[2] === 0xbf) {
      return "utf-8";
    }
    if (bytes.length >= 2 && bytes[0] === 0xfe && bytes[1] === 0xff) return "utf-16be";
    if (bytes.length >= 2 && bytes[0] === 0xff && bytes[1] === 0xfe) return "utf-16le";

    if (contentType) {
      // Stop at quote, whitespace or ';' so trailing parameters
      // ("charset=utf-8; boundary=x") are not captured as part of the label.
      const fromHeader = contentType.match(/charset\s*=\s*["']?([^"'\s;]+)/i);
      if (fromHeader) return fromHeader[1].toLowerCase();
    }

    // latin1 maps every byte to one character, so sniffing cannot fail on
    // bytes that are invalid in the real encoding.
    const head = new TextDecoder("latin1").decode(bytes.subarray(0, 1024));

    const fromXml = head.match(/<\?xml[^>]+encoding=["']([^"']+)["']/i);
    if (fromXml && fromXml[1]) return fromXml[1].toLowerCase();

    // Stay inside the <meta> tag and stop at '>' / '/' / ';' / whitespace so an
    // unquoted `<meta charset=utf-8>` yields "utf-8", not the rest of the line.
    const fromMeta = head.match(/<meta[^>]+?charset\s*=\s*["']?([^"'\s;/>]+)/i);
    return fromMeta ? fromMeta[1].toLowerCase() : "utf-8";
  }

  /**
   * Copies `<title>` and `<meta>` information into `result`.
   *
   * @param {Document} document - Parsed document.
   * @param {object} result - Result object, mutated in place.
   * @param {object} flags - Field switches (`title`, `meta`, `description`, `type`, `siteName`, `image`).
   */
  _extractMeta(document, result, flags) {
    if (flags.title) {
      const titleElement = document.querySelector("title");
      result.title = titleElement?.textContent?.trim() || "";
    }

    const meta = {};
    document.querySelectorAll("meta").forEach((element) => {
      const key = element.getAttribute("property") || element.getAttribute("name");
      const content = element.getAttribute("content");
      if (key && content) meta[key.toLowerCase()] = content;
    });

    if (flags.meta) result.meta = meta;
    if (flags.description) result.description = meta["og:description"] || meta.description;
    if (flags.type) result.type = meta["og:type"];
    if (flags.siteName) result.siteName = meta["og:site_name"];
    if (flags.image) result.image = meta["og:image"] || meta["twitter:image"];
  }

  /**
   * Sets `result.url` (canonical link, then `og:url`, then the response URL)
   * and `result.ampURL` when an amphtml link is present.
   *
   * Candidates that cannot be parsed as URLs are skipped instead of aborting
   * the whole extraction.
   *
   * @param {Document} document - Parsed document.
   * @param {{url: string}} response - Carries the final (post-redirect) URL.
   * @param {object} result - Result object, mutated in place.
   * @param {object} flags - Field switches; nothing happens unless `flags.url`.
   */
  _extractUrls(document, response, result, flags) {
    if (!flags.url) return;

    const base = resolveBaseUrl(document, response.url);
    const canonicalLink = document.querySelector("link[rel=canonical]");
    const canonical = canonicalLink ? canonicalLink.href : null;
    const ogUrl = result.meta ? result.meta["og:url"] : null;

    let resolved = null;
    for (const candidate of [canonical, ogUrl]) {
      if (!candidate) continue;
      resolved = toAbsoluteUrl(candidate, base);
      if (resolved !== null) break;
    }
    result.url = resolved ?? new URL(response.url, base).href;

    const ampLink = document.querySelector("link[rel=amphtml]");
    if (ampLink && ampLink.href) {
      const ampUrl = toAbsoluteUrl(ampLink.href, result.url);
      if (ampUrl !== null) result.ampURL = ampUrl;
    }
  }

  /**
   * Collects de-duplicated absolute image (`img[src]`) and link (`a[href]`)
   * URLs into `result.images` / `result.links`.
   *
   * @param {Document} document - Parsed document.
   * @param {object} result - Result object, mutated in place; `url`/`originalURL` is the resolution base.
   * @param {object} flags - Field switches (`images`, `links`).
   */
  _extractAssets(document, result, flags) {
    const base = resolveBaseUrl(document, result.url || result.originalURL);

    // Shared collector: trims, drops unwanted values, resolves, de-duplicates.
    const collect = (selector, attribute, isSkipped) => {
      const found = new Set();
      document.querySelectorAll(selector).forEach((element) => {
        const raw = element.getAttribute(attribute);
        if (!raw) return;
        const value = raw.trim();
        if (value === "" || isSkipped(value)) return;
        const absolute = toAbsoluteUrl(value, base);
        if (absolute !== null) found.add(absolute);
      });
      return [...found];
    };

    const isScript = (value) => /^javascript:/i.test(value);
    if (flags.images) {
      result.images = collect("img", "src", isScript);
    }
    if (flags.links) {
      // Pure in-page anchors ("#section") are not links to other documents.
      result.links = collect("a", "href", (value) => value.startsWith("#") || isScript(value));
    }
  }

  /**
   * Flattens a parsed JSON-LD value into `target`, joining nested keys and
   * array indexes with ":" (e.g. `ld:author:0:name`). `null`/`undefined`
   * values are skipped; leaf values are stored as strings.
   *
   * @param {*} value - Current JSON value.
   * @param {string} prefix - Key accumulated so far.
   * @param {object} target - Object receiving the flattened entries.
   */
  _flattenJsonLd(value, prefix, target) {
    if (value == null) return;
    if (Array.isArray(value)) {
      value.forEach((item, index) => {
        this._flattenJsonLd(item, `${prefix}:${index}`, target);
      });
      return;
    }
    if (typeof value === "object") {
      for (const [key, child] of Object.entries(value)) {
        this._flattenJsonLd(child, `${prefix}:${key}`, target);
      }
      return;
    }
    target[prefix] = String(value);
  }

  /**
   * Merges every `application/ld+json` script into `result.meta` under
   * `ld:`-prefixed keys. Scripts that fail to parse are reported with
   * `console.warn` and skipped.
   *
   * @param {Document} document - Parsed document.
   * @param {object} result - Result object; `result.meta` is created if missing.
   * @param {object} flags - Field switches; nothing happens unless `flags.meta`.
   */
  _extractStructuredData(document, result, flags) {
    if (!flags.meta) return;
    // Guard for direct callers that did not run _extractMeta first.
    result.meta ??= {};
    document.querySelectorAll('script[type="application/ld+json"]').forEach((script) => {
      try {
        const data = JSON.parse(script.textContent);
        if (typeof data === "object" && data !== null) {
          this._flattenJsonLd(data, "ld", result.meta);
        }
      } catch (error) {
        console.warn("Error parsing JSON-LD:", error);
      }
    });
  }

  /**
   * Picks the best icon link and stores its absolute URL in `result.favicon`.
   * An apple-touch-icon beats a plain icon; within the same kind the larger
   * declared `sizes` width wins. Falls back to `/favicon.ico` on the page's
   * origin when no usable icon link exists.
   *
   * @param {Document} document - Parsed document.
   * @param {object} result - Result object, mutated in place.
   * @param {object} flags - Field switches; nothing happens unless `flags.favicon`.
   */
  _extractFavicon(document, result, flags) {
    if (!flags.favicon) return;

    const pageUrl = result.url || result.originalURL;
    let best = null;

    document.querySelectorAll("link[rel*='icon']").forEach((link) => {
      const href = link.getAttribute("href")?.trim();
      if (!href || href.startsWith("data:")) return;
      const absolute = toAbsoluteUrl(href, pageUrl);
      if (absolute === null) return;

      // Classify by `rel` and remember it on the candidate, so later
      // comparisons do not depend on what the icon's file name looks like.
      const rel = (link.getAttribute("rel") || "").toLowerCase();
      const isAppleTouch = rel.includes("apple-touch-icon");
      const sizeMatch = link.getAttribute("sizes")?.match(/(\d+)x\d+/i);
      const size = sizeMatch ? Number.parseInt(sizeMatch[1], 10) : 0;

      const isBetter =
        best === null ||
        (isAppleTouch && !best.isAppleTouch) ||
        (isAppleTouch === best.isAppleTouch && size > best.size);
      if (isBetter) best = { href: absolute, isAppleTouch, size };
    });

    result.favicon = best ? best.href : new URL("/favicon.ico", pageUrl).href;
  }

  /**
   * Collects absolute URLs of RSS/Atom `<link>` elements into `result.feeds`.
   * The property is only set when at least one feed is found; hrefs that
   * cannot be parsed are skipped.
   *
   * @param {Document} document - Parsed document.
   * @param {object} result - Result object, mutated in place.
   * @param {object} flags - Field switches; nothing happens unless `flags.feeds`.
   */
  _extractFeeds(document, result, flags) {
    if (!flags.feeds) return;

    const pageUrl = result.url || result.originalURL;
    const feeds = new Set();
    document.querySelectorAll("link[type*='rss'], link[type*='atom']").forEach((link) => {
      const href = link.getAttribute("href");
      if (!href || href.trim() === "") return;
      const absolute = toAbsoluteUrl(href, pageUrl);
      if (absolute !== null) feeds.add(absolute);
    });
    if (feeds.size > 0) result.feeds = [...feeds];
  }
}

/** Shared ready-to-use instance with the default User-Agent. */
export const metafetch = new Metafetch();
export default metafetch;