UNPKG

media-scraper

Version:

TypeScript-first multi-platform social media scraper without API keys

64 lines (63 loc) 3.27 kB
import { $fetch } from "ofetch";
import { load } from "cheerio";
import { facebookHeaders } from "../utils/helpers.mjs";
import { facebookRegex } from "../utils/regex.mjs";

// A JSON <script> payload must contain all of these markers to be used as the video payload.
const VIDEO_MARKERS = ["RelayPrefetchedStreamCache", "videoDeliveryLegacyFields"];
// A payload containing any of these markers is never used as the video payload.
const VIDEO_EXCLUDED_MARKERS = ["CometUFI"];
// A payload containing every marker of at least one set is searched for the author.
const OWNER_MARKER_SETS = [
  ["video_owner", "displayPicture"],
  ["owner_as_page", "profile_pic_uri"],
];

/** Returns true when `text` contains every string in `terms`. */
const containsAll = (text, terms) => terms.every((term) => text.includes(term));

/**
 * Parses `text` as JSON, returning `undefined` instead of throwing.
 * One malformed embedded script should not abort the whole scrape; if no
 * usable payload is found the caller raises its own extraction error.
 */
function parseJson(text) {
  try {
    return JSON.parse(text);
  } catch {
    return undefined;
  }
}

/**
 * Walks a parsed script payload down to the `result.data` object of its
 * "RelayPrefetchedStreamCache" entry. Returns `undefined` when any step of
 * the path is missing or has an unexpected shape.
 */
function extractRelayData(payload) {
  const entries = payload?.require?.[0]?.[3]?.[0]?.__bbox?.require;
  if (!Array.isArray(entries)) {
    return undefined;
  }
  const relayEntry = entries.find(
    (item) => Array.isArray(item) && item.includes("RelayPrefetchedStreamCache"),
  );
  return relayEntry?.[3]?.[1]?.__bbox?.result?.data;
}

/** Picks the author object from relay data, trying the short-form video owner first, then the page owner. */
function extractOwner(relayData) {
  return (
    relayData?.video?.creation_story?.short_form_video_context?.video_owner ||
    relayData?.attachments?.[0]?.media?.owner?.owner_as_page
  );
}

/**
 * Scrapes a Facebook video page and returns its metadata.
 *
 * The page is fetched with `facebookHeaders`, its `application/json` script
 * tags are scanned for relay payloads, and the video / author fields are read
 * from them. Fields that are absent from the payload come back as `undefined`.
 *
 * @param {string} url - Facebook URL; must match `facebookRegex`.
 * @returns {Promise<object>} Object with `id`, `caption`, `permalink_url`,
 *   `thumbnail_url`, `author` ({ id, name, username, avatar_url, url }),
 *   `width`, `height`, `created_at` and `video` ({ duration, sd_url, hd_url }).
 * @throws {Error} "Invalid Facebook URL" when `url` is not a matching string.
 * @throws {Error} "Failed to fetch the Facebook URL" when the request fails
 *   (original error attached as `cause`) or yields an empty body.
 * @throws {Error} "Failed to extract video data from the Facebook URL" when no
 *   script tag yields video data.
 */
export default async (url) => {
  // `url.match` (not `regex.test`) so a possibly global/sticky regex keeps no state between calls.
  if (typeof url !== "string" || !url.match(facebookRegex)) {
    throw new Error("Invalid Facebook URL");
  }

  let post;
  try {
    post = await $fetch(url, { headers: facebookHeaders });
  } catch (cause) {
    // Same message as before, but the underlying failure stays inspectable.
    throw new Error("Failed to fetch the Facebook URL", { cause });
  }
  if (!post) {
    throw new Error("Failed to fetch the Facebook URL");
  }

  const $ = load(post);
  const scripts = $("script[type='application/json']");
  const metaDescription = $("meta[name='description']").attr("content");

  let data;
  let ownerData;
  for (const script of scripts) {
    const content = $(script).html();
    if (!content) {
      continue;
    }

    const isVideoPayload =
      containsAll(content, VIDEO_MARKERS) &&
      !VIDEO_EXCLUDED_MARKERS.some((term) => content.includes(term));
    const isOwnerPayload = OWNER_MARKER_SETS.some((terms) => containsAll(content, terms));
    if (!isVideoPayload && !isOwnerPayload) {
      continue;
    }

    // Parse once even when the script qualifies for both roles.
    const relayData = extractRelayData(parseJson(content));
    if (!relayData) {
      // Never let an unusable script clobber a value found in an earlier one.
      continue;
    }

    if (isVideoPayload) {
      data = relayData;
    }
    if (isOwnerPayload) {
      const owner = extractOwner(relayData);
      if (owner) {
        ownerData = owner;
      }
    }
  }

  if (!data) {
    throw new Error("Failed to extract video data from the Facebook URL");
  }

  const video = data.video;
  const caption = video?.creation_story?.message?.text || metaDescription;

  // Prefer the story attachment that is the video itself, else the first creation-story attachment.
  const attachment =
    video?.story?.attachments?.find((item) => item?.media?.id === video?.id) ||
    video?.creation_story?.attachments?.[0];

  // Use the attachment media only when it carries dimensions; otherwise use the short-form playback video.
  const media =
    attachment?.media?.width && attachment?.media?.height
      ? attachment.media
      : video?.creation_story?.short_form_video_context?.playback_video;

  const { width, height } = media || {};

  // Duration in milliseconds: taken directly, or converted from seconds.
  const duration =
    media?.playable_duration_in_ms ||
    (media?.length_in_second ? media.length_in_second * 1000 : undefined);
  const thumbnail_url = media?.thumbnailImage?.uri || media?.preferred_thumbnail?.image?.uri;
  const playback_video = media?.videoDeliveryLegacyFields;

  return {
    id: video?.id,
    caption: caption?.trim(),
    permalink_url: media?.permalink_url || media?.url,
    thumbnail_url,
    author: {
      id: ownerData?.id,
      name: ownerData?.name,
      // The payload's display name is reused as the username.
      username: ownerData?.name,
      avatar_url: ownerData?.displayPicture?.uri || ownerData?.profile_pic_uri,
      url: ownerData?.id ? `https://www.facebook.com/profile.php?id=${ownerData.id}` : undefined,
    },
    width,
    height,
    created_at: media?.publish_time || video?.creation_story?.creation_time,
    video: {
      duration,
      sd_url: playback_video?.browser_native_sd_url,
      hd_url: playback_video?.browser_native_hd_url,
    },
  };
};