media-scraper
Version:
TypeScript-first multi-platform social media scraper without API keys
64 lines (63 loc) • 3.27 kB
JavaScript
import { $fetch } from "ofetch";
import { load } from "cheerio";
import { facebookHeaders } from "../utils/helpers.mjs";
import { facebookRegex } from "../utils/regex.mjs";
/**
 * Parses the text of a `<script type="application/json">` tag.
 *
 * @param {string} content - Raw text content of the script tag.
 * @returns {*} The parsed value, or `undefined` when the text is not valid
 *   JSON, so that one malformed tag cannot abort the whole scan.
 */
const parseScriptJson = (content) => {
  try {
    return JSON.parse(content);
  } catch {
    return undefined;
  }
};

/**
 * Walks a parsed script payload down to the `data` object attached to its
 * "RelayPrefetchedStreamCache" entry.
 *
 * @param {*} json - Parsed script payload (may be nullish).
 * @returns {object|undefined} The nested `data` object, or `undefined` when
 *   any step of the path is missing.
 */
const extractRelayData = (json) =>
  json?.require?.[0]?.[3]?.[0]?.__bbox?.require
    // `?.includes?.()` tolerates entries that have no `includes` method.
    ?.find((item) => item?.includes?.("RelayPrefetchedStreamCache"))
    ?.[3]?.[1]?.__bbox?.result?.data;

/**
 * Fetches a Facebook video page and extracts video metadata from the JSON
 * script tags embedded in its HTML.
 *
 * @param {string} url - URL to scrape; must match `facebookRegex`.
 * @returns {Promise<object>} Object with `id`, `caption`, `permalink_url`,
 *   `thumbnail_url`, `author` (`id`, `name`, `username`, `avatar_url`, `url`),
 *   `width`, `height`, `created_at` and `video` (`duration`, `sd_url`,
 *   `hd_url`). Fields missing from the page payload are `undefined`.
 * @throws {Error} "Invalid Facebook URL" when `url` does not match
 *   `facebookRegex`.
 * @throws {Error} "Failed to fetch the Facebook URL" when the request fails
 *   (the underlying error is attached as `cause`) or yields an empty body.
 * @throws {Error} "Failed to extract video data from the Facebook URL" when no
 *   script tag yields a video payload.
 */
export default async (url) => {
  const match = url.match(facebookRegex);
  if (!match) throw new Error("Invalid Facebook URL");

  let post;
  try {
    post = await $fetch(url, { headers: facebookHeaders });
  } catch (err) {
    // Same message as before, but keep the root cause for debugging.
    throw new Error("Failed to fetch the Facebook URL", { cause: err });
  }
  if (!post) throw new Error("Failed to fetch the Facebook URL");

  const $ = load(post);
  const scripts = $("script[type='application/json']");
  const metaDescription = $("meta[name='description']")?.attr("content");

  // Keyword filters used to pick candidate script tags before parsing them.
  const mustInclude = ["RelayPrefetchedStreamCache", "videoDeliveryLegacyFields"];
  const mustNotInclude = ["CometUFI"];
  const ownerMustInclude = ["video_owner", "displayPicture"];
  const ownerMustInclude2 = ["owner_as_page", "profile_pic_uri"];
  const includesAll = (content, terms) => terms.every((term) => content.includes(term));

  let data;
  let ownerData;
  for (const script of scripts) {
    const content = $(script).html();
    if (!content) continue;

    const isVideoPayload =
      includesAll(content, mustInclude) &&
      !mustNotInclude.some((term) => content.includes(term));
    const isOwnerPayload =
      includesAll(content, ownerMustInclude) || includesAll(content, ownerMustInclude2);
    if (!isVideoPayload && !isOwnerPayload) continue;

    // Parse once even when the tag matches both filters.
    const relayData = extractRelayData(parseScriptJson(content));
    // A tag that matches the keywords but lacks the expected path must not
    // overwrite a value extracted from an earlier tag.
    if (!relayData) continue;

    if (isVideoPayload) data = relayData;
    if (isOwnerPayload) {
      const owner =
        relayData?.video?.creation_story?.short_form_video_context?.video_owner ||
        relayData?.attachments?.[0]?.media?.owner?.owner_as_page;
      if (owner) ownerData = owner;
    }
  }

  if (!data) {
    throw new Error("Failed to extract video data from the Facebook URL");
  }

  const video = data?.video;
  const caption = video?.creation_story?.message?.text || metaDescription;

  // Prefer the story attachment whose media id equals the video id; otherwise
  // fall back to the first attachment of the creation story.
  const attachment =
    video?.story?.attachments?.find((item) => item?.media?.id === video?.id) ||
    video?.creation_story?.attachments?.[0];

  // Use the attachment media only when it carries dimensions; otherwise use
  // the short-form playback video.
  const media =
    attachment?.media?.width && attachment?.media?.height
      ? attachment?.media
      : video?.creation_story?.short_form_video_context?.playback_video;

  const { width, height } = media || {};
  // `length_in_second` is scaled by 1000 to line up with the millisecond field.
  const duration =
    media?.playable_duration_in_ms ||
    (media?.length_in_second ? media.length_in_second * 1e3 : undefined);
  const thumbnail_url = media?.thumbnailImage?.uri || media?.preferred_thumbnail?.image?.uri;
  const playback_video = media?.videoDeliveryLegacyFields;

  return {
    id: video?.id,
    caption: caption?.trim(),
    permalink_url: media?.permalink_url || media?.url,
    thumbnail_url,
    author: {
      id: ownerData?.id,
      name: ownerData?.name,
      // NOTE(review): username mirrors the display name; no separate handle is
      // read from the payload here — confirm this is intended.
      username: ownerData?.name,
      avatar_url: ownerData?.displayPicture?.uri || ownerData?.profile_pic_uri,
      url: ownerData?.id ? `https://www.facebook.com/profile.php?id=${ownerData?.id}` : undefined
    },
    width,
    height,
    created_at: media?.publish_time || video?.creation_story?.creation_time,
    video: {
      duration,
      sd_url: playback_video?.browser_native_sd_url,
      hd_url: playback_video?.browser_native_hd_url
    }
  };
};