media-scraper
Version:
TypeScript-first multi-platform social media scraper without API keys
68 lines (67 loc) • 2.52 kB
JavaScript
import { $fetch } from "ofetch";
import { load } from "cheerio";
import { threadsHeaders } from "../utils/helpers.mjs";
/**
 * Scrape a single Threads post from its public `/media` page.
 *
 * The username and post id are taken from the URL, the page
 * `https://www.threads.com/@<username>/post/<post_id>/media` is fetched with
 * `threadsHeaders`, and the post object is extracted from the JSON `<script>`
 * payloads embedded in the returned HTML.
 *
 * @param {string} url - Threads post URL containing `/@<username>/post/<post_id>`.
 * @returns {Promise<object>} Normalized post with `id`, `pk`, `code`, `caption`,
 *   `permalink_url`, `author`, `likes_count`, `created_at`, `image_versions`,
 *   `video_versions` and `carousel_media`. Fields missing from the payload are
 *   `undefined`.
 * @throws {Error} "Invalid Threads URL" when the URL does not match the expected shape.
 * @throws {Error} "Failed to fetch the Threads URL" when the request fails or returns
 *   nothing (the underlying error, if any, is attached as `cause`).
 * @throws {Error} When no post data can be located in the fetched page.
 */
export default async (url) => {
  // Guard against non-string input so callers get the documented error instead of a TypeError.
  const match = typeof url === "string" ? url.match(/\/@([^/]+)\/post\/([^/?]+)/) : null;
  const username = match?.[1];
  const post_id = match?.[2];
  if (!username || !post_id) throw new Error("Invalid Threads URL");

  const mediaURL = `https://www.threads.com/@${username}/post/${post_id}/media`;
  let post;
  try {
    post = await $fetch(mediaURL, { headers: threadsHeaders });
  } catch (err) {
    // Keep the original failure reachable for debugging instead of discarding it.
    throw new Error("Failed to fetch the Threads URL", { cause: err });
  }
  if (!post) throw new Error("Failed to fetch the Threads URL");

  const $ = load(post);
  const mustInclude = ["RelayPrefetchedStreamCache", '"video_versions"'];
  const mustNotInclude = ["relatedPosts"];

  // Two-pass lookup over the qualifying payloads: the first one supplies the
  // `pk` and `id`; the next one is searched for the edge whose node id equals
  // that `pk` and the thread item whose post id equals that `id`.
  let pk = "";
  let tId = "";
  let data;
  for (const script of $("script[type='application/json']")) {
    const content = $(script).html();
    if (!content) continue;
    const isCandidate =
      mustInclude.every((term) => content.includes(term)) &&
      !mustNotInclude.some((term) => content.includes(term));
    if (!isCandidate) continue;

    let parsed;
    try {
      parsed = JSON.parse(content)?.require?.[0]?.[3]?.[0]?.__bbox?.require?.[0]?.[3]?.[1]?.__bbox?.result?.data?.data;
    } catch {
      // A payload that is not valid JSON cannot be the one we need; try the next script.
      continue;
    }

    if (pk && tId) {
      data = parsed?.edges
        ?.find((edge) => edge?.node?.id === pk)
        ?.node?.thread_items?.find((item) => item?.post?.id === tId)?.post;
      break;
    }
    pk = parsed?.pk;
    tId = parsed?.id;
  }

  // Without this guard the mapping below would fail with an opaque TypeError.
  if (!data) throw new Error("Failed to extract post data from the Threads page");

  // Shared by the top-level post and each carousel item.
  const toImageVersions = (media) =>
    media?.image_versions2?.candidates?.map((img) => ({
      width: img?.width,
      height: img?.height,
      url: img?.url
    }));

  const user = data.user;
  // Fall back to the username from the requested URL so the permalink never contains "@undefined".
  const permalinkUser = user?.username ?? username;

  return {
    id: data.id,
    pk: data.pk,
    code: data.code,
    caption: data.caption?.text?.trim(),
    permalink_url: `https://www.threads.com/@${permalinkUser}/post/${data.code}/`,
    author: {
      id: user?.id,
      name: user?.full_name,
      username: user?.username,
      avatar_url: user?.profile_pic_url,
      url: user?.username ? `https://www.threads.com/@${user.username}/` : undefined
    },
    likes_count: data.like_count,
    created_at: data.taken_at,
    image_versions: toImageVersions(data),
    video_versions: data.video_versions,
    carousel_media: data.carousel_media?.map((item) => ({
      id: item?.id,
      pk: item?.pk,
      image_versions: toImageVersions(item),
      video_versions: item?.video_versions?.map((vid) => ({
        type: vid?.type,
        url: vid?.url
      }))
    }))
  };
};