UNPKG

astro-loader-tweets

Version:
678 lines (673 loc) 22 kB
import { TwitterApi, ApiResponseError } from 'twitter-api-v2';
import { z } from 'astro/zod';
import fs from 'node:fs/promises';
import path from 'node:path';

// src/index.ts

// package.json
/** Package metadata used for the loader name and for documentation links in log messages. */
const package_default = {
  name: "astro-loader-tweets",
  homepage: "https://github.com/lin-stephanie/astro-loaders/blob/main/packages/astro-loader-tweets/"
};

/** Default values applied by `TweetsLoaderConfigSchema` when an option is omitted. */
const defaultConfig = {
  storage: "default",
  storePath: "src/data/tweets.json",
  removeTrailingUrls: true,
  linkTextType: "post-text",
  newlineHandling: "none"
};

/** Validation schema for the user-supplied loader configuration. */
const TweetsLoaderConfigSchema = z.object({
  /**
   * An array of Tweet IDs to fetch content for.
   */
  ids: z.array(z.string()),
  /**
   * The method to store the loaded tweets:
   * - `'default'`: Uses Astro's default KV store (`./store/data-store.json`).
   * - `'custom'`: Use a custom JSON file path.
   * - `'both'`: Both default and custom path.
   *
   * @default 'default'
   */
  storage: z.enum(["default", "custom", "both"]).default(defaultConfig.storage),
  /**
   * The custom output path for storing tweets, either absolute or
   * relative to the Astro project root. Must end with `.json`.
   * Required if `storage` is `'custom'` or `'both'`.
   *
   * @example
   * 'src/data/tweets.json'
   * './src/content/tweets.json'
   * '/absolute/path/to/tweets.json'
   */
  storePath: z.string().default(defaultConfig.storePath),
  /**
   * Whether to remove trailing URLs from the tweet text in the generated `text_html` and `text_markdown`,
   * typically used for views or referenced tweets.
   *
   * @default true
   */
  removeTrailingUrls: z.boolean().default(defaultConfig.removeTrailingUrls),
  /**
   * The type of text to display for links when generating `text_html` and `text_markdown`:
   * - `'domain-path'`: Shows the link's domain and path.
   * - `'post-text'`: Uses the link text as shown in the tweet.
   *
   * @default 'post-text'
   */
  linkTextType: z.enum(["domain-path", "post-text"]).default(defaultConfig.linkTextType),
  /**
   * The way for processing `\n` in `text_html` generation:
   * - `'none'`: Keep as is.
   * - `'break'`: Replace consecutive `\n` with `<br>`.
   * - `'paragraph'`: Wrap paragraphs with `<p>` while removing standalone `\n`.
   *
   * @default 'none'
   */
  newlineHandling: z.enum(["none", "break", "paragraph"]).default(defaultConfig.newlineHandling),
  /**
   * The X app-only Bearer Token for authentication.
   *
   * This is optional; by default, it reads from the `X_TOKEN` environment variable.
   * You may also configure it directly here (not recommended; if you do, ensure it is not exposed
   * in public code repositories).
   *
   * @see
   * - {@link https://developer.x.com/en/docs/authentication/oauth-2-0/bearer-tokens How to create an X app-only Bearer Token}
   * - {@link https://docs.astro.build/en/guides/environment-variables/#setting-environment-variables How to store token in Astro project environment variables}
   */
  authToken: z.string().optional()
}).superRefine((config, ctx) => {
  // The `.json` suffix is only enforced when the custom file is actually going to be written.
  if ((config.storage === "custom" || config.storage === "both") && config.storePath && !config.storePath.endsWith(".json")) {
    ctx.addIssue({
      code: z.ZodIssueCode.custom,
      message: "`storePath` must end with `.json`"
    });
  }
});

/** Expansions and field lists passed as the options argument of `client.v2.tweets(...)`. */
const getTweetsApiOptions = {
  expansions: [
    "author_id",
    "geo.place_id",
    "attachments.media_keys",
    "attachments.poll_ids"
  ],
  "tweet.fields": [
    "id",
    "text",
    "edit_history_tweet_ids",
    "attachments",
    "author_id",
    "conversation_id",
    "created_at",
    "entities",
    "in_reply_to_user_id",
    "lang",
    "public_metrics",
    "referenced_tweets"
  ],
  "user.fields": [
    "id",
    "name",
    "username",
    "connection_status",
    "created_at",
    "description",
    "entities",
    "profile_image_url",
    "public_metrics",
    "url"
  ],
  "place.fields": [
    "id",
    "full_name",
    "contained_within",
    "country",
    "country_code",
    "geo",
    "name",
    "place_type"
  ],
  "media.fields": [
    "media_key",
    "type",
    "url",
    "preview_image_url",
    "height",
    "width",
    "alt_text",
    "duration_ms",
    "public_metrics",
    "variants"
  ],
  "poll.fields": [
    "id",
    "options",
    "duration_minutes",
    "end_datetime",
    "voting_status"
  ]
};

/** Shape of a tweet object as validated by this loader. */
const TweetV2Schema = z.object({
  id: z.string(),
  text: z.string(),
  edit_history_tweet_ids: z.array(z.string()),
  attachments: z.object({
    media_keys: z.array(z.string()).optional(),
    poll_ids: z.array(z.string()).optional()
  }).optional(),
  author_id: z.string().optional(),
  conversation_id: z.string().optional(),
  created_at: z.string().optional(),
  entities: z.object({
    annotations: z.array(
      z.object({
        start: z.number(),
        end: z.number(),
        probability: z.number(),
        type: z.string(),
        normalized_text: z.string()
      })
    ).optional(),
    urls: z.array(
      z.object({
        start: z.number(),
        end: z.number(),
        url: z.string(),
        expanded_url: z.string(),
        display_url: z.string(),
        unwound_url: z.string().optional(),
        title: z.string().optional(),
        description: z.string().optional(),
        status: z.union([z.string(), z.number()]).optional(),
        media_key: z.string().optional(),
        images: z.array(
          z.object({
            url: z.string(),
            width: z.number(),
            height: z.number()
          })
        ).optional()
      })
    ).optional(),
    hashtags: z.array(
      z.object({
        start: z.number(),
        end: z.number(),
        tag: z.string()
      })
    ).optional(),
    cashtags: z.array(
      z.object({
        start: z.number(),
        end: z.number(),
        tag: z.string()
      })
    ).optional(),
    mentions: z.array(
      z.object({
        start: z.number(),
        end: z.number(),
        username: z.string(),
        id: z.string()
      })
    ).optional()
  }).optional(),
  geo: z.object({
    coordinates: z.object({
      type: z.string(),
      coordinates: z.tuple([z.number(), z.number()]).nullable()
    }),
    place_id: z.string()
  }).optional(),
  in_reply_to_user_id: z.string().optional(),
  lang: z.string().optional(),
  public_metrics: z.object({
    retweet_count: z.number(),
    reply_count: z.number(),
    like_count: z.number(),
    quote_count: z.number(),
    impression_count: z.number(),
    bookmark_count: z.number().optional()
  }).optional(),
  referenced_tweets: z.array(
    z.object({
      type: z.union([
        z.literal("retweeted"),
        z.literal("quoted"),
        z.literal("replied_to")
      ]),
      id: z.string()
    }).optional()
  ).optional(),
  source: z.string().optional()
});

/** Tweet schema plus the derived fields that `processTweetText` adds. */
const TweetV2ExtendedSchema = TweetV2Schema.extend({
  text_html: z.string(),
  text_markdown: z.string(),
  view_type: z.enum(["none", "media", "link"]),
  url_for_link_view: z.string().optional()
});

/** Common `start`/`end` offsets shared by the entity schemas below. */
const EntitySchema = z.object({
  start: z.number(),
  end: z.number()
});
const UrlEntitySchema = EntitySchema.extend({
  url: z.string(),
  expanded_url: z.string(),
  display_url: z.string()
});
const HashtagEntitySchema = EntitySchema.extend({
  tag: z.string().optional(),
  hashtag: z.string().optional()
});
const CashtagEntitySchema = EntitySchema.extend({
  tag: z.string().optional(),
  cashtag: z.string().optional()
});
const MentionEntitySchema = EntitySchema.extend({
  username: z.string().optional()
});

/** Shape of a user object taken from the response `includes.users`. */
const UserV2Schema = z.object({
  id: z.string(),
  name: z.string(),
  username: z.string(),
  connection_status: z.array(z.string()).optional(),
  created_at: z.string().optional(),
  description: z.string().optional(),
  entities: z.object({
    url: z.object({
      urls: z.array(UrlEntitySchema)
    }).optional(),
    description: z.object({
      urls: z.array(UrlEntitySchema).optional(),
      hashtags: z.array(HashtagEntitySchema).optional(),
      cashtags: z.array(CashtagEntitySchema).optional(),
      mentions: z.array(MentionEntitySchema).optional()
    }).optional()
  }).optional(),
  profile_image_url: z.string().optional(),
  public_metrics: z.object({
    followers_count: z.number().optional(),
    following_count: z.number().optional(),
    tweet_count: z.number().optional(),
    listed_count: z.number().optional(),
    like_count: z.number().optional(),
    media_count: z.number().optional()
  }).optional(),
  url: z.string().optional()
});

/** Shape of a place object taken from the response `includes.places`. */
const PlaceV2Schema = z.object({
  id: z.string(),
  full_name: z.string(),
  contained_within: z.array(z.string()).optional(),
  country: z.string().optional(),
  country_code: z.string().optional(),
  geo: z.object({
    type: z.string(),
    bbox: z.array(z.number()),
    properties: z.any()
  }).optional(),
  name: z.string().optional(),
  place_type: z.string().optional()
});

const MediaVariantsV2Schema = z.object({
  bit_rate: z.number().optional(),
  content_type: z.union([
    z.literal("video/mp4"),
    z.literal("application/x-mpegURL"),
    z.string()
  ]),
  url: z.string()
});

/** Shape of a media object taken from the response `includes.media`. */
const MediaObjectV2Schema = z.object({
  media_key: z.string(),
  type: z.union([
    z.literal("video"),
    z.literal("animated_gif"),
    z.literal("photo"),
    z.string()
  ]),
  url: z.string().optional(),
  preview_image_url: z.string().optional(),
  width: z.number().optional(),
  height: z.number().optional(),
  alt_text: z.string().optional(),
  duration_ms: z.number().optional(),
  public_metrics: z.object({
    view_count: z.number()
  }).optional(),
  variants: z.array(MediaVariantsV2Schema).optional()
});

/** Shape of a poll object taken from the response `includes.polls`. */
const PollV2Schema = z.object({
  id: z.string(),
  options: z.array(
    z.object({
      position: z.number(),
      label: z.string(),
      votes: z.number()
    })
  ),
  duration_minutes: z.number().optional(),
  end_datetime: z.string().optional(),
  voting_status: z.string().optional()
});

// Unassigned schema expression kept exactly as emitted in the bundle; nothing in this file reads it.
z.object({
  users: z.array(UserV2Schema).optional(),
  places: z.array(PlaceV2Schema).optional(),
  media: z.array(MediaObjectV2Schema).optional(),
  polls: z.array(PollV2Schema).optional()
});

/** Schema of one collection entry: the processed tweet plus its resolved related objects. */
const TweetSchema = z.object({
  id: z.string(),
  tweet: TweetV2ExtendedSchema,
  user: z.union([UserV2Schema, z.null()]),
  place: z.union([PlaceV2Schema, z.null()]),
  media: z.union([z.array(MediaObjectV2Schema), z.null()]),
  poll: z.union([z.array(PollV2Schema), z.null()])
});

/** Matches one or more whitespace-separated `t.co` links sitting at the very end of a string. */
const TRAILING_URLS_REGEX = /(\s+https?:\/\/t\.co\/[A-Za-z0-9]+)+$/;

/** Characters replaced by `escapeHTML` and the entities they become. */
const HTML_ESCAPE_MAP = {
  "&": "&amp;",
  "<": "&lt;",
  ">": "&gt;",
  '"': "&quot;",
  "'": "&#39;"
};

/**
 * Collects the run of `t.co` links at the end of the text.
 *
 * @param {string} text - Raw tweet text.
 * @returns {string[]} The trailing links in order of appearance, or an empty array when
 *   the text is blank or does not end with whitespace-preceded `t.co` links.
 */
function getTrailingUrls(text) {
  if (text.trimEnd().length === 0) return [];
  const match = text.match(TRAILING_URLS_REGEX);
  if (!match) return [];
  return match[0].trim().split(/\s+/);
}

/**
 * Cuts the text at the last occurrence of `urlToRemove` and trims trailing whitespace.
 * Everything from that occurrence to the end of the text is dropped.
 *
 * @param {string} text - Text to shorten.
 * @param {string} urlToRemove - Substring to search for from the end.
 * @returns {string} The shortened text, or `text` unchanged when the substring is absent.
 */
function cleanupUrlFromText(text, urlToRemove) {
  const idx = text.lastIndexOf(urlToRemove);
  if (idx === -1) return text;
  return text.slice(0, idx).trimEnd();
}

/**
 * Escapes `&`, `<`, `>`, `"` and `'` for safe use in HTML text and attribute values.
 *
 * @param {string | null | undefined} str - Value to escape.
 * @returns {string} The escaped string, or `""` when `str` is null or undefined.
 */
function escapeHTML(str) {
  return str?.replace(/[&<>"']/g, (match) => HTML_ESCAPE_MAP[match] || match) ?? "";
}

/**
 * Builds the `hostname + pathname` label for a link.
 *
 * @param {string} url - Absolute URL.
 * @returns {string} Hostname followed by pathname; the input itself when it cannot be
 *   parsed, so a single malformed URL does not abort processing.
 */
function getDomainAndPath(url) {
  try {
    const parsedUrl = new URL(url);
    return `${parsedUrl.hostname}${parsedUrl.pathname}`;
  } catch {
    return url;
  }
}

/**
 * Replaces the first literal occurrence of `search` in `source` with `replacement`.
 * A replacer function is used so `$&`, `$'`, `` $` `` and `$n` sequences inside the
 * replacement are inserted verbatim instead of being read as substitution patterns.
 *
 * @param {string} source - Text to search in.
 * @param {string} search - Literal substring to replace.
 * @param {string} replacement - Text inserted as-is.
 * @returns {string} The text with at most one occurrence replaced.
 */
function replaceFirstLiteral(source, search, replacement) {
  return source.replace(search, () => replacement);
}

/**
 * Derives `text_html`, `text_markdown`, `view_type` and `url_for_link_view` for a tweet.
 *
 * Steps: optionally strip trailing `t.co` links, then turn hashtags, cashtags, mentions
 * and `t.co` links that have a matching entity into anchors / Markdown links, and
 * finally apply the newline handling to the HTML variant only.
 *
 * @param {object} tweet - Tweet object with `text` and optional `entities` / `attachments`.
 * @param {object} options - Text processing options.
 * @param {boolean} options.removeTrailingUrls - Strip the trailing link(s) from the text.
 * @param {"domain-path" | "post-text"} options.linkTextType - Label used for link text.
 * @param {"none" | "break" | "paragraph"} options.newlineHandling - Newline treatment in `text_html`.
 * @returns {object} A copy of `tweet` extended with the four derived fields.
 */
function processTweetText(tweet, options) {
  const { removeTrailingUrls, linkTextType, newlineHandling } = options;
  const entities = tweet.entities || {};
  const urls = entities.urls || [];
  const hashtags = entities.hashtags || [];
  const cashtags = entities.cashtags || [];
  const mentions = entities.mentions || [];

  let text = tweet.text;
  let viewType = "none";
  let urlForLinkView;

  const trailingUrls = getTrailingUrls(text);
  if (trailingUrls.length > 0) {
    const lastUrl = trailingUrls[trailingUrls.length - 1];
    const foundUrl = urls.find((u) => u.url === lastUrl);
    const hasMedia = Boolean(foundUrl?.media_key) || (tweet.attachments?.media_keys?.length ?? 0) > 0;
    viewType = hasMedia ? "media" : "link";
    if (viewType === "link") urlForLinkView = foundUrl?.expanded_url;

    if (removeTrailingUrls) {
      text = cleanupUrlFromText(text, lastUrl);
      // When the last link points at a tweet status, also strip any earlier trailing links that are media.
      if (foundUrl && /https:\/\/twitter\.com\/\w+\/status\/\d+/.test(foundUrl.expanded_url)) {
        for (let i = trailingUrls.length - 2; i >= 0; i--) {
          const u = trailingUrls[i];
          const entity = urls.find((e) => e.url === u);
          if (entity?.media_key) {
            text = cleanupUrlFromText(text, u);
          }
        }
      }
    }
  }

  let textHtml = text;
  let textMarkdown = text;

  if (hashtags.length > 0) {
    textHtml = textHtml.replace(
      /(^|\W)#([\p{L}\p{M}\w]+)/gu,
      (match, prefix, tag) => {
        const entity = hashtags.find((h) => h.tag === tag);
        if (!entity) return match;
        const escapedTag = escapeHTML(tag);
        const url = `https://x.com/hashtag/${encodeURIComponent(escapedTag)}`;
        const htmlLink = `${prefix}<a href="${url}" target="_blank" rel="noopener noreferrer" aria-label="Hashtag #${escapedTag}">#${escapedTag}</a>`;
        const mdLink = `${prefix}[#${escapedTag}](${url})`;
        textMarkdown = replaceFirstLiteral(textMarkdown, match, mdLink);
        return htmlLink;
      }
    );
  }

  if (cashtags.length > 0) {
    textHtml = textHtml.replace(
      /(^|\W)\$([A-Za-z0-9]+)/g,
      (match, prefix, tag) => {
        const entity = cashtags.find((c) => c.tag === tag);
        if (!entity) return match;
        const escapedTag = escapeHTML(tag);
        const url = `https://x.com/search?q=%24${encodeURIComponent(escapedTag)}`;
        const htmlLink = `${prefix}<a href="${url}" target="_blank" rel="noopener noreferrer" aria-label="Cashtag $${escapedTag}">$${escapedTag}</a>`;
        const mdLink = `${prefix}[$${escapedTag}](${url})`;
        textMarkdown = replaceFirstLiteral(textMarkdown, match, mdLink);
        return htmlLink;
      }
    );
  }

  if (mentions.length > 0) {
    textHtml = textHtml.replace(/(^|\W)@(\w+)/g, (match, prefix, username) => {
      const entity = mentions.find((m) => m.username === username);
      if (!entity) return match;
      const escapedUsername = escapeHTML(username);
      const url = `https://x.com/${encodeURIComponent(entity.username)}`;
      const htmlLink = `${prefix}<a href="${url}" target="_blank" rel="noopener noreferrer" aria-label="Mention @${escapedUsername}">@${escapedUsername}</a>`;
      const mdLink = `${prefix}[@${escapedUsername}](${url})`;
      textMarkdown = replaceFirstLiteral(textMarkdown, match, mdLink);
      return htmlLink;
    });
  }

  let lastMatchedUrlEntity;
  if (urls.length > 0) {
    textHtml = textHtml.replace(/https?:\/\/t\.co\/[A-Za-z0-9]+/g, (match) => {
      const entity = urls.find((e) => e.url === match);
      if (!entity) return match;
      lastMatchedUrlEntity = entity;
      const escapedUrl = escapeHTML(entity.expanded_url);
      const escapedText = linkTextType === "domain-path"
        ? escapeHTML(getDomainAndPath(entity.expanded_url))
        : escapeHTML(entity.display_url);
      const htmlLink = `<a href="${escapedUrl}" target="_blank" rel="noopener noreferrer" aria-label="Link to ${escapedText}">${escapedText}</a>`;
      const mdLink = `[${escapedText}](${escapedUrl})`;
      textMarkdown = replaceFirstLiteral(textMarkdown, match, mdLink);
      return htmlLink;
    });
  }

  // No trailing link was detected: fall back to the last link found inside the text.
  if (viewType === "none" && lastMatchedUrlEntity) {
    viewType = lastMatchedUrlEntity.media_key ? "media" : "link";
    if (viewType === "link") urlForLinkView = lastMatchedUrlEntity.expanded_url;
  }

  if (newlineHandling === "break") {
    textHtml = textHtml.replace(/\n+/g, "<br />\n");
  } else if (newlineHandling === "paragraph") {
    // Blank lines are dropped; every remaining line becomes its own paragraph.
    textHtml = textHtml
      .split("\n")
      .map((line) => line.trim())
      .filter((line) => line.length > 0)
      .map((line) => `<p>${line}</p>`)
      .join("");
  }

  return {
    ...tweet,
    text_html: textHtml,
    text_markdown: textMarkdown,
    view_type: viewType,
    url_for_link_view: urlForLinkView
  };
}

/**
 * Pairs each processed tweet with its author, place, media and polls from `includes`.
 *
 * @param {object[]} processedTweets - Tweets returned by `processTweetText`.
 * @param {object} [includes] - Response `includes` with optional `users`, `places`,
 *   `media` and `polls` arrays.
 * @returns {object[]} Entries of the form `{ id, tweet, user, place, media, poll }`;
 *   a related field is `null` when nothing matches or `includes` is missing.
 */
function processTweets(processedTweets, includes) {
  return processedTweets.map((tweet) => {
    const entry = {
      id: tweet.id,
      tweet,
      user: null,
      place: null,
      media: null,
      poll: null
    };
    if (!includes) return entry;

    if (tweet.author_id && includes.users) {
      entry.user = includes.users.find((user) => user.id === tweet.author_id) || null;
    }

    const placeId = tweet.geo?.place_id;
    if (placeId && includes.places) {
      entry.place = includes.places.find((place) => place.id === placeId) || null;
    }

    const mediaKeys = tweet.attachments?.media_keys;
    if (mediaKeys && includes.media) {
      const mediaArray = includes.media.filter((media) => mediaKeys.includes(media.media_key));
      entry.media = mediaArray.length > 0 ? mediaArray : null;
    }

    const pollIds = tweet.attachments?.poll_ids;
    if (pollIds && includes.polls) {
      const pollArray = includes.polls.filter((poll) => pollIds.includes(poll.id));
      entry.poll = pollArray.length > 0 ? pollArray : null;
    }

    return entry;
  });
}

/** Minimal shape required of an existing custom store file: an array of objects with a string `id`. */
const SavedTweets = z.array(
  z.object({
    id: z.string()
  }).passthrough()
);

/**
 * Writes tweets to a JSON file, merging with the entries the file already holds.
 *
 * Existing entries keep their position; an entry whose `id` matches a newly loaded tweet
 * is overwritten by it, and tweets with new ids are appended.
 *
 * @param {object[]} tweets - Newly loaded entries, each with a string `id`.
 * @param {string} storePath - Absolute path, or a path relative to `process.cwd()`.
 * @returns {Promise<void>}
 * @throws {Error} When the existing file is not valid JSON or is not an array of
 *   objects with a string `id`; file system errors from reading or writing also propagate.
 */
async function saveOrUpdateTweets(tweets, storePath) {
  const resolvedPath = path.isAbsolute(storePath)
    ? storePath
    : path.resolve(process.cwd(), storePath);

  let savedTweets = [];
  let fileExists = true;
  try {
    await fs.access(resolvedPath);
  } catch {
    fileExists = false;
  }

  if (fileExists) {
    const fileContent = await fs.readFile(resolvedPath, "utf-8");
    const parsedContent = JSON.parse(fileContent);
    const parsedData = SavedTweets.safeParse(parsedContent);
    if (!parsedData.success) {
      throw new Error(
        "Invalid JSON format. Ensure the file contains an array of objects, each with a valid `id` field as a string."
      );
    }
    const savedTweetsMap = new Map(parsedData.data.map((t) => [t.id, t]));
    // Merge the NEW tweets over the stored ones so re-fetched tweets are refreshed.
    for (const newTweet of tweets) {
      savedTweetsMap.set(newTweet.id, newTweet);
    }
    savedTweets = Array.from(savedTweetsMap.values());
  } else {
    savedTweets = tweets;
  }

  await fs.mkdir(path.dirname(resolvedPath), { recursive: true });
  await fs.writeFile(resolvedPath, JSON.stringify(savedTweets, null, 2), "utf8");
}

// src/index.ts
/** Number of ids sent per `client.v2.tweets` call (presumably the API's per-request cap — confirm). */
const MAX_IDS_PER_REQUEST = 100;

/**
 * Returns the message of a thrown value, tolerating values that are not `Error` instances.
 *
 * @param {unknown} error - Value caught by a `catch` clause.
 * @returns {string} `error.message` for errors, otherwise the stringified value.
 */
function getErrorMessage(error) {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Creates an Astro content loader that fetches tweets by id.
 *
 * @param {object} userConfig - Loader options, validated against `TweetsLoaderConfigSchema`.
 * @returns {{ name: string, schema: object, load: Function }} The loader object. `load`
 *   logs problems through the provided `logger` and returns without throwing on invalid
 *   configuration, a missing token, or a failed request.
 */
function tweetsLoader(userConfig) {
  return {
    name: package_default.name,
    schema: TweetSchema,
    async load({ logger, store, parseData, generateDigest }) {
      const parsedConfig = TweetsLoaderConfigSchema.safeParse(userConfig);
      if (!parsedConfig.success) {
        logger.error(
          `The configuration provided is invalid. ${parsedConfig.error.issues.map((issue) => issue.message).join("\n")}. Check out the configuration: ${package_default.homepage}README.md#configuration.`
        );
        return;
      }

      const { ids, storage, storePath, authToken, ...processTweetTextConfig } = parsedConfig.data;
      const token = authToken || import.meta.env.X_TOKEN;

      if (ids.length === 0) {
        logger.warn("No tweet IDs provided and no tweets will be loaded");
        return;
      }
      if (!token) {
        logger.error(
          "No X (Twitter) token provided. Please provide a `authToken` or set the `X_TOKEN` environment variable.\nHow to create an X app-only Bearer Token: https://developer.x.com/en/docs/authentication/oauth-2-0/bearer-tokens.\nHow to store token in Astro project environment variables: https://docs.astro.build/en/guides/environment-variables/#setting-environment-variables."
        );
        return;
      }

      logger.info(`Loading ${ids.length} tweets`);
      const tweets = [];
      const client = new TwitterApi(token);

      try {
        // Fetch sequentially in batches of MAX_IDS_PER_REQUEST ids.
        let index = 0;
        while (index < ids.length) {
          const batchIds = ids.slice(index, index + MAX_IDS_PER_REQUEST);
          const res = await client.v2.tweets(batchIds, getTweetsApiOptions);
          const processedTweets = res.data.map(
            (tweet) => processTweetText(tweet, processTweetTextConfig)
          );
          tweets.push(...processTweets(processedTweets, res.includes));
          index += MAX_IDS_PER_REQUEST;
        }

        if (storage === "default" || storage === "both") {
          for (const item of tweets) {
            const parsedItem = await parseData({ id: item.id, data: item });
            store.set({
              id: item.id,
              data: parsedItem,
              digest: generateDigest(parsedItem),
              rendered: { html: item.tweet.text_html }
            });
          }
          logger.info(
            `Successfully loaded ${tweets.length} tweets into the Astro store`
          );
        }

        if (storage === "custom" || storage === "both") {
          // A failed file write is reported separately so it is not logged as a failed load.
          try {
            await saveOrUpdateTweets(tweets, storePath);
            logger.info(
              `Successfully loaded ${tweets.length} tweets into '${storePath}'`
            );
          } catch (error) {
            logger.error(
              `Failed to save tweets to '${storePath}'. ${getErrorMessage(error)}`
            );
          }
        }
      } catch (error) {
        if (error instanceof ApiResponseError && error.rateLimitError && error.rateLimit) {
          logger.warn(
            `Please try again later as the rate limit of ${error.rateLimit.limit} per 15 minutes is exceeded with ${error.rateLimit.remaining} left`
          );
        } else {
          logger.error(`Failed to load tweets. ${getErrorMessage(error)}`);
        }
      }
    }
  };
}

export { TweetSchema, tweetsLoader };