UNPKG

url-to-json-markdown

Version: (not captured in this snapshot)

A TypeScript library that fetches URLs and converts them to structured JSON and Markdown format.

508 lines (494 loc) 17.6 kB
#!/usr/bin/env node
"use strict";

// ---------------------------------------------------------------------------
// esbuild CommonJS-interop helpers (bundler-generated; kept verbatim).
// ---------------------------------------------------------------------------
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;

// Copy enumerable own properties from `from` onto `to` as live getters,
// skipping `except` and anything already present on `to`.
var __copyProps = (to, from, except, desc) => {
  if ((from && typeof from === "object") || typeof from === "function") {
    for (let key of __getOwnPropNames(from))
      if (!__hasOwnProp.call(to, key) && key !== except)
        __defProp(to, key, {
          get: () => from[key],
          enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable
        });
  }
  return to;
};

// Wrap a CommonJS module so it can be consumed as if it were an ES module.
var __toESM = (mod, isNodeMode, target) => (
  target = mod != null ? __create(__getProtoOf(mod)) : {},
  __copyProps(
    // If the importer is in node compatibility mode or this is not an ESM
    // file that has been converted to a CommonJS file using a Babel-
    // compatible transform (i.e. "__esModule" has not been set), then set
    // "default" to the CommonJS "module.exports" for node compatibility.
    isNodeMode || !mod || !mod.__esModule
      ? __defProp(target, "default", { value: mod, enumerable: true })
      : target,
    mod
  )
);

// src/index.ts
var import_turndown = __toESM(require("turndown"), 1);
var import_jsdom = require("jsdom");

/**
 * Fetch a URL and convert it to a `{ title, content, type }` result, where
 * `content` is Markdown and `type` is "reddit", "twitter" or "generic".
 *
 * @param {string} url - The URL to fetch.
 * @param {object} [options] - Optional settings: `clientId`/`clientSecret`
 *   (Reddit OAuth), `includeComments`, `enableArchiveFallback`.
 * @returns {Promise<{title: string, content: string, type: string}>}
 */
async function urlToJsonMarkdown(url, options) {
  if (isRedditUrl(url)) {
    return await parseRedditUrl(url, options);
  } else if (isTwitterUrl(url)) {
    return await parseTwitterUrl(url);
  } else {
    return await parseGenericUrl(url, options);
  }
}

/**
 * Obtain a Reddit OAuth application-only access token via the
 * client-credentials grant.
 *
 * @param {{clientId: string, clientSecret: string}} credentials
 * @returns {Promise<string>} The bearer token.
 * @throws {Error} When the token endpoint responds with a non-2xx status.
 */
async function getRedditToken(credentials) {
  const auth = Buffer.from(
    `${credentials.clientId}:${credentials.clientSecret}`
  ).toString("base64");
  const response = await fetch("https://www.reddit.com/api/v1/access_token", {
    method: "POST",
    headers: {
      Authorization: `Basic ${auth}`,
      "Content-Type": "application/x-www-form-urlencoded"
    },
    body: "grant_type=client_credentials"
  });
  if (!response.ok) {
    throw new Error(
      `Failed to get Reddit token: ${response.status} ${response.statusText}`
    );
  }
  const data = await response.json();
  return data.access_token;
}

/**
 * Parse a tweet URL via Twitter's public oEmbed endpoint (no auth needed).
 * The tweet text is extracted from the first <p> of the embed HTML.
 *
 * @param {string} url - A twitter.com / x.com status URL.
 * @returns {Promise<{title: string, content: string, type: "twitter"}>}
 * @throws {Error} On an invalid URL or a failed oEmbed fetch.
 */
async function parseTwitterUrl(url) {
  try {
    const tweetIdMatch = url.match(/status\/(\d+)/);
    if (!tweetIdMatch) {
      throw new Error("Invalid Twitter URL: Could not extract tweet ID");
    }
    const embedUrl = `https://publish.twitter.com/oembed?url=${encodeURIComponent(
      url
    )}&omit_script=true`;
    const response = await fetch(embedUrl, {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        Accept: "application/json"
      }
    });
    if (!response.ok) {
      throw new Error(
        `Failed to fetch Twitter embed data: ${response.status} ${response.statusText}`
      );
    }
    const embedData = await response.json();
    const dom = new import_jsdom.JSDOM(embedData.html);
    const document = dom.window.document;
    const tweetTextElement = document.querySelector("p");
    const tweetText = tweetTextElement?.textContent?.trim() || "";
    const authorName = embedData.author_name || "Unknown";
    const title = `Tweet by ${authorName}`;
    const content = `# ${title}

${tweetText}

---

Author: ${authorName}
URL: ${url}`;
    return { title, content, type: "twitter" };
  } catch (error) {
    if (error instanceof Error) {
      throw error;
    }
    throw new Error(`Failed to parse Twitter URL: ${error}`);
  }
}

/**
 * Parse a Reddit post or comment URL via the JSON API.
 *
 * Uses OAuth (oauth.reddit.com) when credentials are supplied, otherwise the
 * public `.json` endpoint with browser-like headers. If the URL points at a
 * specific comment (`/comment/<id>`), that comment is returned instead of the
 * post.
 *
 * @param {string} url - A reddit.com URL.
 * @param {object} [options] - `clientId`/`clientSecret`, `includeComments`.
 * @returns {Promise<{title: string, content: string, type: "reddit"}>}
 * @throws {Error} On fetch failure, or when the public endpoint returns HTML
 *   (typically rate limiting) instead of JSON.
 */
async function parseRedditUrl(url, options) {
  try {
    let apiUrl;
    let headers;
    if (options?.clientId && options?.clientSecret) {
      const token = await getRedditToken({
        clientId: options.clientId,
        clientSecret: options.clientSecret
      });
      apiUrl = convertToOAuthUrl(url);
      headers = {
        Authorization: `Bearer ${token}`,
        "User-Agent": "web:url-to-json-markdown:v1.0.0 (by /u/paradite)"
      };
    } else {
      apiUrl = convertToPublicJsonUrl(url);
      headers = {
        "User-Agent":
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        Accept: "application/json, text/plain, */*",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Cache-Control": "no-cache",
        Pragma: "no-cache"
      };
    }
    const response = await fetch(apiUrl, { headers });
    if (!response.ok) {
      throw new Error(
        `Failed to fetch Reddit data: ${response.status} ${response.statusText}`
      );
    }
    // The public endpoint sometimes serves an HTML block page instead of
    // JSON; detect that before attempting to parse.
    if (!options?.clientId || !options?.clientSecret) {
      const contentType = response.headers.get("content-type");
      if (!contentType?.includes("application/json")) {
        throw new Error(
          "Reddit returned HTML instead of JSON. This may be due to rate limiting or CORS restrictions. Consider providing Reddit credentials for more reliable access."
        );
      }
    }
    const data = await response.json();
    // data[0] is the post listing; data[1] (when present) is the comments.
    const post = data[0].data.children[0].data;
    const commentId = extractCommentId(url);
    if (commentId && data.length > 1) {
      const comment = findCommentById(data[1], commentId);
      if (comment) {
        const commentTitle = extractCommentTitle(comment.body);
        return {
          title: commentTitle,
          content: formatCommentToMarkdown(comment, post.title, options),
          type: "reddit"
        };
      }
    }
    const comments = data.length > 1 ? data[1].data.children : null;
    return {
      title: post.title,
      content: formatPostToMarkdown(post, comments, options),
      type: "reddit"
    };
  } catch (error) {
    if (error instanceof Error) {
      throw error;
    }
    throw new Error(`Failed to convert URL to JSON/Markdown: ${error}`);
  }
}

/**
 * Fetch an arbitrary web page and convert it to Markdown.
 *
 * On a 403 response, optionally retries via the Internet Archive Wayback
 * Machine (`options.enableArchiveFallback`) before giving up with an
 * explanatory error.
 *
 * @param {string} url - The page URL.
 * @param {object} [options] - `enableArchiveFallback`.
 * @returns {Promise<{title: string, content: string, type: "generic"}>}
 * @throws {Error} When the fetch fails and no fallback succeeds.
 */
async function parseGenericUrl(url, options) {
  try {
    const response = await fetch(url, {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "Accept":
          "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache"
      }
    });
    if (!response.ok) {
      throw new Error(
        `Failed to fetch URL: ${response.status} ${response.statusText}`
      );
    }
    const html = await response.text();
    return processHtmlContent(html, url);
  } catch (error) {
    const lastError = error instanceof Error ? error : new Error(String(error));
    if (lastError.message.includes("403") && options?.enableArchiveFallback) {
      try {
        console.log("Attempting archive.org fallback...");
        const archiveUrl = `https://web.archive.org/web/2/${url}`;
        const response = await fetch(archiveUrl, {
          headers: {
            "User-Agent":
              "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
            "Accept":
              "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
          }
        });
        if (response.ok) {
          const html = await response.text();
          const result = processHtmlContent(html, url);
          result.title = `[Archived] ${result.title}`;
          result.content =
            result.content +
            `

---

*Retrieved from [Internet Archive Wayback Machine](${archiveUrl})*`;
          return result;
        }
      } catch (archiveError) {
        // Best-effort fallback: don't mask the original 403, but do surface
        // why the archive attempt failed instead of swallowing it silently.
        console.error(
          "Archive fallback failed:",
          archiveError instanceof Error
            ? archiveError.message
            : String(archiveError)
        );
      }
    }
    if (lastError.message.includes("403")) {
      const errorMessage = `Failed to fetch URL: 403 Forbidden. The website "${new URL(url).hostname}" is blocking automated requests.

This is common for sites with anti-bot protection (like Cloudflare). Try accessing the URL manually in a browser first, or the site may require authentication.`;
      if (!options?.enableArchiveFallback) {
        throw new Error(
          errorMessage + ` You can also try enabling the archive fallback option.`
        );
      } else {
        throw new Error(
          errorMessage + ` Archive fallback was attempted but also failed.`
        );
      }
    }
    throw lastError;
  }
}

/**
 * Convert raw HTML to `{ title, content, type: "generic" }` Markdown.
 *
 * Strategy: strip chrome elements (script/style/nav/footer/header/aside),
 * pick the first non-empty element from a priority list of common content
 * selectors, then run it through Turndown.
 *
 * @param {string} html - The page HTML.
 * @param {string} _url - Unused; kept for interface stability.
 * @returns {{title: string, content: string, type: "generic"}}
 */
function processHtmlContent(html, _url) {
  try {
    const dom = new import_jsdom.JSDOM(html);
    const document = dom.window.document;
    const title =
      document.querySelector("title")?.textContent?.trim() ||
      document.querySelector("h1")?.textContent?.trim() ||
      "Untitled";
    const scripts = document.querySelectorAll(
      "script, style, nav, footer, header, aside"
    );
    scripts.forEach((element) => element.remove());
    // Most-specific first; "body" is the final catch-all.
    const contentSelectors = [
      "article",
      "main",
      '[role="main"]',
      ".content",
      ".post-content",
      ".entry-content",
      ".article-content",
      "body"
    ];
    let contentElement = null;
    for (const selector of contentSelectors) {
      contentElement = document.querySelector(selector);
      if (contentElement && contentElement.textContent?.trim()) {
        break;
      }
    }
    if (!contentElement) {
      contentElement = document.body;
    }
    const turndownService = new import_turndown.default({
      headingStyle: "atx",
      bulletListMarker: "-",
      codeBlockStyle: "fenced"
    });
    const markdown = turndownService.turndown(contentElement.innerHTML);
    return { title, content: markdown, type: "generic" };
  } catch (error) {
    if (error instanceof Error) {
      throw error;
    }
    throw new Error(`Failed to parse generic URL: ${error}`);
  }
}

/**
 * Normalize a Reddit URL's pathname to its `.json` form: trailing slash
 * removed, `.json` appended unless already present. Shared by the OAuth and
 * public-endpoint URL builders.
 *
 * @param {string} url - Any reddit.com URL.
 * @returns {string} The pathname ending in `.json`.
 */
function toRedditJsonPath(url) {
  const urlObj = new URL(url);
  let path = urlObj.pathname;
  if (path.endsWith("/")) {
    path = path.slice(0, -1);
  }
  return path.endsWith(".json") ? path : `${path}.json`;
}

/**
 * Build the authenticated (oauth.reddit.com) JSON API URL for a Reddit URL.
 * @param {string} url
 * @returns {string}
 */
function convertToOAuthUrl(url) {
  return `https://oauth.reddit.com${toRedditJsonPath(url)}`;
}

/**
 * Build the public (www.reddit.com) `.json` URL for a Reddit URL.
 * @param {string} url
 * @returns {string}
 */
function convertToPublicJsonUrl(url) {
  return `https://www.reddit.com${toRedditJsonPath(url)}`;
}

/**
 * True when the URL's hostname is reddit.com or www.reddit.com.
 * Invalid URLs return false rather than throwing.
 * @param {string} url
 * @returns {boolean}
 */
function isRedditUrl(url) {
  try {
    const urlObj = new URL(url);
    return (
      urlObj.hostname === "www.reddit.com" || urlObj.hostname === "reddit.com"
    );
  } catch {
    return false;
  }
}

/**
 * True when the URL's hostname is twitter.com / x.com (with or without www).
 * Invalid URLs return false rather than throwing.
 * @param {string} url
 * @returns {boolean}
 */
function isTwitterUrl(url) {
  try {
    const urlObj = new URL(url);
    return (
      urlObj.hostname === "x.com" ||
      urlObj.hostname === "twitter.com" ||
      urlObj.hostname === "www.x.com" ||
      urlObj.hostname === "www.twitter.com"
    );
  } catch {
    return false;
  }
}

/**
 * Extract the comment id from a Reddit permalink of the form
 * `.../comment/<id>/`, or null when the URL targets a whole post.
 * @param {string} url
 * @returns {string|null}
 */
function extractCommentId(url) {
  const match = url.match(/\/comment\/([a-zA-Z0-9]+)\/?/);
  return match ? match[1] : null;
}

/**
 * Replace typographic Unicode punctuation (curly quotes, en/em dashes,
 * ellipsis) with plain ASCII equivalents.
 * @param {string} text
 * @returns {string}
 */
function normalizeQuotes(text) {
  return text
    .replace(/[\u2018\u2019]/g, "'")
    .replace(/[\u201C\u201D]/g, '"')
    .replace(/[\u2013\u2014]/g, "-")
    .replace(/\u2026/g, "...");
}

/**
 * Derive a title for a comment from the first line of its body,
 * truncated to 200 characters. Falls back to "Untitled Comment" for
 * empty or trivially short bodies.
 * @param {string} commentBody
 * @returns {string}
 */
function extractCommentTitle(commentBody) {
  if (!commentBody) {
    return "Untitled Comment";
  }
  const firstLine = normalizeQuotes(commentBody.split("\n")[0].trim());
  if (!firstLine || firstLine.length < 3) {
    return "Untitled Comment";
  }
  if (firstLine.length > 200) {
    return firstLine.substring(0, 197) + "...";
  }
  return firstLine;
}

/**
 * Depth-first search of a Reddit comments Listing for a comment (`t1`)
 * with the given id, recursing into each comment's `replies` Listing.
 *
 * @param {object} commentsListing - A Reddit Listing object.
 * @param {string} commentId - The comment id to find.
 * @returns {object|null} The comment's `data` object, or null.
 */
function findCommentById(commentsListing, commentId) {
  if (!commentsListing?.data?.children) {
    return null;
  }
  for (const child of commentsListing.data.children) {
    if (child.kind === "t1" && child.data.id === commentId) {
      return child.data;
    }
    // `replies` is "" when empty and a Listing object otherwise.
    if (child.data.replies && typeof child.data.replies === "object") {
      const found = findCommentById(child.data.replies, commentId);
      if (found) {
        return found;
      }
    }
  }
  return null;
}

/**
 * Format a single Reddit comment as Markdown: heading, body, permalink,
 * author/vote/date footer, and (when `options.includeComments`) its replies.
 *
 * @param {object} comment - The comment `data` object.
 * @param {string} [postTitle] - Title of the parent post, for the heading.
 * @param {object} [options] - `includeComments`.
 * @returns {string}
 */
function formatCommentToMarkdown(comment, postTitle, options) {
  const titleSuffix = postTitle ? ` on "${normalizeQuotes(postTitle)}"` : "";
  let markdown = `# Comment by ${comment.author}${titleSuffix}

`;
  if (comment.body) {
    markdown += `${normalizeQuotes(comment.body)}

`;
  }
  markdown += `[permalink](https://reddit.com${comment.permalink})

`;
  // Reddit timestamps are epoch seconds; Date wants milliseconds.
  const createdDate = new Date(comment.created_utc * 1e3).toLocaleString(
    "en-US",
    {
      year: "numeric",
      month: "2-digit",
      day: "2-digit",
      hour: "2-digit",
      minute: "2-digit",
      second: "2-digit"
    }
  );
  markdown += `by *${comment.author}* (\u2191 ${comment.ups}/ \u2193 ${comment.downs}) ${createdDate}`;
  if (
    options?.includeComments &&
    comment.replies &&
    typeof comment.replies === "object" &&
    comment.replies.data?.children
  ) {
    markdown += "\n\n## Replies\n\n";
    comment.replies.data.children.forEach((reply) => {
      markdown += formatCommentTree(reply);
    });
  }
  return markdown;
}

/**
 * Recursively render a comment subtree as Markdown. Nesting depth is shown
 * with box-drawing characters (├──); top-level comments use an h5 heading
 * and are closed with a └──── rule.
 *
 * @param {object} comment - A Listing child (`{kind, data}`).
 * @returns {string}
 */
function formatCommentTree(comment) {
  if (!comment.data) {
    return "";
  }
  let output = "";
  const depth = comment.data.depth || 0;
  let depthIndicator = "";
  if (depth > 0) {
    depthIndicator = `\u251C${"\u2500".repeat(depth)} `;
  } else {
    depthIndicator = "##### ";
  }
  if (comment.data.body) {
    const commentBody = normalizeQuotes(comment.data.body);
    output += `${depthIndicator}${commentBody}

\u23E4 by *${comment.data.author}* (\u2191 ${comment.data.ups}/ \u2193 ${comment.data.downs})

`;
  } else {
    output += `${depthIndicator}deleted

`;
  }
  if (
    comment.data.replies &&
    typeof comment.data.replies === "object" &&
    comment.data.replies.data?.children
  ) {
    comment.data.replies.data.children.forEach((reply) => {
      output += formatCommentTree(reply);
    });
  }
  if (depth === 0 && comment.data.replies) {
    output += "\u2514\u2500\u2500\u2500\u2500\n\n";
  }
  return output;
}

/**
 * Format a Reddit post as Markdown: title, selftext, permalink,
 * author/vote/date footer, and (when `options.includeComments`) the
 * rendered comment trees.
 *
 * @param {object} post - The post `data` object.
 * @param {Array|null} comments - Listing children for the comments, or null.
 * @param {object} [options] - `includeComments`.
 * @returns {string}
 */
function formatPostToMarkdown(post, comments, options) {
  let markdown = `# ${normalizeQuotes(post.title)}

`;
  if (post.selftext) {
    markdown += `${normalizeQuotes(post.selftext)}

`;
  }
  markdown += `[permalink](https://reddit.com${post.permalink})

`;
  // Reddit timestamps are epoch seconds; Date wants milliseconds.
  const createdDate = new Date(post.created_utc * 1e3).toLocaleString(
    "en-US",
    {
      year: "numeric",
      month: "2-digit",
      day: "2-digit",
      hour: "2-digit",
      minute: "2-digit",
      second: "2-digit"
    }
  );
  markdown += `by *${post.author}* (\u2191 ${post.ups}/ \u2193 ${post.downs}) ${createdDate}`;
  if (options?.includeComments && comments && comments.length > 0) {
    markdown += "\n\n## Comments\n\n";
    comments.forEach((comment) => {
      markdown += formatCommentTree(comment);
    });
  }
  return markdown;
}

// src/cli.ts

/**
 * Print the CLI usage text, one line at a time, via the given writer
 * (console.log for requested help, console.error for usage errors).
 * Extracted so the text exists in exactly one place.
 *
 * @param {(line: string) => void} print
 */
function printUsage(print) {
  print("Usage: url-to-json-markdown <url> [options]");
  print("");
  print("Options:");
  print("  --client-id <id>          Reddit client ID (for authenticated requests)");
  print("  --client-secret <secret>  Reddit client secret (for authenticated requests)");
  print("  --include-comments        Include comments in the output");
  print("  --help                    Show this help message");
  print("");
  print("Examples:");
  print("  url-to-json-markdown https://www.reddit.com/r/programming/comments/123/title/");
  print("  url-to-json-markdown https://example.com/article --include-comments");
  print("  url-to-json-markdown https://reddit.com/r/test/comments/123/ --client-id abc --client-secret xyz");
}

/**
 * CLI entry point: parse arguments, run the conversion, and print the
 * result as pretty JSON. Exits 0 on success or explicit `--help`,
 * 1 on usage errors or conversion failure.
 *
 * FIX: previously `--help` as the first argument printed to stderr and
 * exited 1, while `--help` in any other position printed to stdout and
 * exited 0. Explicitly requested help now always succeeds.
 */
async function main() {
  const args = process.argv.slice(2);
  if (args[0] === "--help") {
    printUsage(console.log);
    process.exit(0);
  }
  if (args.length === 0) {
    printUsage(console.error);
    process.exit(1);
  }
  const url = args[0];
  const options = {};
  for (let i = 1; i < args.length; i++) {
    const arg = args[i];
    if (arg === "--help") {
      printUsage(console.log);
      process.exit(0);
    } else if (arg === "--client-id") {
      if (i + 1 >= args.length) {
        console.error("Error: --client-id requires a value");
        process.exit(1);
      }
      options.clientId = args[++i];
    } else if (arg === "--client-secret") {
      if (i + 1 >= args.length) {
        console.error("Error: --client-secret requires a value");
        process.exit(1);
      }
      options.clientSecret = args[++i];
    } else if (arg === "--include-comments") {
      options.includeComments = true;
    } else {
      console.error(`Error: Unknown option ${arg}`);
      process.exit(1);
    }
  }
  try {
    const result = await urlToJsonMarkdown(url, options);
    console.log(JSON.stringify(result, null, 2));
  } catch (error) {
    console.error(
      "Error:",
      error instanceof Error ? error.message : String(error)
    );
    process.exit(1);
  }
}

main().catch((error) => {
  console.error(
    "Unexpected error:",
    error instanceof Error ? error.message : String(error)
  );
  process.exit(1);
});