/*
 * url-to-json-markdown
 * Version: (not recorded in this bundle)
 * A TypeScript library that fetches URLs and converts them to structured
 * JSON and Markdown format.
 * Bundled JavaScript output: 508 lines (494 loc), 17.6 kB.
 */
;
// ---------------------------------------------------------------------------
// Bundler-generated CommonJS interop prelude (esbuild-style helper names).
// Lets this CommonJS bundle consume a require()'d CJS module (turndown)
// through an ESM-shaped namespace with a "default" export.
// ---------------------------------------------------------------------------
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;
// Copy every own property of `from` onto `to` as a live getter, skipping
// `except` and any key `to` already owns; enumerability is preserved.
var __copyProps = (to, from, except, desc) => {
if (from && typeof from === "object" || typeof from === "function") {
for (let key of __getOwnPropNames(from))
if (!__hasOwnProp.call(to, key) && key !== except)
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
}
return to;
};
// Wrap a required module so ESM-style `mod.default` access works.
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
// If the importer is in node compatibility mode or this is not an ESM
// file that has been converted to a CommonJS file using a Babel-
// compatible transform (i.e. "__esModule" has not been set), then set
// "default" to the CommonJS "module.exports" for node compatibility.
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
mod
));
// src/index.ts
var import_turndown = __toESM(require("turndown"), 1);
var import_jsdom = require("jsdom");
/**
 * Convert a URL into `{ title, content, type }` where `content` is Markdown.
 * Routes the URL to a site-specific parser based on its hostname.
 *
 * @param {string} url - The URL to fetch and convert.
 * @param {object} [options] - Reddit credentials, comment/fallback flags.
 * @returns {Promise<{title: string, content: string, type: string}>}
 */
async function urlToJsonMarkdown(url, options) {
  // Guard-style dispatch: first matching parser wins.
  if (isRedditUrl(url)) {
    return parseRedditUrl(url, options);
  }
  if (isTwitterUrl(url)) {
    return parseTwitterUrl(url);
  }
  return parseGenericUrl(url, options);
}
/**
 * Obtain an application-only OAuth token from Reddit using the
 * client-credentials grant.
 *
 * @param {{clientId: string, clientSecret: string}} credentials
 * @returns {Promise<string>} The bearer access token.
 * @throws {Error} When the token endpoint responds with a non-2xx status.
 */
async function getRedditToken(credentials) {
  const { clientId, clientSecret } = credentials;
  // Reddit expects HTTP Basic auth built from the app's id:secret pair.
  const basicAuth = Buffer.from(`${clientId}:${clientSecret}`).toString("base64");
  const tokenResponse = await fetch("https://www.reddit.com/api/v1/access_token", {
    method: "POST",
    headers: {
      Authorization: `Basic ${basicAuth}`,
      "Content-Type": "application/x-www-form-urlencoded"
    },
    body: "grant_type=client_credentials"
  });
  if (!tokenResponse.ok) {
    throw new Error(
      `Failed to get Reddit token: ${tokenResponse.status} ${tokenResponse.statusText}`
    );
  }
  const payload = await tokenResponse.json();
  return payload.access_token;
}
/**
 * Parse a Twitter/X status URL via the public oEmbed endpoint and render
 * the tweet as Markdown.
 *
 * @param {string} url - A twitter.com / x.com status URL.
 * @returns {Promise<{title: string, content: string, type: "twitter"}>}
 * @throws {Error} If the URL carries no tweet id or the oEmbed fetch fails.
 */
async function parseTwitterUrl(url) {
  try {
    // A status URL must contain a numeric tweet id segment.
    if (!/status\/(\d+)/.test(url)) {
      throw new Error("Invalid Twitter URL: Could not extract tweet ID");
    }
    const embedUrl = `https://publish.twitter.com/oembed?url=${encodeURIComponent(url)}&omit_script=true`;
    const response = await fetch(embedUrl, {
      headers: {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        Accept: "application/json"
      }
    });
    if (!response.ok) {
      throw new Error(
        `Failed to fetch Twitter embed data: ${response.status} ${response.statusText}`
      );
    }
    const embedData = await response.json();
    // The oEmbed payload wraps the tweet text in the first <p> of its HTML.
    const doc = new import_jsdom.JSDOM(embedData.html).window.document;
    const tweetText = doc.querySelector("p")?.textContent?.trim() || "";
    const authorName = embedData.author_name || "Unknown";
    const title = `Tweet by ${authorName}`;
    const content = `# ${title}\n${tweetText}\n---\nAuthor: ${authorName}\nURL: ${url}`;
    return { title, content, type: "twitter" };
  } catch (error) {
    // Real Error instances pass through untouched; wrap anything else.
    if (error instanceof Error) {
      throw error;
    }
    throw new Error(`Failed to parse Twitter URL: ${error}`);
  }
}
/**
 * Fetch a Reddit post (or a single comment) and render it as Markdown.
 *
 * With clientId/clientSecret in options, uses the authenticated OAuth API
 * (oauth.reddit.com); otherwise falls back to the public ".json" endpoint,
 * which may be rate limited or served an HTML block page. When the URL
 * targets a specific comment (".../comment/<id>"), that comment is rendered
 * instead of the post.
 *
 * @param {string} url - A reddit.com post or comment URL.
 * @param {object} [options] - {clientId, clientSecret, includeComments}.
 * @returns {Promise<{title: string, content: string, type: "reddit"}>}
 * @throws {Error} On HTTP failure or when Reddit returns non-JSON content.
 */
async function parseRedditUrl(url, options) {
try {
let apiUrl;
let headers;
if (options?.clientId && options?.clientSecret) {
// Authenticated path: exchange app credentials for a bearer token.
const token = await getRedditToken({
clientId: options.clientId,
clientSecret: options.clientSecret
});
apiUrl = convertToOAuthUrl(url);
headers = {
Authorization: `Bearer ${token}`,
"User-Agent": "web:url-to-json-markdown:v1.0.0 (by /u/paradite)"
};
} else {
// Unauthenticated path: public .json endpoint with browser-like headers
// to reduce the chance of being blocked.
apiUrl = convertToPublicJsonUrl(url);
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
Accept: "application/json, text/plain, */*",
"Accept-Language": "en-US,en;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
"Cache-Control": "no-cache",
Pragma: "no-cache"
};
}
const response = await fetch(apiUrl, { headers });
if (!response.ok) {
throw new Error(
`Failed to fetch Reddit data: ${response.status} ${response.statusText}`
);
}
// The public endpoint can answer 200 with an HTML block/rate-limit page,
// so verify the content type before attempting to parse JSON.
if (!options?.clientId || !options?.clientSecret) {
const contentType = response.headers.get("content-type");
if (!contentType?.includes("application/json")) {
throw new Error(
"Reddit returned HTML instead of JSON. This may be due to rate limiting or CORS restrictions. Consider providing Reddit credentials for more reliable access."
);
}
}
const data = await response.json();
// Assumes Reddit's two-Listing response shape: data[0] holds the post,
// data[1] (when present) holds the comment tree.
const post = data[0].data.children[0].data;
const commentId = extractCommentId(url);
if (commentId && data.length > 1) {
// Comment permalink: locate that comment and render it instead of the post.
const comment = findCommentById(data[1], commentId);
if (comment) {
const commentTitle = extractCommentTitle(comment.body);
return {
title: commentTitle,
content: formatCommentToMarkdown(comment, post.title, options),
type: "reddit"
};
}
}
const comments = data.length > 1 ? data[1].data.children : null;
return {
title: post.title,
content: formatPostToMarkdown(post, comments, options),
type: "reddit"
};
} catch (error) {
// Real Error instances pass through untouched; wrap anything else.
if (error instanceof Error) {
throw error;
}
throw new Error(`Failed to convert URL to JSON/Markdown: ${error}`);
}
}
/**
 * Fetch an arbitrary web page and convert its main content to Markdown.
 * On a 403 response (typical of anti-bot protection) and when
 * options.enableArchiveFallback is set, retries through the Internet
 * Archive Wayback Machine before giving up.
 *
 * @param {string} url - The page URL to fetch.
 * @param {object} [options]
 * @param {boolean} [options.enableArchiveFallback] - Retry via archive.org on 403.
 * @returns {Promise<{title: string, content: string, type: "generic"}>}
 * @throws {Error} When the fetch fails and no fallback succeeds.
 */
async function parseGenericUrl(url, options) {
  try {
    const response = await fetch(url, {
      headers: {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache"
      }
    });
    if (!response.ok) {
      throw new Error(
        `Failed to fetch URL: ${response.status} ${response.statusText}`
      );
    }
    const html = await response.text();
    return processHtmlContent(html, url);
  } catch (error) {
    const lastError = error instanceof Error ? error : new Error(String(error));
    if (lastError.message.includes("403") && options?.enableArchiveFallback) {
      try {
        // Diagnostic goes to stderr (was console.log): the CLI reserves
        // stdout for the JSON result, so logging here would corrupt
        // machine-readable output when piped.
        console.error("Attempting archive.org fallback...");
        const archiveUrl = `https://web.archive.org/web/2/${url}`;
        const response = await fetch(archiveUrl, {
          headers: {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
          }
        });
        if (response.ok) {
          const html = await response.text();
          const result = processHtmlContent(html, url);
          result.title = `[Archived] ${result.title}`;
          result.content = result.content + `
---
*Retrieved from [Internet Archive Wayback Machine](${archiveUrl})*`;
          return result;
        }
      } catch (archiveError) {
        // Best-effort fallback: swallow the archive error and report the
        // original failure below instead.
      }
    }
    if (lastError.message.includes("403")) {
      const errorMessage = `Failed to fetch URL: 403 Forbidden. The website "${new URL(url).hostname}" is blocking automated requests. This is common for sites with anti-bot protection (like Cloudflare). Try accessing the URL manually in a browser first, or the site may require authentication.`;
      if (!options?.enableArchiveFallback) {
        throw new Error(errorMessage + ` You can also try enabling the archive fallback option.`);
      } else {
        throw new Error(errorMessage + ` Archive fallback was attempted but also failed.`);
      }
    }
    throw lastError;
  }
}
/**
 * Extract the main readable content from raw HTML and convert it to Markdown.
 *
 * @param {string} html - Raw HTML of the page.
 * @param {string} _url - Original URL (currently unused).
 * @returns {{title: string, content: string, type: "generic"}}
 * @throws {Error} If parsing or conversion fails.
 */
function processHtmlContent(html, _url) {
  try {
    const { document } = new import_jsdom.JSDOM(html).window;
    // Prefer <title>, then the first <h1>, then a placeholder.
    const title =
      document.querySelector("title")?.textContent?.trim() ||
      document.querySelector("h1")?.textContent?.trim() ||
      "Untitled";
    // Strip scripts, styles, and boilerplate chrome before conversion.
    const boilerplate = document.querySelectorAll(
      "script, style, nav, footer, header, aside"
    );
    for (const element of boilerplate) {
      element.remove();
    }
    // Probe likely content containers, most specific first; the first one
    // holding non-blank text wins.
    const candidateSelectors = [
      "article",
      "main",
      '[role="main"]',
      ".content",
      ".post-content",
      ".entry-content",
      ".article-content",
      "body"
    ];
    let contentElement = null;
    for (const selector of candidateSelectors) {
      const candidate = document.querySelector(selector);
      if (candidate && candidate.textContent?.trim()) {
        contentElement = candidate;
        break;
      }
    }
    if (!contentElement) {
      contentElement = document.body;
    }
    const turndownService = new import_turndown.default({
      headingStyle: "atx",
      bulletListMarker: "-",
      codeBlockStyle: "fenced"
    });
    return {
      title,
      content: turndownService.turndown(contentElement.innerHTML),
      type: "generic"
    };
  } catch (error) {
    if (error instanceof Error) {
      throw error;
    }
    throw new Error(`Failed to parse generic URL: ${error}`);
  }
}
/**
 * Map a public reddit.com URL onto its oauth.reddit.com JSON endpoint:
 * trailing slash stripped, ".json" appended when missing.
 *
 * @param {string} url - A reddit.com post/comment URL.
 * @returns {string} The oauth.reddit.com API URL.
 */
function convertToOAuthUrl(url) {
  const { pathname } = new URL(url);
  const trimmed = pathname.endsWith("/") ? pathname.slice(0, -1) : pathname;
  const suffix = trimmed.endsWith(".json") ? "" : ".json";
  return `https://oauth.reddit.com${trimmed}${suffix}`;
}
/**
 * Map a Reddit URL onto the public www.reddit.com ".json" endpoint:
 * trailing slash stripped, ".json" appended when missing.
 *
 * @param {string} url - A reddit.com post/comment URL.
 * @returns {string} The public JSON API URL.
 */
function convertToPublicJsonUrl(url) {
  const { pathname } = new URL(url);
  const trimmed = pathname.endsWith("/") ? pathname.slice(0, -1) : pathname;
  const suffix = trimmed.endsWith(".json") ? "" : ".json";
  return `https://www.reddit.com${trimmed}${suffix}`;
}
/**
 * Check whether a URL points at Reddit.
 * Generalized to accept reddit.com and ANY subdomain (www, old, np, sh, ...),
 * so links copied from old.reddit.com etc. are routed to the Reddit parser
 * too. Backward compatible: www.reddit.com and reddit.com still match.
 *
 * @param {string} url - Candidate URL (may be malformed).
 * @returns {boolean} True when the hostname is reddit.com or *.reddit.com.
 */
function isRedditUrl(url) {
  try {
    const { hostname } = new URL(url);
    // endsWith(".reddit.com") requires a literal dot, so lookalike domains
    // such as "fakereddit.com" do not match.
    return hostname === "reddit.com" || hostname.endsWith(".reddit.com");
  } catch {
    // new URL throws on malformed input; treat that as "not Reddit".
    return false;
  }
}
/**
 * Check whether a URL points at Twitter/X.
 * Generalized to accept twitter.com, x.com, and ANY subdomain of either
 * (www, mobile, ...), so mobile.twitter.com links are recognized too.
 * Backward compatible with the four hostnames previously accepted.
 *
 * @param {string} url - Candidate URL (may be malformed).
 * @returns {boolean} True for twitter.com / x.com and their subdomains.
 */
function isTwitterUrl(url) {
  try {
    const { hostname } = new URL(url);
    return (
      hostname === "x.com" ||
      hostname === "twitter.com" ||
      hostname.endsWith(".x.com") ||
      hostname.endsWith(".twitter.com")
    );
  } catch {
    // new URL throws on malformed input; treat that as "not Twitter".
    return false;
  }
}
/**
 * Pull the comment id out of a Reddit ".../comment/<id>/" URL segment.
 *
 * @param {string} url - A Reddit URL.
 * @returns {string|null} The alphanumeric comment id, or null when absent.
 */
function extractCommentId(url) {
  const match = /\/comment\/([a-zA-Z0-9]+)\/?/.exec(url);
  return match === null ? null : match[1];
}
/**
 * Replace typographic punctuation with plain ASCII equivalents:
 * curly single/double quotes -> straight quotes, en/em dashes -> "-",
 * horizontal ellipsis -> "...".
 *
 * @param {string} text - Input text.
 * @returns {string} The normalized text.
 */
function normalizeQuotes(text) {
  const substitutions = [
    [/[\u2018\u2019]/g, "'"],
    [/[\u201C\u201D]/g, '"'],
    [/[\u2013\u2014]/g, "-"],
    [/\u2026/g, "..."]
  ];
  let result = text;
  for (const [pattern, ascii] of substitutions) {
    result = result.replace(pattern, ascii);
  }
  return result;
}
/**
 * Derive a title from a comment body: the normalized first line,
 * truncated to 200 characters (197 + "..."). Falls back to
 * "Untitled Comment" for empty or too-short (< 3 chars) first lines.
 *
 * @param {string} commentBody - Raw comment text (may be empty/undefined).
 * @returns {string} A human-readable title.
 */
function extractCommentTitle(commentBody) {
  if (!commentBody) {
    return "Untitled Comment";
  }
  const [rawFirstLine] = commentBody.split("\n");
  const firstLine = normalizeQuotes(rawFirstLine.trim());
  if (firstLine.length < 3) {
    return "Untitled Comment";
  }
  return firstLine.length > 200 ? `${firstLine.substring(0, 197)}...` : firstLine;
}
/**
 * Depth-first search of a Reddit comment Listing for the comment ("t1"
 * node) with the given id.
 *
 * @param {object} commentsListing - A Reddit Listing ({data:{children:[...]}}).
 * @param {string} commentId - The short comment id to find.
 * @returns {object|null} The matching comment's data object, or null.
 */
function findCommentById(commentsListing, commentId) {
  const children = commentsListing?.data?.children;
  if (!children) {
    return null;
  }
  for (const child of children) {
    if (child.kind === "t1" && child.data.id === commentId) {
      return child.data;
    }
    const replies = child.data.replies;
    // Reddit encodes "no replies" as ""; recurse only into object replies.
    if (replies && typeof replies === "object") {
      const found = findCommentById(replies, commentId);
      if (found) {
        return found;
      }
    }
  }
  return null;
}
/**
 * Render a single Reddit comment (optionally with its reply tree) as Markdown.
 *
 * @param {object} comment - Reddit comment data (author, body, permalink, ...).
 * @param {string} [postTitle] - Parent post title, shown in the heading.
 * @param {object} [options] - includeComments toggles rendering replies.
 * @returns {string} Markdown text.
 */
function formatCommentToMarkdown(comment, postTitle, options) {
  const titleSuffix = postTitle ? ` on "${normalizeQuotes(postTitle)}"` : "";
  const parts = [`# Comment by ${comment.author}${titleSuffix}\n`];
  if (comment.body) {
    parts.push(`${normalizeQuotes(comment.body)}\n`);
  }
  parts.push(`[permalink](https://reddit.com${comment.permalink})\n`);
  // created_utc is in seconds; rendered in the local timezone.
  const createdDate = new Date(comment.created_utc * 1e3).toLocaleString("en-US", {
    year: "numeric",
    month: "2-digit",
    day: "2-digit",
    hour: "2-digit",
    minute: "2-digit",
    second: "2-digit"
  });
  parts.push(`by *${comment.author}* (\u2191 ${comment.ups}/ \u2193 ${comment.downs}) ${createdDate}`);
  let markdown = parts.join("");
  // Replies are rendered only on request; Reddit encodes "no replies" as "".
  const replyChildren = options?.includeComments && comment.replies && typeof comment.replies === "object" ? comment.replies.data?.children : null;
  if (replyChildren) {
    markdown += "\n\n## Replies\n\n";
    for (const reply of replyChildren) {
      markdown += formatCommentTree(reply);
    }
  }
  return markdown;
}
/**
 * Recursively render one comment subtree as Markdown lines.
 * Top-level comments get a "#####" heading; nested ones get a box-drawing
 * prefix whose dash count reflects their depth.
 *
 * @param {object} comment - A Reddit Listing child ({kind, data}).
 * @returns {string} Markdown for this comment and all of its replies.
 */
function formatCommentTree(comment) {
  if (!comment.data) {
    return "";
  }
  const data = comment.data;
  const depth = data.depth || 0;
  const depthIndicator = depth > 0 ? `\u251C${"\u2500".repeat(depth)} ` : "##### ";
  let output;
  if (data.body) {
    output = `${depthIndicator}${normalizeQuotes(data.body)} \u23E4 by *${data.author}* (\u2191 ${data.ups}/ \u2193 ${data.downs})\n`;
  } else {
    // Deleted/removed comments have no body.
    output = `${depthIndicator}deleted\n`;
  }
  if (data.replies && typeof data.replies === "object" && data.replies.data?.children) {
    for (const reply of data.replies.data.children) {
      output += formatCommentTree(reply);
    }
  }
  // Close out each top-level thread that had replies with a footer rule.
  if (depth === 0 && data.replies) {
    output += "\u2514\u2500\u2500\u2500\u2500\n\n";
  }
  return output;
}
/**
 * Render a Reddit post (and optionally its comment trees) as Markdown.
 *
 * @param {object} post - Reddit post data (title, selftext, permalink, ...).
 * @param {Array|null} comments - Listing children for top-level comments.
 * @param {object} [options] - includeComments toggles the comments section.
 * @returns {string} Markdown text.
 */
function formatPostToMarkdown(post, comments, options) {
  const sections = [`# ${normalizeQuotes(post.title)}\n`];
  if (post.selftext) {
    sections.push(`${normalizeQuotes(post.selftext)}\n`);
  }
  sections.push(`[permalink](https://reddit.com${post.permalink})\n`);
  // created_utc is in seconds; rendered in the local timezone.
  const createdDate = new Date(post.created_utc * 1e3).toLocaleString("en-US", {
    year: "numeric",
    month: "2-digit",
    day: "2-digit",
    hour: "2-digit",
    minute: "2-digit",
    second: "2-digit"
  });
  sections.push(`by *${post.author}* (\u2191 ${post.ups}/ \u2193 ${post.downs}) ${createdDate}`);
  let markdown = sections.join("");
  if (options?.includeComments && comments && comments.length > 0) {
    markdown += "\n\n## Comments\n\n";
    for (const comment of comments) {
      markdown += formatCommentTree(comment);
    }
  }
  return markdown;
}
// src/cli.ts
/**
 * Print CLI usage via the supplied logger.
 * Called with console.log for explicit --help (stdout, exit 0) and with
 * console.error for misuse (stderr, exit 1), replacing the two previously
 * duplicated copies of this text.
 *
 * @param {(line: string) => void} log - console.log or console.error.
 */
function printUsage(log) {
  log("Usage: url-to-json-markdown <url> [options]");
  log("");
  log("Options:");
  log(" --client-id <id> Reddit client ID (for authenticated requests)");
  log(" --client-secret <secret> Reddit client secret (for authenticated requests)");
  log(" --include-comments Include comments in the output");
  log(" --help Show this help message");
  log("");
  log("Examples:");
  log(" url-to-json-markdown https://www.reddit.com/r/programming/comments/123/title/");
  log(" url-to-json-markdown https://example.com/article --include-comments");
  log(" url-to-json-markdown https://reddit.com/r/test/comments/123/ --client-id abc --client-secret xyz");
}

/**
 * CLI entry point: parse process.argv, run the conversion, and print the
 * result as pretty JSON on stdout. Diagnostics and errors go to stderr.
 *
 * Exit codes: 0 on success and for explicit --help (previously --help as
 * the first argument incorrectly exited 1 via the no-args error path);
 * 1 on usage errors or conversion failure.
 */
async function main() {
  const args = process.argv.slice(2);
  // Explicit --help anywhere on the command line: usage to stdout, exit 0.
  if (args.includes("--help")) {
    printUsage(console.log);
    process.exit(0);
  }
  // No arguments at all is a usage error: usage to stderr, exit 1.
  if (args.length === 0) {
    printUsage(console.error);
    process.exit(1);
  }
  const url = args[0];
  const options = {};
  for (let i = 1; i < args.length; i++) {
    const arg = args[i];
    if (arg === "--client-id") {
      if (i + 1 >= args.length) {
        console.error("Error: --client-id requires a value");
        process.exit(1);
      }
      options.clientId = args[++i];
    } else if (arg === "--client-secret") {
      if (i + 1 >= args.length) {
        console.error("Error: --client-secret requires a value");
        process.exit(1);
      }
      options.clientSecret = args[++i];
    } else if (arg === "--include-comments") {
      options.includeComments = true;
    } else {
      console.error(`Error: Unknown option ${arg}`);
      process.exit(1);
    }
  }
  try {
    const result = await urlToJsonMarkdown(url, options);
    // The JSON result is the ONLY thing written to stdout.
    console.log(JSON.stringify(result, null, 2));
  } catch (error) {
    console.error("Error:", error instanceof Error ? error.message : String(error));
    process.exit(1);
  }
}
// Bootstrap: run the CLI. Any rejection escaping main() is reported on
// stderr and mapped to a non-zero exit code.
main().catch((error) => {
console.error("Unexpected error:", error instanceof Error ? error.message : String(error));
process.exit(1);
});