rsshub
Version:
Make RSS Great Again!
130 lines (122 loc) • 7.19 kB
JavaScript
import "./esm-shims-CzJ_djXG.mjs";
import { t as config } from "./config-C37vj7VH.mjs";
import "./dist-BInvbO1W.mjs";
import "./logger-Czu8UMNd.mjs";
import "./ofetch-BIyrKU3Y.mjs";
import { t as cache_default } from "./cache-Bo__VnGm.mjs";
import "./helpers-DxBp0Pty.mjs";
import { t as got_default } from "./got-KxxWdaxq.mjs";
import { t as config_not_found_default } from "./config-not-found-Dyp3RlZZ.mjs";
import { load } from "cheerio";
import sanitizeHtml from "sanitize-html";
//#region lib/routes/rsshub/transform/html.ts
/** Upper bound on how many matched elements are converted into feed items. */
const MAX_TRANSFORM_ITEMS = 20;

/**
 * Links that start with an explicit http(s) scheme are used verbatim.
 * A bare `startsWith("http")` check is not enough: a relative href such as
 * `https-guide.html` would be mistaken for an absolute URL and left unresolved.
 */
const ABSOLUTE_HTTP_LINK = /^https?:\/\//;

/**
 * Returns the elements a rule targets: the descendants of `item` matching the
 * selector stored under `selectorKey`, or `item` itself when that option is unset.
 */
const selectTarget = (item, routeParams, selectorKey) => {
    const selector = routeParams.get(selectorKey);
    return selector ? item.find(selector) : item;
};

/**
 * Reads the attribute named by the `attrKey` option from `target`, or calls
 * `fallback` when that option is unset.
 */
const readTarget = (target, routeParams, attrKey, fallback) => {
    const attrName = routeParams.get(attrKey);
    return attrName ? target.attr(attrName) : fallback();
};

/**
 * Trims `rawLink` and resolves it against `baseUrl`, unless it is empty or
 * already an absolute http(s) URL.
 */
const resolveLink = (rawLink, baseUrl) => {
    const link = rawLink.trim();
    if (!link || ABSOLUTE_HTTP_LINK.test(link)) {
        return link;
    }
    return new URL(link, baseUrl).href;
};

/**
 * Builds one feed item from a matched element.
 *
 * @param $ document-bound selector function returned by `load`
 * @param element one entry of the matched elements array
 * @param {URLSearchParams} routeParams parsed transformation rules
 * @param {string} baseUrl page URL used to resolve relative links
 * @returns `{ title, link, description, pubDate }`, or `null` when no link
 *          value was found or any extraction step threw
 */
const extractItem = ($, element, routeParams, baseUrl) => {
    try {
        const item = $(element);

        const titleEle = selectTarget(item, routeParams, "itemTitle");
        const title = readTarget(titleEle, routeParams, "itemTitleAttr", () => titleEle.text());

        const linkEle = selectTarget(item, routeParams, "itemLink");
        const rawLink = readTarget(linkEle, routeParams, "itemLinkAttr", () => (linkEle.is("a") ? linkEle.attr("href") : linkEle.find("a").attr("href")));
        // Elements without any link value are skipped. This used to happen
        // implicitly through a TypeError on `undefined.trim()`.
        if (typeof rawLink !== "string") {
            return null;
        }
        const link = resolveLink(rawLink, baseUrl);

        const descEle = selectTarget(item, routeParams, "itemDesc");
        const description = readTarget(descEle, routeParams, "itemDescAttr", () => descEle.html());

        const pubDateEle = selectTarget(item, routeParams, "itemPubDate");
        const pubDate = readTarget(pubDateEle, routeParams, "itemPubDateAttr", () => pubDateEle.html());

        return {
            title,
            link,
            description,
            pubDate
        };
    } catch {
        // Best effort: a failing rule (for example a selector that throws or a
        // link that cannot be resolved) drops this element only, not the feed.
        return null;
    }
};

const route = {
    path: "/transform/html/:url/:routeParams",
    categories: ["other"],
    example: "/rsshub/transform/html/https%3A%2F%2Fwechat2rss.xlab.app%2Fposts%2Flist%2F/item=div%5Bclass%3D%27post%2Dcontent%27%5D%20p%20a",
    parameters: {
        url: "`encodeURIComponent`ed URL address",
        routeParams: "Transformation rules, requires URL encode"
    },
    features: {
        requireConfig: [{
            name: "ALLOW_USER_SUPPLY_UNSAFE_DOMAIN",
            description: ""
        }],
        requirePuppeteer: false,
        antiCrawler: false,
        supportBT: false,
        supportPodcast: false,
        supportScihub: false
    },
    name: "Transformation - HTML",
    maintainers: ["ttttmr", "hyoban"],
    description: `Pass URL and transformation rules to convert HTML/JSON into RSS.
Specify options (in the format of query string) in parameter \`routeParams\` parameter to extract data from HTML.
| Key | Meaning | Accepted Values | Default |
| ------------------- | ------------------------------------------------------------------------------------------------------------- | --------------- | ------------------------ |
| \`title\` | The title of the RSS | \`string\` | Extract from \`<title>\` |
| \`item\` | The HTML elements as \`item\` using CSS selector | \`string\` | html |
| \`itemTitle\` | The HTML elements as \`title\` in \`item\` using CSS selector | \`string\` | \`item\` element |
| \`itemTitleAttr\` | The attributes of \`title\` element as title | \`string\` | Element text |
| \`itemLink\` | The HTML elements as \`link\` in \`item\` using CSS selector | \`string\` | \`item\` element |
| \`itemLinkAttr\` | The attributes of \`link\` element as link | \`string\` | \`href\` |
| \`itemDesc\` | The HTML elements as \`descrption\` in \`item\` using CSS selector | \`string\` | \`item\` element |
| \`itemDescAttr\` | The attributes of \`descrption\` element as description | \`string\` | Element html |
| \`itemPubDate\` | The HTML elements as \`pubDate\` in \`item\` using CSS selector | \`string\` | \`item\` element |
| \`itemPubDateAttr\` | The attributes of \`pubDate\` element as pubDate | \`string\` | Element html |
| \`itemContent\` | The HTML elements as \`description\` in \`item\` using CSS selector ( in \`itemLink\` page for full content ) | \`string\` | |
| \`encoding\` | The encoding of the HTML content | \`string\` | utf-8 |
Parameters parsing in the above example:
| Parameter | Value |
| ------------- | ----------------------------------------- |
| \`url\` | \`https://wechat2rss.xlab.app/posts/list/\` |
| \`routeParams\` | \`item=div[class='post-content'] p a\` |
Parsing of \`routeParams\` parameter:
| Parameter | Value |
| --------- | ------------------------------- |
| \`item\` | \`div[class='post-content'] p a\` |`,
    /**
     * Fetches the page named by the `url` route parameter and converts it into
     * a feed according to the query-string rules in `routeParams`.
     *
     * @param ctx request context; `ctx.req.param(name)` supplies `url` and `routeParams`
     * @returns `{ title, link, description, item }` describing the feed
     * @throws the config-not-found error when `allow_user_supply_unsafe_domain` is off
     */
    handler: async (ctx) => {
        if (!config.feature.allow_user_supply_unsafe_domain) {
            throw new config_not_found_default(`This RSS is disabled unless 'ALLOW_USER_SUPPLY_UNSAFE_DOMAIN' is set to 'true'.`);
        }

        const url = ctx.req.param("url");
        const response = await got_default({
            method: "get",
            url,
            responseType: "arrayBuffer"
        });

        const routeParams = new URLSearchParams(ctx.req.param("routeParams"));
        // The body is requested as raw bytes so it can be decoded with the
        // caller-selected encoding; the same decoder is reused for item pages.
        const decoder = new TextDecoder(routeParams.get("encoding") || "utf-8");
        const $ = load(decoder.decode(response.data));
        const rssTitle = routeParams.get("title") || $("title").text();

        let items = $(routeParams.get("item") || "html")
            .toArray()
            .slice(0, MAX_TRANSFORM_ITEMS)
            .map((element) => extractItem($, element, routeParams, url))
            .filter(Boolean);

        const itemContentSelector = routeParams.get("itemContent");
        if (itemContentSelector) {
            // Replace each description with sanitized content taken from the
            // item's own page; results are cached per link + selector.
            items = await Promise.all(items.map((item) => {
                if (!item.link) {
                    return item;
                }
                return cache_default.tryGet(`transform:${item.link}:${itemContentSelector}`, async () => {
                    const itemResponse = await got_default({
                        method: "get",
                        url: item.link,
                        responseType: "arrayBuffer"
                    });
                    if (!itemResponse || typeof itemResponse === "string") {
                        return item;
                    }
                    const content = load(decoder.decode(itemResponse.data))(itemContentSelector).html();
                    if (!content) {
                        return item;
                    }
                    item.description = sanitizeHtml(content, { allowedTags: [...sanitizeHtml.defaults.allowedTags, "img"] });
                    return item;
                });
            }));
        }

        return {
            title: rssTitle,
            link: url,
            description: `Proxy ${url}`,
            item: items
        };
    }
};
//#endregion
export { route };