rsshub
Version:
Make RSS Great Again!
124 lines (108 loc) • 4.36 kB
text/typescript
import cache from '@/utils/cache';
import { load } from 'cheerio';
import { parseDate } from '@/utils/parse-date';
import { getCookies, setCookies } from '@/utils/puppeteer-utils';
import logger from '@/utils/logger';
// Module-level memo of the cookie value resolved by getCookie(); stays undefined until the first successful lookup.
let cookie;
import ofetch from '@/utils/ofetch';
// Origin of the target site; every path produced by pathMap is appended to it.
const baseUrl = 'https://www.cw.com.tw';

/**
 * Listing pages supported by this route, keyed by route type.
 *
 * Each entry provides:
 * - `pageUrl(channel)`: builds the site-relative path of the listing page.
 *   The channel is percent-encoded so that characters such as `&`, `/`, `?`
 *   or `#` in the route parameter cannot alter the path or query string.
 * - `limit`: number of entries taken from the listing when the request does
 *   not specify its own limit.
 */
const pathMap = {
    today: {
        pageUrl: () => '/today',
        limit: 30,
    },
    master: {
        pageUrl: (channel) => `/masterChannel.action?idMasterChannel=${encodeURIComponent(channel)}`,
        limit: 12,
    },
    sub: {
        pageUrl: (channel) => `/subchannel.action?idSubChannel=${encodeURIComponent(channel)}`,
        limit: 12,
    },
    author: {
        pageUrl: (channel) => `/author/${encodeURIComponent(channel)}`,
        limit: 10,
    },
};
/**
 * Resolves the cookie value used for requests to the site, memoised in the
 * module-level `cookie` variable.
 *
 * On the first call (or while the memo is still falsy) the value is obtained
 * through `tryGet('cw:cookie', …)`; the producer opens a browser page on
 * `${baseUrl}/user/get/cookie-bar` and reads its cookies with `getCookies`.
 *
 * @param {object} browser - Browser handle exposing `newPage()` (presumably a Puppeteer Browser — confirm against callers).
 * @param {Function} tryGet - `(key, producer) => Promise<value>`; presumably the cache's get-or-compute helper.
 * @returns {Promise<*>} Whatever `tryGet` resolves to for the `cw:cookie` key.
 */
const getCookie = async (browser, tryGet) => {
    if (!cookie) {
        cookie = await tryGet('cw:cookie', async () => {
            const page = await browser.newPage();
            try {
                await page.setRequestInterception(true);
                // Only documents and scripts are let through; every other resource type is aborted.
                page.on('request', (request) => {
                    const resourceType = request.resourceType();
                    if (resourceType === 'document' || resourceType === 'script') {
                        request.continue();
                    } else {
                        request.abort();
                    }
                });
                logger.http(`Requesting ${baseUrl}/user/get/cookie-bar`);
                await page.goto(`${baseUrl}/user/get/cookie-bar`, {
                    waitUntil: 'domcontentloaded',
                });
                return await getCookies(page);
            } finally {
                // Close the page even when navigation or cookie extraction fails, so it is not leaked.
                await page.close();
            }
        });
    }
    return cookie;
};
/**
 * Renders the listing page for a route type in a browser page, extracts its
 * entries and resolves the full article for each of them.
 *
 * The number of entries defaults to `pathMap[path].limit` and can be
 * overridden with a numeric `limit` query parameter; a non-numeric value
 * falls back to the default instead of producing an empty list.
 *
 * @param {string} path - Key of `pathMap` selecting the listing type.
 * @param {object} browser - Browser handle exposing `newPage()` (presumably a Puppeteer Browser — confirm against callers).
 * @param {object} ctx - Request context exposing `req.param(name)` and `req.query(name)`.
 * @returns {Promise<{$: Function, items: Array<object>}>} The loaded listing document and the resolved items.
 */
const parsePage = async (path, browser, ctx) => {
    const route = pathMap[path];
    const pageUrl = `${baseUrl}${route.pageUrl(ctx.req.param('channel'))}`;
    const sessionCookie = await getCookie(browser, cache.tryGet);

    const page = await browser.newPage();
    let html;
    try {
        await page.setRequestInterception(true);
        // Only documents and scripts are let through; every other resource type is aborted.
        page.on('request', (request) => {
            const resourceType = request.resourceType();
            if (resourceType === 'document' || resourceType === 'script') {
                request.continue();
            } else {
                request.abort();
            }
        });
        await setCookies(page, sessionCookie, 'cw.com.tw');
        logger.http(`Requesting ${pageUrl}`);
        await page.goto(pageUrl, {
            waitUntil: 'domcontentloaded',
        });
        await page.waitForSelector('.caption');
        html = await page.evaluate(() => document.documentElement.innerHTML);
    } finally {
        // Close the page even when navigation or the selector wait fails, so it is not leaked.
        await page.close();
    }

    const $ = load(html);
    const rawLimit = ctx.req.query('limit');
    const requestedLimit = rawLimit ? Number(rawLimit) : Number.NaN;
    // slice(0, NaN) would silently yield an empty feed, so an unparsable limit uses the route default.
    const limit = Number.isNaN(requestedLimit) ? route.limit : requestedLimit;
    const list = parseList($, limit);
    const items = await parseItems(list, browser, cache.tryGet);
    return { $, items };
};
/**
 * Extracts up to `limit` entries from the `.caption` elements of a loaded
 * listing document.
 *
 * @param {Function} $ - Document query function returned by `load` (cheerio-style API).
 * @param {number} limit - Maximum number of entries to return, taken from the start of the listing.
 * @returns {Array<{title: string, link: (string|undefined), pubDate: *}>} One entry per kept caption,
 *   with the `h3` text, the `h3 a` href and the parsed `time` text.
 */
const parseList = ($, limit) =>
    $('.caption')
        .toArray()
        // Trim before mapping so captions beyond the limit are never wrapped or date-parsed.
        .slice(0, limit)
        .map((element) => {
            const caption = $(element);
            return {
                title: caption.find('h3').text(),
                link: caption.find('h3 a').attr('href'),
                pubDate: parseDate(caption.find('time').text()),
            };
        });
/**
 * Resolves the full article for every listing entry, in parallel.
 *
 * Each entry is resolved through `tryGet(item.link, …)`; the producer fetches
 * the article with `ofetch`, sending the site cookie and the browser's user
 * agent, then fills in title, category, pubDate, author and description on the
 * entry itself (the entry object is mutated and returned).
 *
 * @param {Array<object>} list - Entries carrying at least a `link`.
 * @param {object} browser - Browser handle exposing `userAgent()` and `newPage()` (presumably a Puppeteer Browser — confirm against callers).
 * @param {Function} tryGet - `(key, producer) => Promise<value>`; presumably the cache's get-or-compute helper.
 * @returns {Promise<Array<object>>} The values `tryGet` resolves to, in listing order.
 *   Rejects if any article fails to load or its JSON-LD metadata cannot be parsed.
 */
const parseItems = (list, browser, tryGet) =>
    Promise.all(
        list.map((item) =>
            tryGet(item.link, async () => {
                const response = await ofetch(item.link, {
                    headers: {
                        Cookie: await getCookie(browser, tryGet),
                        // Awaited so a promise-returning userAgent() is sent as its string value, not as a stringified Promise.
                        'User-Agent': await browser.userAgent(),
                    },
                });
                const $ = load(response);
                const meta = JSON.parse($('head script[type="application/ld+json"]').eq(0).text());

                // Drop page chrome that should not end up in the feed description.
                $('.article__head .breadcrumb, .article__head h1, .article__provideViews, .ad').remove();
                // Lazy-loaded images keep their real URL in data-src; promote it to src.
                $('img.lazyload').each((_, img) => {
                    if (img.attribs['data-src']) {
                        img.attribs.src = img.attribs['data-src'];
                        delete img.attribs['data-src'];
                    }
                });

                item.title = $('head title').text();

                const keywords = $('meta[name=keywords]').attr('content');
                item.category = keywords ? keywords.split(',') : [];

                item.pubDate = parseDate(meta.datePublished);

                // Replace every comma (not only the first); fall back to the publisher when no author name is present.
                const authorName = meta.author?.name?.replaceAll(',', ' ');
                item.author = authorName || meta.publisher?.name;

                // .html() yields null when the selector matches nothing; avoid emitting the literal text "null".
                const headHtml = $('.article__head .container').html() ?? '';
                const bodyHtml = $('.article__content').html() ?? '';
                item.description = `${headHtml}${bodyHtml}`;

                return item;
            })
        )
    );
export { baseUrl, pathMap, getCookie, parsePage, parseList, parseItems };
export { setCookies } from '@/utils/puppeteer-utils';