UNPKG

kasha

Version:

Pre-render your Single-Page Application.

585 lines (453 loc) 16.1 kB
const { PassThrough, Transform } = require('stream') const { XmlEntities } = require('html-entities') const fetch = require('node-fetch') const URLRewriter = require('url-rewrite') const mongo = require('../lib/mongo') const RESTError = require('../lib/RESTError') const config = require('../lib/config') const logger = require('../lib/logger') const PAGE_LIMIT = 50000 const GOOGLE_LIMIT = 1000 const entities = new XmlEntities() function checkLimitParam(limit, max) { if (limit <= 0 || limit > max) { throw new RESTError('INVALID_PARAM', 'limit') } } function checkPageParam(page) { if (page <= 0) { throw new RESTError('INVALID_PARAM', 'page') } } const standardSitemapStream = { header: '<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">', transform(doc, encoding, cb) { cb(null, `<url>${standardTags(doc)}</url>`) } } const googleSitemapStream = { header: '<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:news="http://www.google.com/schemas/sitemap-news/0.9" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1" xmlns:video="http://www.google.com/schemas/sitemap-video/1.1">', transform(doc, encoding, cb) { this.push('<url>') this.push(standardTags(doc)) if (doc.news) { this.push(googleNewsTags(doc.news)) } if (doc.image) { doc.image.forEach(img => this.push(googleImageTags(img))) } if (doc.video) { doc.video.forEach(video => this.push(googleVideoTags(video))) } this.push('</url>') cb() } } const googleNewsSitemapStream = { header: '<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">', transform(doc, encoding, cb) { cb(null, `<url>${standardTags(doc)}${googleNewsTags(doc.news)}</url>`) } } const googleImageSitemapStream = { header: '<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">', transform(doc, encoding, cb) { this.push('<url>') this.push(standardTags(doc)) doc.image.forEach(img => this.push(googleImageTags(img))) this.push('</url>') cb() } } const googleVideoSitemapStream = { header: '<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:video="http://www.google.com/schemas/sitemap-video/1.1">', transform(doc, encoding, cb) { this.push('<url>') this.push(standardTags(doc)) doc.video.forEach(video => this.push(googleVideoTags(video))) this.push('</url>') cb() } } function standardTags(page) { let tags = `<loc>${entities.encode(page.site + page.path)}</loc>` if (page.lastmod) { tags += `<lastmod>${page.lastmod}</lastmod>` } if (page.changefreq) { tags += `<changefreq>${page.changefreq}</changefreq>` } if (page.priority) { tags += `<priority>${page.priority}</priority>` } return tags } function googleNewsTags(news) { let tags = '<news:news>' tags += '<news:publication>' tags += `<news:name>${entities.encode(news.publication.name)}</news:name>` tags += `<news:language>${news.publication.language}</news:language>` tags += '</news:publication>' tags += `<news:publication_date>${news.publication_date.toISOString()}</news:publication_date>` tags += `<news:title>${entities.encode(news.title)}</news:title>` tags += '</news:news>' return tags } function googleImageTags(image) { let tags = '<image:image>' tags += `<image:loc>${entities.encode(image.loc)}</image:loc>` if (image.caption) { tags += `<image:caption>${entities.encode(image.caption)}</image:caption>` } if (image.geo_location) { tags += `<image:geo_location>${entities.encode(image.geo_location)}</image:geo_location>` } if (image.title) { tags += `<image:title>${entities.encode(image.title)}</image:title>` } if (image.license) { tags += `<image:license>${entities.encode(image.license)}</image:license>` } tags += '</image:image>' return tags } function googleVideoTags(video) { let tags = '<video:video>' tags += `<video:thumbnail_loc>${entities.encode(video.thumbnail_loc)}</video:thumbnail_loc>` tags += `<video:title>${entities.encode(video.title)}</video:title>` tags += `<video:description>${entities.encode(video.description)}</video:description>` if (video.content_loc) { tags += `<video:content_loc>${entities.encode(video.content_loc)}</video:content_loc>` } if (video.player_loc) { tags += `<video:player_loc${video.player_loc.allow_embed ? ` allow_embed="${video.player_loc.allow_embed}"` : ''}>${entities.encode(video.player_loc._)}</video:player_loc>` } if (video.duration) { tags += `<video:duration>${video.duration}</video:duration>` } if (video.expiration_date) { tags += `<video:expiration_date>${video.expiration_date}</video:expiration_date>` } if (video.rating) { tags += `<video:rating>${video.rating}</video:rating>` } if (video.view_count) { tags += `<video:view_count>${video.view_count}</video:view_count>` } if (video.publication_date) { tags += `<video:publication_date>${video.publication_date}</video:publication_date>` } if (video.family_friendly) { tags += `<video:family_friendly>${video.family_friendly}</video:family_friendly>` } if (video.restriction) { tags += `<video:restriction relationship="${video.restriction.relationship}">${video.restriction._}</video:restriction>` } if (video.platform) { tags += `<video:platform relationship="${video.platform.relationship}">${video.platform._}</video:platform>` } if (video.price) { tags += `<video:price currency="${video.price.currency}"${video.price.type ? ` type="${video.price.type}"` : ''}${video.price.resolution ? ` resolution="${video.price.resolution}"` : ''}>${video.price._}</video:price>` } if (video.requires_subscription) { tags += `<video:requires_subscription>${video.requires_subscription}</video:requires_subscription>` } if (video.uploader) { tags += `<video:uploader${video.uploader.info ? ` info="${entities.encode(video.uploader.info)}"` : ''}>${entities.encode(video.uploader._)}</video:uploader>` } if (video.live) { tags += `<video:live>${video.live}</video:live>` } if (video.tag) { tags += video.tag.map(t => `<video:tag>${entities.encode(t)}</video:tag>`).join('') } if (video.category) { tags += `<video:category>${entities.encode(video.category)}</video:category>` } if (video.gallery_loc) { tags += `<video:gallery_loc>${entities.encode(video.gallery_loc)}</video:gallery_loc>` } tags += '</video:video>' return tags } async function respond(ctx, data, { header, transform }) { if (await data.count() === 0) { return } ctx.set('Content-Type', 'text/xml; charset=utf-8') ctx.set('Cache-Control', `max-age=${config.cache.sitemap}`) ctx.body = new PassThrough() ctx.body.on('error', async() => { try { await data.close() } catch (e) { logger.debug(e) } }) ctx.body.write(header) const trans = new Transform({ writableObjectMode: true, transform }) trans.setEncoding('utf8') trans.on('end', () => { ctx.body.end('</urlset>') }) data.pipe(trans).pipe(ctx.body, { end: false }) } async function sitemap(ctx) { const site = ctx.state.origin const limit = ctx.queries.int('limit', { defaults: PAGE_LIMIT }) checkLimitParam(limit, PAGE_LIMIT) const page = ctx.params.int('page', { defaults: 1 }) checkPageParam(page) const query = { site } const options = { skip: (page - 1) * limit, limit } logger.debug('query sitemaps', query, options) const data = await mongo.db.collection('sitemaps').find(query, options) await respond(ctx, data, standardSitemapStream) } async function googleSitemap(ctx) { const site = ctx.state.origin const limit = ctx.queries.int('limit', { defaults: GOOGLE_LIMIT }) checkLimitParam(limit, GOOGLE_LIMIT) const page = ctx.params.int('page', { defaults: 1 }) checkPageParam(page) const query = { site } const options = { skip: (page - 1) * limit, limit } logger.debug('query sitemaps', query, options) const data = await mongo.db.collection('sitemaps').find(query, options) await respond(ctx, data, googleSitemapStream) } async function googleSitemapItem(ctx) { const site = ctx.state.origin const path = ctx.params.string('path') const query = { site, path } const options = { limit: 1 } logger.debug('query sitemaps', query, options) const data = await mongo.db.collection('sitemaps').find(query, options) await respond(ctx, data, googleSitemapStream) } async function googleNewsSitemap(ctx) { const site = ctx.state.origin const limit = ctx.queries.int('limit', { defaults: GOOGLE_LIMIT }) checkLimitParam(limit, GOOGLE_LIMIT) const page = ctx.params.int('page', { defaults: 1 }) checkPageParam(page) const query = { site, 'news.publication_date': { $gte: twoDaysAgo() } } const options = { skip: (page - 1) * limit, limit } logger.debug('query sitemaps', query, options) const data = await mongo.db.collection('sitemaps').find(query, options) await respond(ctx, data, googleNewsSitemapStream) } function twoDaysAgo() { return new Date(Date.now() - 2 * 24 * 60 * 60 * 1000) } async function googleImageSitemap(ctx) { const site = ctx.state.origin const limit = ctx.queries.int('limit', { defaults: GOOGLE_LIMIT }) checkLimitParam(limit, GOOGLE_LIMIT) const page = ctx.params.int('page', { defaults: 1 }) checkPageParam(page) const query = { site, hasImages: true } const options = { skip: (page - 1) * limit, limit } logger.debug('query sitemaps', query, options) const data = await mongo.db.collection('sitemaps').find(query, options) await respond(ctx, data, googleImageSitemapStream) } async function googleVideoSitemap(ctx) { const site = ctx.state.origin const limit = ctx.queries.int('limit', { defaults: GOOGLE_LIMIT }) checkLimitParam(limit, GOOGLE_LIMIT) const page = ctx.params.int('page', { defaults: 1 }) checkPageParam(page) const query = { site, hasVideos: true } const options = { skip: (page - 1) * limit, limit } logger.debug('query sitemaps', query, options) const data = await mongo.db.collection('sitemaps').find(query, options) await respond(ctx, data, googleVideoSitemapStream) } async function robotsTxt(ctx) { const site = ctx.state.origin const limit = ctx.queries.int('limit', { defaults: PAGE_LIMIT }) checkLimitParam(limit, PAGE_LIMIT) const googleLimit = ctx.queries.int('googleLimit', { defaults: GOOGLE_LIMIT }) checkLimitParam(googleLimit, GOOGLE_LIMIT) const queryAll = { site } const queryNews = { site, 'news.publication_date': { $gte: twoDaysAgo() } } const queryImages = { site, hasImages: true } const queryVideos = { site, hasVideos: true } logger.debug('count sitemaps', queryAll, queryNews, queryImages, queryVideos) const sitemaps = mongo.db.collection('sitemaps') const [allCount, newsCount, imageCount, videoCount, rules] = await Promise.all([ sitemaps.countDocuments(queryAll), sitemaps.countDocuments(queryNews), sitemaps.countDocuments(queryImages), sitemaps.countDocuments(queryVideos), (async() => { let url = site + '/robots.txt' if (ctx.state.site && ctx.state.site.rewrites) { url = new URLRewriter(ctx.state.site.rewrites).from(url) if (!url) { return '' } } logger.debug('fetch robots.txt:', url) try { const res = await fetch(url, { headers: { accept: 'text/plain' } }) if (!res.ok || !res.headers.get('content-type').includes('text/plain')) { return '' } return res.text() } catch (e) { return '' } })() ]) const normalSitemapIndexCount = Math.ceil(allCount / limit / PAGE_LIMIT) const googleSitemapIndexCount = Math.ceil(allCount / googleLimit / PAGE_LIMIT) const newsSitemapIndexCount = Math.ceil(newsCount / googleLimit / PAGE_LIMIT) const imageSitemapIndexCount = Math.ceil(imageCount / googleLimit / PAGE_LIMIT) const videoSitemapIndexCount = Math.ceil(videoCount / googleLimit / PAGE_LIMIT) ctx.set('Cache-Control', `max-age=${config.cache.robotsTxt}`) ctx.body = rules + '\n' for (let n = 1; n <= normalSitemapIndexCount; n++) { ctx.body += `Sitemap: ${site}/sitemap.index.${n}.xml` if (limit !== PAGE_LIMIT) { ctx.body += `?limit=${limit}` } ctx.body += '\n' } for (let n = 1; n <= googleSitemapIndexCount; n++) { ctx.body += `Sitemap: ${site}/sitemap.index.google.${n}.xml` if (googleLimit !== GOOGLE_LIMIT) { ctx.body += `?limit=${googleLimit}` } ctx.body += '\n' } for (let n = 1; n <= newsSitemapIndexCount; n++) { ctx.body += `Sitemap: ${site}/sitemap.index.google.news.${n}.xml` if (googleLimit !== GOOGLE_LIMIT) { ctx.body += `?limit=${googleLimit}` } ctx.body += '\n' } for (let n = 1; n <= imageSitemapIndexCount; n++) { ctx.body += `Sitemap: ${site}/sitemap.index.google.image.${n}.xml` if (googleLimit !== GOOGLE_LIMIT) { ctx.body += `?limit=${googleLimit}` } ctx.body += '\n' } for (let n = 1; n <= videoSitemapIndexCount; n++) { ctx.body += `Sitemap: ${site}/sitemap.index.google.video.${n}.xml` if (googleLimit !== GOOGLE_LIMIT) { ctx.body += `?limit=${googleLimit}` } ctx.body += '\n' } } async function _sitemapIndex(ctx, type) { const MAX = type === 'normal' ? PAGE_LIMIT : GOOGLE_LIMIT const site = ctx.state.origin const limit = ctx.queries.int('limit', { defaults: MAX }) checkLimitParam(limit, MAX) const page = ctx.params.int('page', { defaults: 1 }) checkPageParam(page) const query = { site } if (type === 'news') { query['news.publication_date'] = { $gte: twoDaysAgo() } } else if (type === 'image') { query.hasImages = true } else if (type === 'video') { query.hasVideos = true } const options = { skip: (page - 1) * limit * PAGE_LIMIT, limit: limit * PAGE_LIMIT } logger.debug('count sitemaps', query, options) const docCount = await mongo.db.collection('sitemaps').countDocuments(query, options) if (docCount) { ctx.set('Content-Type', 'text/xml; charset=utf-8') ctx.set('Cache-Control', `max-age=${config.cache.sitemap}`) const stream = new PassThrough() ctx.body = stream stream.write('<?xml version="1.0" encoding="UTF-8"?><sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">') let prefix if (type === 'normal') { prefix = site + '/sitemap' } else if (type === 'google') { prefix = site + '/sitemap.google' } else { prefix = site + '/sitemap.google.' + type } const start = (page - 1) * limit const pageCount = Math.ceil(docCount / limit) for (let n = 1; n <= pageCount; n++) { stream.write(`<sitemap><loc>${prefix}.${start + n}.xml`) if (limit !== MAX) { stream.write(`?limit=${limit}`) } stream.write('</loc></sitemap>') } stream.end('</sitemapindex>') } } function sitemapIndex(ctx) { return _sitemapIndex(ctx, 'normal') } function googleSitemapIndex(ctx) { return _sitemapIndex(ctx, 'google') } function googleNewsSitemapIndex(ctx) { return _sitemapIndex(ctx, 'news') } function googleImageSitemapIndex(ctx) { return _sitemapIndex(ctx, 'image') } function googleVideoSitemapIndex(ctx) { return _sitemapIndex(ctx, 'video') } module.exports = { robotsTxt, sitemap, googleSitemap, googleSitemapItem, googleNewsSitemap, googleImageSitemap, googleVideoSitemap, sitemapIndex, googleSitemapIndex, googleNewsSitemapIndex, googleImageSitemapIndex, googleVideoSitemapIndex }