UNPKG

relatt-scraper

Version:

Metascraper

117 lines (109 loc) 3.56 kB
import slug from "slug";
import url from "url";
import stripHtml from "string-strip-html";
import { v4 as uuidv4 } from "uuid";

const processLink = require("../functions/processLink");
const encode = require("./encode");

// Accepts an optional http(s) scheme, a lowercase host, a 2-5 letter TLD and an
// optional port and path. Links that do not match are rewritten (see below).
const VALID_URL_REGEX = /^(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$/;

// Matches Vimeo / YouTube / Dailymotion style links; those are never scraped.
const VIDEO_URL_REGEX = /(http:|https:|)\/\/(player.|www.)?(vimeo\.com|youtu(be\.com|\.be|be\.googleapis\.com)|dailymotion.com)\/(video\/|embed\/|watch\?v=|v\/)?([A-Za-z0-9._%-]*)(\&\S+)?/;

/**
 * Converts a feed date value to an ISO-8601 string.
 *
 * @param {*} value - Date-like value taken from the feed item.
 * @returns {string|null} ISO string, or null when the value is missing or
 *   cannot be parsed (calling toISOString on an invalid Date would throw).
 */
const toIsoString = (value) => {
  if (!value) {
    return null;
  }
  const parsed = new Date(value.toString());
  return Number.isNaN(parsed.getTime()) ? null : parsed.toISOString();
};

/**
 * Builds "pathname[?query]" from a parsed URL, dropping trailing slashes from
 * every pathname except the root "/".
 *
 * @param {{ pathname: (string|null), query: (string|null) }} parsedUrl - Result of url.parse.
 * @returns {string} Normalised path; empty when the URL has no pathname.
 */
const buildPathWithQuery = ({ pathname, query }) => {
  // url.parse may report a null pathname; treat it as empty instead of crashing.
  const rawPath = pathname || "";
  const trimmedPath = rawPath !== "/" ? rawPath.replace(/\/+$/, "") : rawPath;
  return query ? `${trimmedPath}?${query}` : trimmedPath;
};

/**
 * Here we process items fetched from an RSS feed.
 * In preview mode, images, body and reading time are not taken into account.
 *
 * Note: `item.link` and `item.guid` are normalised in place on the passed item.
 *
 * @param {Object} item - Feed item; must carry a string `link`.
 * @param {Object} [options]
 * @param {boolean} [options.fetchImages] - Scrape the guid URL for image, title and description.
 * @param {boolean} [options.fetchBody] - Include `item.content` as the body.
 * @param {boolean} [options.processReadingTime] - Accepted for compatibility;
 *   reading time is not computed and `read_time` is always 0.
 * @returns {Promise<Object>} The normalised post record.
 * @throws {TypeError} When `item.link` is not a string.
 */
export default async (item, { fetchImages, fetchBody } = {}) => {
  if (typeof item.link !== "string") {
    throw new TypeError("item.link must be a string");
  }

  // NOTE(review): links that fail VALID_URL_REGEX (uppercase host, TLD longer
  // than 5 letters, ...) are rebuilt with an "http://" scheme, which downgrades
  // https links. Behaviour kept as-is; confirm whether this is intended.
  if (!VALID_URL_REGEX.test(item.link)) {
    item.link = `http://${item.link.split("//").pop()}`;
  }
  if (!item.guid) {
    item.guid = item.link;
  }

  let image = "";
  let scrapedTitle = "";
  let scrapedExcerpt = "";

  if (fetchImages && item.guid) {
    try {
      if (!item.guid.match(VIDEO_URL_REGEX)) {
        const metadata = (await processLink.metascrape(item.guid)) || {};
        // Keep the empty-string defaults when the scraper omits a field, so
        // stripHtml / slug below never receive undefined.
        image = metadata.image || "";
        scrapedTitle = metadata.title || "";
        scrapedExcerpt = metadata.description || "";
      }
    } catch (err) {
      // Scraping is best-effort: report the failure and fall back to feed data.
      console.warn(`Could not scrape metadata for ${item.guid}:`, err);
    }
  }

  // Reading time is not computed; the value is always 0.
  const read_time = 0;

  // Categories are either objects carrying their own name/slug or plain names.
  const tags = item.categories
    ? item.categories.map((category) =>
        category.name
          ? { name: category.name, slug: category.slug, ttuid: uuidv4() }
          : {
              name: category,
              slug: slug(category, { lower: true }),
              ttuid: uuidv4(),
            }
      )
    : [];

  const authors = item.creator ? [{ display_name: item.creator }] : [];

  // A guid counts as a permalink only when it parses to a URL with a hostname.
  const parsedGuid = url.parse(item.guid);
  const isPermalink = !!parsedGuid.hostname;
  const guidPathName = isPermalink ? buildPathWithQuery(parsedGuid) : "";

  const pathname = buildPathWithQuery(url.parse(item.link));

  const nowIso = new Date().toISOString();
  const createdAt = toIsoString(item.isoDate) || nowIso;

  return {
    pathname,
    isPermalink,
    guidPathName,
    ttuid: uuidv4(),
    rss_title: encode.forceDecoding(item.title),
    title: encode.forceDecoding(scrapedTitle || item.title),
    status: item.status || "publish",
    body: fetchBody ? item.content : "",
    guid: item.guid,
    image,
    rss_excerpt: encode.forceDecoding(stripHtml(item.contentSnippet || "")),
    excerpt: encode.forceDecoding(stripHtml(scrapedExcerpt)),
    slug: slug(item.title || scrapedTitle, { lower: true }),
    published_at: {
      formatted: toIsoString(item.pubDate) || nowIso,
    },
    created_at: createdAt,
    // First usable date wins: modified, then lastBuildDate, then isoDate.
    modified_at:
      toIsoString(item.modified) ||
      toIsoString(item.lastBuildDate) ||
      createdAt,
    tags,
    authors,
    read_time,
    link: item.link,
  };
};