relatt-scraper
Version:
Metascarper
117 lines (109 loc) • 3.56 kB
JavaScript
import slug from "slug";
import url from "url";
import stripHtml from "string-strip-html";
import { v4 as uuidv4 } from "uuid";
const processLink = require("../functions/processLink")
const encode = require("./encode")
// Loose "looks like a URL" check: optional http(s) scheme, lowercase host,
// 2-5 letter TLD, optional port and path.
const VALID_URL_PATTERN = /^(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$/;

// An explicit http(s) scheme, matched case-insensitively.
const HTTP_SCHEME_PATTERN = /^https?:\/\//i;

// Vimeo / YouTube / Dailymotion player links; metadata is not scraped for these.
const VIDEO_URL_PATTERN = /(http:|https:|)\/\/(player.|www.)?(vimeo\.com|youtu(be\.com|\.be|be\.googleapis\.com)|dailymotion.com)\/(video\/|embed\/|watch\?v=|v\/)?([A-Za-z0-9._%-]*)(\&\S+)?/;

/**
 * Returns `link` unchanged when it already carries an http(s) scheme or passes
 * the loose URL check; otherwise drops everything up to the last `//` and
 * prefixes `http://`.
 *
 * Links that already start with http(s):// are never rewritten, so an https
 * link whose host fails the loose check (uppercase host, long TLD, localhost)
 * is no longer downgraded to http.
 *
 * @param {string} link
 * @returns {string}
 * @throws {TypeError} when `link` is not a string
 */
function normalizeLink(link) {
  if (typeof link !== "string") {
    throw new TypeError("RSS item is missing a string `link`");
  }
  if (HTTP_SCHEME_PATTERN.test(link) || VALID_URL_PATTERN.test(link)) {
    return link;
  }
  return `http://${link.split("//").pop()}`;
}

/**
 * Builds "pathname[?query]" from a `url.parse` result. Trailing slashes are
 * stripped from the pathname unless the pathname is exactly "/".
 *
 * @param {{ pathname: (string|null), query: (string|null) }} parsed
 * @returns {string}
 */
function pathWithQuery({ pathname, query }) {
  // A missing pathname is treated as empty instead of throwing on `.replace`.
  const rawPath = pathname || "";
  const path = rawPath === "/" ? rawPath : rawPath.replace(/\/+$/, "");
  return query ? `${path}?${query}` : path;
}

/**
 * Returns the ISO-8601 form of the first candidate that is truthy and parses
 * to a valid date; falls back to the current time when none does.
 *
 * @param {...*} candidates
 * @returns {string}
 */
function firstIsoDate(...candidates) {
  for (const candidate of candidates) {
    if (!candidate) {
      continue;
    }
    const parsed = new Date(String(candidate));
    // toISOString() throws RangeError on an invalid date, so skip those.
    if (!Number.isNaN(parsed.getTime())) {
      return parsed.toISOString();
    }
  }
  return new Date().toISOString();
}

/**
 * Converts one feed category into a tag record. A category may be an object
 * with a `name` (and optionally a `slug`) or a bare value used as the name.
 *
 * @param {*} category
 * @returns {{ name: *, slug: string, ttuid: string }}
 */
function toTag(category) {
  if (category && category.name) {
    return {
      name: category.name,
      // Derive the slug from the name when the category does not carry one.
      slug: category.slug || slug(category.name, { lower: true }),
      ttuid: uuidv4(),
    };
  }
  return {
    name: category,
    slug: slug(category, { lower: true }),
    ttuid: uuidv4(),
  };
}

/**
 * Processes one item fetched from an RSS feed into a post-like record.
 *
 * Side effect: `item.link` is overwritten with its normalized form, and
 * `item.guid` is set to that link when the item has no guid.
 *
 * @param {object} item - Feed item; must have a string `link`.
 * @param {object} [options]
 * @param {boolean} [options.fetchImages] - When truthy, and the guid is not a
 *   video-player link, `processLink.metascrape(item.guid)` is called and its
 *   `image`, `title` and `description` are used. Scrape failures are logged
 *   and otherwise ignored.
 * @param {boolean} [options.fetchBody] - When truthy, `body` is `item.content`;
 *   otherwise it is an empty string.
 * @param {boolean} [options.processReadingTime] - Accepted for compatibility;
 *   currently has no effect (`read_time` is always 0).
 * @returns {Promise<object>} The processed record (pathname, guid data, titles,
 *   excerpts, slug, dates, tags, authors, read_time, link).
 * @throws {TypeError} when `item.link` is not a string
 */
export default async (
  item,
  { fetchImages, fetchBody, processReadingTime } = {}
) => {
  item.link = normalizeLink(item.link);
  if (!item.guid) {
    item.guid = item.link;
  }

  let image = "";
  let scrapedTitle = "";
  let scrapedExcerpt = "";
  if (fetchImages) {
    try {
      if (!VIDEO_URL_PATTERN.test(item.guid)) {
        const metadata = (await processLink.metascrape(item.guid)) || {};
        // Missing fields fall back to "" so stripHtml/slug always get strings.
        image = metadata.image || "";
        scrapedTitle = metadata.title || "";
        scrapedExcerpt = metadata.description || "";
      }
    } catch (err) {
      // Scraping is best-effort: report the failure and keep the feed values.
      console.warn(`Could not scrape metadata for ${item.guid}:`, err);
    }
  }

  // Reading time is not computed yet, regardless of `processReadingTime`.
  const read_time = 0;

  const tags = Array.isArray(item.categories)
    ? item.categories.map(toTag)
    : [];

  const authors = [];
  if (item.creator) {
    authors.push({ display_name: item.creator });
  }

  // A guid counts as a permalink when it parses to a URL with a hostname.
  const parsedGuid = url.parse(item.guid);
  const isPermalink = Boolean(parsedGuid.hostname);
  const guidPathName = isPermalink ? pathWithQuery(parsedGuid) : "";

  const pathname = pathWithQuery(url.parse(item.link));

  return {
    pathname,
    isPermalink,
    guidPathName,
    ttuid: uuidv4(),
    rss_title: encode.forceDecoding(item.title),
    title: encode.forceDecoding(scrapedTitle || item.title),
    status: item.status || "publish",
    body: fetchBody ? item.content : "",
    guid: item.guid,
    image,
    rss_excerpt: encode.forceDecoding(stripHtml(item.contentSnippet || "")),
    excerpt: encode.forceDecoding(stripHtml(scrapedExcerpt)),
    slug: slug(item.title || scrapedTitle, { lower: true }),
    published_at: {
      formatted: firstIsoDate(item.pubDate),
    },
    created_at: firstIsoDate(item.isoDate),
    modified_at: firstIsoDate(item.modified, item.lastBuildDate, item.isoDate),
    tags,
    authors,
    read_time,
    link: item.link,
  };
};