fetch-fic
Version:
Package up delicious, delicious fanfic from various sources into epub ebooks ready for reading in your ereader of choice.
463 lines (436 loc) • 17.3 kB
JavaScript
/* eslint-disable no-useless-escape */
const url = require('url')
const Site = use('site')
const moment = require('moment-timezone')
// Forum hostnames mapped to the publisher name reported for each.
// The Xenforo constructor adds a "not yet been tested" warning for any
// hostname missing from this table and falls back to the bare hostname
// as the publisher name.
const knownSites = {
'forums.sufficientvelocity.com': 'Sufficient Velocity',
'forums.spacebattles.com': 'Spacebattles',
'forum.questionablequesting.com': 'Questionable Questing',
'questionablequesting.com': 'Questionable Questing'
}
/**
 * Site handler for XenForo-style forum threads.
 *
 * Collects fic metadata either from a thread's threadmarks page
 * (`getFicMetadata`) or by treating the links in a single post as a table of
 * contents (`scrapeFicMetadata`), and converts individual posts into chapter
 * content (`getChapter`).
 */
class Xenforo extends Site {
  /**
   * Report whether a URL's path looks like a thread, post, `index.php?topic`
   * or `goto/post?id` link. Only the path is inspected, never the hostname.
   *
   * @param {string} siteUrlStr URL to test.
   * @returns {boolean} true when the path matches one of the known shapes.
   */
  static matches (siteUrlStr) {
    const siteUrl = url.parse(siteUrlStr)
    return /^[/](threads|posts)[/]|^[/]index[.]php[?]topic|^[/]goto[/]post[?]id/.test(siteUrl.path)
  }

  /**
   * @param {string} siteUrlStr URL of the thread or post being fetched.
   */
  constructor (siteUrlStr) {
    super(siteUrlStr)
    const siteUrl = url.parse(siteUrlStr)
    const hostname = siteUrl.hostname
    if (!knownSites[hostname]) {
      this.warnings.push(`Has not yet been tested with ${hostname}, may not work.`)
    }
    this.publisher = hostname
    this.publisherName = knownSites[hostname] || hostname
    this.canScrape = true
    // The thread slug is everything after /threads/ up to the first dot.
    const path = siteUrl.pathname || siteUrl.path || ''
    const nameMatch = path.match(/^[/]threads[/]([^.]+)/)
    this.name = nameMatch && nameMatch[1]
  }

  /**
   * Populate `fic` from the thread's threadmarks page(s): title, title tags,
   * chapters, created/modified dates and, from the first chapter, the author,
   * tags and notes.
   *
   * @param {Function} fetch Resolves to `[meta, html]` for a URL and exposes
   *   `withOpts`.
   * @param {Object} fic Fic record to fill in; mutated in place.
   * @returns {Promise<void>}
   */
  async getFicMetadata (fetch, fic) {
    const fetchWithCheerio = async pageUrl => {
      const cheerio = require('cheerio')
      const [, html] = await fetch(pageUrl)
      return cheerio.load(html)
    }

    fic.link = this.link
    fic.publisher = this.publisherName
    // Computed once: threadmarkUrl() records a warning for non-thread URLs
    // and calling it twice would record that warning twice.
    const threadmarkUrl = this.threadmarkUrl()
    const $ = await fetchWithCheerio(threadmarkUrl)
    const base = $('base').attr('href') || threadmarkUrl
    const tat = this.detagTitle(this.scrapeTitle($))
    fic.title = tat.title
    fic.tags = fic.tags.concat(tat.tags.map(t => `title:${t}`))
    const $sections = $('div.threadmarks ol.tabs li')
    let leastRecent
    let mostRecent
    const tz = this.getTz($)

    const loadThreadmarks = (type, $page) => {
      let chapters = $page('li.threadmarkItem')
      if (chapters.length === 0) chapters = $page('li.primaryContent') // qq
      chapters.each((ii, chapter) => {
        const $chapter = $page(chapter)
        $chapter.find('li').remove() // remove child chapters so that $link.text() works right
        if ($chapter.find('a.username').length) return
        const $link = $chapter.find('a')
        const href = $link.attr('href')
        // An entry without a link target cannot be fetched as a chapter.
        if (!href) return
        const name = $link.text().trim()
        const link = this.normalizeLink(href, base)
        const created = this.dateTime($chapter.find('.DateTime'), tz)
        if (!leastRecent || created < leastRecent) leastRecent = created
        if (!mostRecent || created > mostRecent) mostRecent = created
        fic.chapters.addChapter({name, type, link, created})
      })
    }

    loadThreadmarks('chapter', $)
    if ($sections.length > 1) {
      // The first tab is the page already loaded above; every further tab is
      // a separate threadmark category with its own page.
      const sections = []
      $sections.each((ii, section) => {
        if (ii === 0) return
        const $section = $(section)
        const href = $section.find('a').attr('href')
        if (!href) return
        sections.push({type: $section.text().trim(), link: url.resolve(base, href)})
      })
      for (const section of sections) {
        loadThreadmarks(section.type, await fetchWithCheerio(section.link))
      }
    }
    fic.created = leastRecent
    fic.modified = mostRecent
    if (!fic.chapters.length) return
    const chapter = await fic.chapters[0].getContent(fetch.withOpts({cacheBreak: false}))
    fic.tags = fic.tags.concat(chapter.tags)
    fic.author = chapter.author
    fic.authorUrl = chapter.authorUrl
    // Notes are the first line of the first chapter's text.
    fic.notes = chapter.$content.text().trim().replace(/^([^\n]+)[\s\S]*?$/, '$1')
  }

  /**
   * Populate `fic` by treating the post at `this.link` as a table of
   * contents: every link in it that points at a thread/post-like path is
   * fetched and, when that succeeds, added as a chapter.
   *
   * Existing values on `fic` are kept, since this may run in addition to
   * `getFicMetadata`.
   *
   * @param {Function} fetch Resolves to `[meta, html]` for a URL and exposes
   *   `withOpts`.
   * @param {Object} fic Fic record to fill in; mutated in place.
   * @returns {Promise<void>}
   */
  async scrapeFicMetadata (fetch, fic) {
    if (!fic.publisher) fic.publisher = this.publisherName
    const Chapter = use('fic').Chapter
    const chapter = await Chapter.getContent(fetch, this.link)
    // we guard all the fic metadata updates because we might be
    // acting in addition to the result from getFicMetadata
    if (!fic.link) fic.link = this.normalizeLink(chapter.link)
    const tz = this.getTz(chapter.$)
    if (!fic.created) fic.created = this.dateTime(chapter.$('.DateTime'), tz)
    // "No tags" covers both a missing list and an empty one.
    const hasTags = Boolean(fic.tags && fic.tags.length)
    if (!fic.title || !hasTags) {
      const tat = this.detagTitle(this.scrapeTitle(chapter.$))
      if (!fic.title) fic.title = tat.title
      if (!hasTags) fic.tags = tat.tags.map(t => `title:${t}`)
    }
    fic.tags = fic.tags.concat(this.getTags(chapter.$))
    if (!fic.author) fic.author = chapter.author
    if (!fic.authorUrl) fic.authorUrl = chapter.authorUrl
    const firstPara = chapter.$content.text().trim().replace(/^([^\n]+)[\s\S]*?$/, '$1')
    if (!fic.notes) fic.notes = firstPara

    const textOf = node => node ? chapter.$content(node).text().trim() : ''
    const chapters = []
    chapter.$content('a').each((_, link) => {
      const $link = chapter.$content(link)
      const rawHref = $link.attr('href')
      // Anchors without a target (e.g. named anchors) can't be chapters.
      if (!rawHref) return
      const href = this.normalizeLink(rawHref, chapter.base)
      let name = $link.text().trim()
      if (name === '↑') return // don't add links to quoted text as chapters
      // if the name is a link, try to find one elsewhere
      if (/^https?:[/][/]/.test(name) || / \| Page \d+$/.test(name)) {
        // Walk backwards over the preceding siblings looking for a text node
        // to use as the name. A link may be the first child of its parent, so
        // every step has to tolerate running out of siblings.
        let prev = $link[0].prev
        let prevText = textOf(prev)
        if (prev && prev.type === 'text' && prevText === '') {
          prev = prev.prev
          prevText = textOf(prev)
        }
        if (prev && prev.type !== 'text') {
          prev = prev.prev
          prevText = textOf(prev)
        }
        if (prev && prev.type === 'text') {
          name = prevText
        }
      }
      if (/^[/](?:threads|posts|s|art)[/]|^[/]index[.]php[?]topic/.test(url.parse(href).path)) {
        chapters.push({name, link: href})
      }
    })

    const fetchWithCache = fetch.withOpts({cacheBreak: false})
    if (!chapter.name) chapter.name = fic.title
    fic.addChapter(chapter)
    const forEach = use('for-each')
    await forEach(chapters, async ch => {
      try {
        const inf = await this.getChapter(fetchWithCache, new Chapter(ch))
        fic.modified = inf.modified || inf.created
        fic.addChapter(ch)
      } catch (err) {
        // Best effort: a link that can't be fetched as a chapter is left out,
        // but the reason is reported rather than silently dropped.
        process.emit('debug', `Skipping ${ch.link}: ${err && err.message}`)
      }
    })
    if (fic.chapters.length > 1) {
      fic.chapters[0].name = 'Table of Contents'
      fic.includeTOC = false
    }
  }

  /**
   * Fetch one post and turn it into chapter content: locate the message,
   * annotate quotes, spoilers and coloured text with `xenforo-*` style
   * markers, and record author, creation date and tags.
   *
   * @param {Function} fetch Resolves to `[meta, html]` for a URL and exposes
   *   `withOpts`.
   * @param {Object} chapterInfo Chapter descriptor; `fetchWith()`, `name` and
   *   `spoilers` are read from it.
   * @param {boolean} [retried] Set on the internal retry so that a cached
   *   page without content is re-fetched at most once.
   * @returns {Promise<Object>} The populated chapter content object.
   * @throws {Error} When the fetch 404s, no message content is found, or the
   *   page carries an error panel.
   */
  async getChapter (fetch, chapterInfo, retried) {
    let meta, html
    try {
      [meta, html] = await fetch(chapterInfo.fetchWith())
    } catch (err) {
      if (err.meta && err.meta.status === 404) {
        throw new Error('No chapter found at ' + chapterInfo.fetchWith())
      }
      throw err
    }
    process.emit('debug', `Fetched ${chapterInfo.name}: ${chapterInfo.fetchWith()}`)
    const ChapterContent = use('chapter-content')
    const chapter = new ChapterContent(chapterInfo, {site: this, html})

    // Decide which fragment identifies the post: an explicit #post… fragment
    // on the requested link wins, otherwise the one on the final URL does.
    const chapterHash = url.parse(chapter.link).hash
    const parsed = url.parse(meta.finalUrl)
    let id
    if (/^#post/.test(chapterHash)) {
      id = chapterHash || parsed.hash || ''
    } else {
      id = parsed.hash || chapterHash || ''
    }
    let finalUrl = meta.finalUrl
    if (id) {
      parsed.hash = id
      finalUrl = url.format(parsed)
    }
    if (finalUrl !== chapter.link) {
      chapter.fetchFrom = finalUrl
    }

    const tz = this.getTz(chapter.$)
    let $message
    if (id.length > 1) {
      $message = chapter.$('li.message#' + id.slice(1).replace(/[)]$/, ''))
    } else {
      $message = chapter.$(chapter.$('li.message')[0])
    }
    const $content = $message.find('article')
    if ($content.length === 0) {
      const $error = chapter.$('div.errorPanel')
      if ($error.length !== 0) {
        throw new Error('Error fetching ' + chapter.link + ': ' + $error.text().trim())
      }
      if (!meta.fromCache || retried) {
        throw new Error('No chapter found at ' + chapter.link)
      }
      // The empty page came from the cache: try once more, bypassing it.
      process.emit('debug', `No content found, retrying ${chapterInfo.name}: ${chapterInfo.fetchWith()}`)
      return this.getChapter(fetch.withOpts({cacheBreak: true}), chapter, true)
    }

    // at least on qq
    const $contentWarning = $content.find('dl.adv_accordion')
    if ($contentWarning.length) {
      const label = $contentWarning.find('dt').html()
      const value = $contentWarning.find('dd').html()
      $contentWarning.replaceWith(`<div><h3>${label}</h3>${value}</div>`)
    }

    // Selector kept verbatim from the original implementation.
    $content.find('.quoteContainer < aside').each((ii, quote) => {
      const $quote = chapter.$(quote)
      const $attribution = $quote.find('.attribution')
      let quoteStyle = `xenforo-quote: true`
      if ($attribution.length !== 0) {
        const said = $attribution.text().match(/(.*) said:/)
        if (said) {
          const user = said[1].trim()
          const postHref = $attribution.find('a').attr('href')
          const postMatch = postHref ? postHref.match(/(\d+)/) : null
          quoteStyle = postMatch
            ? `xenforo-quote: ${postMatch[1]} '${user}';`
            : `xenforo-quote: '${user}';`
        } else {
          // Unrecognised attribution text: report it and treat the quote as
          // unattributed instead of failing the whole chapter.
          process.emit('debug', 'QUOTE', $quote.html())
        }
      }
      $quote.find('.quote').attr('style', quoteStyle)
      $content.find('.quoteExpand').remove()
    })

    const $spoilers = $content.find('.bbCodeSpoilerContainer')
    if (chapterInfo.spoilers) {
      $spoilers.each((ii, spoiler) => {
        const $spoiler = chapter.$(spoiler)
        const spoilerLabel = $spoiler.find('.bbCodeSpoilerButton').text().trim()
        $spoiler.attr('style', `border: solid black 1px; xenforo-spoiler: '${spoilerLabel}';`)
        if (spoilerLabel === 'Spoiler') {
          $spoiler.find('.bbCodeSpoilerButton').remove()
        } else {
          $spoiler.find('.bbCodeSpoilerButton').replaceWith(`<b>${spoilerLabel}</b><br/>`)
        }
      })
    } else {
      $spoilers.remove()
    }

    chapter.base = chapter.$('base').attr('href') || finalUrl
    const $author = chapter.$($message.find('a.username')[0])
    const authorHref = $author.attr('href')
    // A post may have no author link; leave authorUrl unset in that case.
    if (authorHref) chapter.authorUrl = url.resolve(chapter.base, authorHref)
    chapter.author = $author.text().trim()
    chapter.created = this.dateTime($message.find('a.datePermalink .DateTime'), tz)

    // Reference lightness per forum, used to scale coloured text into an
    // opacity. NOTE(review): presumably each value is that forum's default
    // text colour — confirm.
    let baseLightness = 100
    const color = require('color-ops')
    if (/spacebattles/.test(chapter.link)) {
      baseLightness = color.lightness(color.rgb(204, 204, 204))
    } else if (/questionablequesting/.test(chapter.link)) {
      baseLightness = color.lightness(color.rgb(86, 86, 86))
    } else if (/sufficientvelocity/.test(chapter.link)) {
      baseLightness = color.lightness(color.rgb(230, 230, 230))
    }
    $content.find('[style *= color]').each((ii, vv) => {
      const style = chapter.$(vv).attr('style')
      const colorMatch = style.match(/color: #(\S\S)(\S\S)(\S\S)/)
      let ns = ''
      let opacity = 1
      if (colorMatch) {
        const r = Number('0x' + colorMatch[1])
        const g = Number('0x' + colorMatch[2])
        const b = Number('0x' + colorMatch[3])
        ns += `xenforo-color: rgb(${r},${g},${b});`
        const lightness = color.lightness(color.rgb(r, g, b))
        opacity = lightness / baseLightness
        if (baseLightness < 0.5) opacity = 1 - opacity
        if (opacity < 0.25) opacity = 0.25
        if (opacity > 1) {
          ns += 'opacity: 1; font-weight: bolder;'
        } else {
          ns += `opacity: ${opacity};`
        }
        // Quantise the channels so near-equal values compare as equal, then
        // encode the dominant hue as a border style.
        const red = Math.round(r / 25)
        const green = Math.round(g / 25)
        const blue = Math.round(b / 25)
        if (red > green && red > blue) { // red
          ns += 'border-style: hidden dashed;'
        } else if (green > red && green > blue) { // green
          ns += 'border-style: hidden double;'
        } else if (blue > red && blue > green) { // blue
          ns += 'border-style: hidden solid;'
        } else if (red === green && red > blue) { // yellow?
          ns += 'border-style: dashed double;'
        } else if (red === blue && red > green) { // magenta
          ns += 'border-style: dashed solid;'
        } else if (green === blue && green > red) { // cyan
          ns += 'border-style: double solid;'
        }
      } else if (style === 'color: transparent') {
        opacity = 0.25
        ns += 'text-decoration: line-through; font-style: oblique; opacity: 0.25;'
      }
      // The original style attribute is replaced wholesale.
      chapter.$(vv).attr('style', ns)
    })

    chapter.tags = this.getTags(chapter.$)
    if (/Discussion in .*Quest(s|ing)/i.test(chapter.$('#pageDescription').text())) {
      chapter.tags.push('Quest')
    }
    $content.find('div.messageTextEndMarker').remove()
    chapter.content = $content.html().trim()
      // content is blockquoted, for some reason
      .replace(/^\s*<blockquote[^>]*>([\s\S]+)<[/]blockquote>\s*$/, '$1')
      // bullshit sv holloween thingy
      .replace(/^<p style="padding: 5px 0px; font-weight: bold; font-style: oblique; text-align: center; font-size: 12pt">.*?<[/]p>/g, '')
    return chapter
  }

  /**
   * Collect the trimmed text of every `.tagList a.tag` element.
   *
   * @param {Function} $ Cheerio-style selector function for the page.
   * @returns {string[]} Tag names in document order.
   */
  getTags ($) {
    const tags = []
    $('.tagList a.tag').each((ii, tag) => {
      tags.push($(tag).text().trim())
    })
    return tags
  }

  /**
   * Extend the inherited sanitize-html configuration so that `img` tags are
   * passed through `cleanImages`.
   *
   * @returns {Object} The configuration object returned by the superclass,
   *   with `transformTags.img` set.
   */
  sanitizeHtmlConfig () {
    const config = super.sanitizeHtmlConfig()
    config.transformTags.img = (tagName, attribs) => this.cleanImages(tagName, attribs)
    return config
  }

  /**
   * Tag transformer for images. Smilie sprites become a `span` holding an
   * emoji (or the image's alt text for unmapped smilies); images with no
   * `src` or an `http…` `src` become an empty `span`; anything else is
   * returned unchanged.
   *
   * @param {string} tagName Name of the tag being transformed.
   * @param {Object} attribs Attributes of the tag.
   * @returns {Object} Replacement descriptor (`tagName` plus `text` or
   *   `attribs`).
   */
  cleanImages (tagName, attribs) {
    if (attribs.class) {
      const classes = attribs.class.trim().split(/\s+/)
      if (classes.some(this.andMatches(/^mceSmilieSprite$/))) {
        const smilies = classes.filter(this.andMatches(/^mceSmilie\d+$/))
        let text
        switch (smilies[0]) {
          case 'mceSmilie1': text = '🙂'; break
          case 'mceSmilie2': text = '😉'; break
          case 'mceSmilie3': text = '🙁'; break
          case 'mceSmilie4': text = '😡'; break
          case 'mceSmilie5': text = '🙃'; break
          case 'mceSmilie6': text = '😎'; break
          case 'mceSmilie7': text = '😛'; break
          case 'mceSmilie8': text = '😆'; break
          case 'mceSmilie9': text = '😮'; break
          case 'mceSmilie10': text = '😳'; break
          case 'mceSmilie11': text = '🙄'; break
          case 'mceSmilie12': text = '😝'; break
          case 'mceSmilie58': text = '😭'; break
          case 'mceSmilie59': text = '😏'; break
          case 'mceSmilie60': text = '😇'; break
          case 'mceSmilie62': text = '😂'; break
          case 'mceSmilie63': text = '😆😂'; break
          default: text = attribs.alt
        }
        return {tagName: 'span', text: text}
      }
    }
    if (!attribs.src || /^http/.test(attribs.src)) {
      return {tagName: 'span', text: ''}
    }
    return {tagName: tagName, attribs: attribs}
  }

  /**
   * Build a predicate that tests its argument against `pattern`.
   *
   * @param {RegExp} pattern
   * @returns {function(string): boolean}
   */
  andMatches (pattern) {
    return item => pattern.test(item)
  }

  /**
   * Derive the threadmarks URL from `this.raw`. When the path is not a
   * `/threads/<name>.<id>` path, a single warning is recorded and the URL is
   * returned as parsed.
   *
   * @returns {string}
   */
  threadmarkUrl () {
    const threadUrl = url.parse(this.raw)
    const path = threadUrl.pathname || threadUrl.path
    const threadMatch = /^([/]threads[/](?:[^/]+\.)?\d+)(?:[/].*)?$/
    if (threadMatch.test(path)) {
      threadUrl.hash = ''
      threadUrl.pathname = path.replace(threadMatch, '$1/threadmarks')
    } else {
      // One string per warning: the offending URL is part of the message
      // rather than a second array entry.
      this.warnings.push(`This does not appear to be a thread Url, can't find threadmarks: ${this.raw}`)
    }
    return url.format(threadUrl)
  }

  /**
   * Read the thread title from `div.titleBar h1`, dropping any `span`
   * children and a leading "Threadmarks for: " label.
   *
   * @param {Function} $ Cheerio-style selector function for the page.
   * @returns {string|undefined} The title, or undefined if scraping threw.
   */
  scrapeTitle ($) {
    try {
      const titleChunk = $('div.titleBar h1')
      titleChunk.find('span').remove()
      return titleChunk.text().replace(/Threadmarks for: /i, '').trim()
    } catch (_) {
      return undefined
    }
  }

  /**
   * Split `(…)` and `[…]` groups out of a title. Each group is further split
   * on `/`, `,` and `|`; empty pieces are discarded.
   *
   * @param {string} [title] Raw title; a non-string is passed through
   *   untouched with no tags.
   * @returns {{title: (string|undefined), tags: string[]}}
   */
  detagTitle (title) {
    const tags = []
    // scrapeTitle() returns undefined on failure; don't crash on that.
    if (typeof title !== 'string') return {title, tags}
    const tagExp = /[(](.*?)[)]|[\[](.*?)[\]]/g
    const tagMatch = title.match(tagExp)
    if (tagMatch) {
      title = title.replace(tagExp, '').trim()
      for (const group of tagMatch) {
        // slice(1, -1) strips the surrounding bracket pair.
        for (const piece of group.slice(1, -1).split(/[/,|]/)) {
          const tag = piece.trim()
          if (tag) tags.push(tag)
        }
      }
    }
    return {title, tags}
  }

  /**
   * Canonicalise a link: upgrade `http:` to `https:` (except for
   * `index.php` links), resolve it against `base` when given, and rewrite
   * thread-anchored and `goto/post` URLs to the `/posts/<id>` form.
   *
   * @param {string} href Link to normalise.
   * @param {string} [base] Base URL for resolving relative links.
   * @returns {string}
   */
  normalizeLink (href, base) {
    // force ssl
    if (!/index[.]php/.test(href)) href = href.replace(/^http:/, 'https:')
    // resolve base url
    if (base) href = url.resolve(base, href)
    // normalize post urls
    href = href.replace(/[/]threads[/][^/]+[/](?:page-\d+)?#post-(\d+)$/, '/posts/$1')
      .replace(/([/]posts[/][^/]+)[/]$/, '$1')
      .replace(/[/]goto[/]post[?]id=(\d+).*?$/, '/posts/$1')
    return href
  }

  /**
   * Read a timestamp from a `.DateTime` element, trying `data-time`, then
   * `data-datestring`/`data-timestring`, then `title`.
   *
   * @param {Object} elem Cheerio selection for the date element.
   * @param {string} tz Timezone name used for the two textual forms.
   * @returns {Object|undefined} A moment, or undefined when the element
   *   carries none of the attributes.
   */
  dateTime (elem, tz) {
    if (elem.attr('data-time')) {
      // Seconds and milliseconds are zeroed on this form.
      return moment.unix(elem.attr('data-time')).millisecond(0).second(0)
    } else if (elem.attr('data-datestring')) {
      return moment.tz(elem.attr('data-datestring') + ' ' + elem.attr('data-timestring'), 'MMM DD, YYYY h:mm A Z', tz)
    } else if (elem.attr('title')) {
      return moment.tz(elem.attr('title'), 'MMM DD, YYYY [at] h:mm A Z', tz)
    }
    return undefined
  }

  /**
   * Count the words in a chapter, leaving out `.bbCodeQuote` elements when
   * the chapter's HTML mentions that class.
   *
   * @param {Object} chapter Chapter with `content` (HTML string) and
   *   `$content`.
   * @returns {number} Result of `@iarna/word-count` on the chapter text.
   */
  countStoryWords (chapter) {
    const wordcount = require('@iarna/word-count')
    // Look for the class name as it appears in markup; the selector form
    // ".bbCodeQuote" never occurs in HTML, so testing for it never matched.
    if (/bbCodeQuote/.test(chapter.content)) {
      const cheerio = require('cheerio')
      const $content = cheerio.load(chapter.content)
      $content('.bbCodeQuote').remove()
      return wordcount($content.text().trim())
    }
    return wordcount(chapter.$content.text().trim())
  }

  /**
   * Timezone used when parsing textual timestamps, chosen from
   * `this.publisherName`.
   *
   * @param {Function} [$] Accepted but not used.
   * @returns {string} A timezone name.
   */
  getTz ($) {
    switch (this.publisherName) {
      case 'Sufficient Velocity':
      case 'Spacebattles':
        return 'America/New_York'
      case 'Questionable Questing':
        return 'Europe/London'
      default:
        return 'America/Los_Angeles'
    }
  }
}
// The module's export is the Xenforo class itself.
module.exports = Xenforo