// fetch-fic
// Version:
// Package up delicious, delicious fanfic from various sources into epub ebooks ready for reading in your ereader of choice.
// 400 lines (378 loc) • 15.3 kB
// JavaScript
'use strict'
/* eslint-disable no-useless-escape */
const url = require('url')
const Site = require('./site.js')
const cheerio = require('cheerio')
const color = require('color-ops')
// Forum hostnames this scraper has been tested against, mapped to the
// display name used as the publisher. Hosts that are not listed here still
// work on a best-effort basis but make the constructor record a warning.
const knownSites = {
'forums.sufficientvelocity.com': 'Sufficient Velocity',
'forums.spacebattles.com': 'Spacebattles',
'forum.questionablequesting.com': 'Questionable Questing',
'questionablequesting.com': 'Questionable Questing'
}
// Emoji stand-ins for smilie sprite images, keyed by the sprite's class name.
const smilieEmoji = new Map([
  ['mceSmilie1', '🙂'],
  ['mceSmilie2', '😉'],
  ['mceSmilie3', '🙁'],
  ['mceSmilie4', '😡'],
  ['mceSmilie5', '🙃'],
  ['mceSmilie6', '😎'],
  ['mceSmilie7', '😛'],
  ['mceSmilie8', '😆'],
  ['mceSmilie9', '😮'],
  ['mceSmilie10', '😳'],
  ['mceSmilie11', '🙄'],
  ['mceSmilie12', '😝'],
  ['mceSmilie58', '😭'],
  ['mceSmilie59', '😏'],
  ['mceSmilie60', '😇'],
  ['mceSmilie62', '😂'],
  ['mceSmilie63', '😆😂']
])

// Extra declarations appended for specific text colors, keyed by the exact
// inline style string found on the element.
const colorStyleHints = new Map([
  ['color: #ffcc99', 'font-style: italic;'],
  ['color: #99ffff', 'font-style: italic;'],
  ['color: #9999ff', 'font-family: fantasy; font-style: italic;'],
  ['color: #4d4dff', 'border-style: hidden dashed;'],
  ['color: #b3b300', 'border-style: hidden double;'],
  ['color: #b30000', 'border-style: hidden solid;']
])

// Returns the first line of `text` after trimming surrounding whitespace.
function firstLine (text) {
  return text.trim().replace(/^([^\n]+)[\s\S]*?$/, '$1')
}

// Returns the reference lightness used to scale text colors for the forum
// named in the chapter URL, or 0 when the forum is not recognised.
function siteBaseLightness (chapter) {
  if (/spacebattles/.test(chapter)) return color.lightness(color.rgb(204, 204, 204))
  if (/questionablequesting/.test(chapter)) return color.lightness(color.rgb(86, 86, 86))
  if (/sufficientvelocity/.test(chapter)) return color.lightness(color.rgb(230, 230, 230))
  return 0
}

// Picks a chapter name for a link whose own text is unusable, using the text
// node that precedes the link; returns `fallback` when there is none.
function nameFromPrecedingText ($content, linkNode, fallback) {
  let node = linkNode.prev
  // step over one whitespace-only text node
  if (node && node.type === 'text' && $content(node).text().trim() === '') node = node.prev
  // step over one non-text node sitting between the label and the link
  if (node && node.type !== 'text') node = node.prev
  if (node && node.type === 'text') return $content(node).text().trim()
  return fallback
}

// Tags every quote in the post with an `xenforo-quote` pseudo-style that
// records who was quoted (and which post, when known).
function annotateQuotes ($, $content) {
  const $quotes = $content.find('.quoteContainer < aside')
  $quotes.each((ii, quote) => {
    const $quote = $(quote)
    const $attribution = $quote.find('.attribution')
    // an attribution that is missing or not of the form "<user> said:" is
    // treated as an anonymous quote instead of crashing on a null match
    const said = $attribution.length !== 0 ? $attribution.text().match(/(.*) said:/) : null
    if (!said) {
      $quote.find('.quote').attr('style', `xenforo-quote: true`)
      return
    }
    const user = said[1].trim()
    const postHref = $attribution.find('a').attr('href')
    const postMatch = postHref ? postHref.match(/(\d+)/) : null
    if (postMatch) {
      $quote.find('.quote').attr('style', `xenforo-quote: ${postMatch[1]} '${user}';`)
    } else {
      $quote.find('.quote').attr('style', `xenforo-quote: '${user}';`)
    }
  })
  if ($quotes.length !== 0) $content.find('.quoteExpand').remove()
}

// Tags every spoiler container with its own label and replaces the toggle
// button with static markup.
function annotateSpoilers ($, $content) {
  // handled one container at a time so that several spoilers in one post do
  // not end up sharing a label made of all their button texts glued together
  $content.find('.bbCodeSpoilerContainer').each((ii, spoiler) => {
    const $spoiler = $(spoiler)
    const $button = $spoiler.find('.bbCodeSpoilerButton').first()
    const label = $button.text().trim()
    $spoiler.attr('style', `border: solid black 1px; xenforo-spoiler: '${label}';`)
    if (label === 'Spoiler') {
      $button.remove()
    } else {
      $button.replaceWith(`<b>${label}</b><br/>`)
    }
  })
}

// Rewrites inline text colors into an `xenforo-color` pseudo-style plus
// opacity / font hints derived from the color.
function annotateColors ($, $content, chapter) {
  const baseLightness = siteBaseLightness(chapter)
  $content.find('[style *= color]').each((ii, elem) => {
    const $elem = $(elem)
    const style = $elem.attr('style')
    let ns = `xenforo-color: ${style};`
    const colorMatch = style.match(/color: #(\S\S)(\S\S)(\S\S)/)
    let opacity = 1
    if (colorMatch) {
      const r = Number('0x' + colorMatch[1])
      const g = Number('0x' + colorMatch[2])
      const b = Number('0x' + colorMatch[3])
      const relative = color.lightness(color.rgb(r, g, b)) / baseLightness
      // no baseline (unrecognised forum) or an unparsable color gives
      // Infinity/NaN here; leave the opacity alone in that case
      if (Number.isFinite(relative)) {
        opacity = baseLightness < 0.5 ? 1 - relative : relative
        if (opacity < 0.25) opacity = 0.25
        // terminated with `;` so later declarations are not glued onto it
        ns += 'opacity: ' + opacity + ';'
      }
    } else if (style === 'color: transparent') {
      opacity = 0.25
      ns += 'text-decoration: line-through; font-style: oblique; opacity: 0.25;'
    }
    if (opacity > 1) {
      ns += 'font-weight: bolder;'
    }
    if (colorStyleHints.has(style)) ns += colorStyleHints.get(style)
    $elem.attr('style', ns)
  })
}

/**
 * Scraper for XenForo-style forums (see `knownSites` for the tested hosts).
 *
 * Relies on the `Site` base class for `this.link`, `this.raw` and
 * `this.warnings` (presumably initialised by its constructor — verify in
 * site.js).
 */
class Xenforo extends Site {
  /**
   * @param {string} siteUrlStr URL to test.
   * @returns {boolean} true when the URL path looks like a thread, a post, an
   *   `index.php?topic` link or a `goto/post?id` link.
   */
  static matches (siteUrlStr) {
    return /^[/](threads|posts)[/]|^[/]index[.]php[?]topic|^[/]goto[/]post[?]id/.test(url.parse(siteUrlStr).path)
  }

  /**
   * Records the publisher (hostname and display name) and, for thread URLs,
   * the thread slug as `this.name` (null otherwise). Pushes a warning when
   * the host is not one of `knownSites`.
   *
   * @param {string} siteUrlStr URL of the thread or post.
   */
  constructor (siteUrlStr) {
    super(siteUrlStr)
    const siteUrl = url.parse(siteUrlStr)
    const hostname = siteUrl.hostname
    const displayName = knownSites[hostname]
    if (!displayName) {
      this.warnings.push(`Has not yet been tested with ${hostname}, may not work.`)
    }
    this.publisher = hostname
    this.publisherName = displayName || hostname
    const path = siteUrl.pathname || siteUrl.path || ''
    const nameMatch = path.match(/^[/]threads[/]([^.]+)/)
    this.name = nameMatch ? nameMatch[1] : null
  }

  /**
   * Fills in `fic` from the thread's threadmarks page: title, tags (only when
   * none are set), one chapter per threadmark, created/modified dates, and
   * author plus description taken from the post at `this.link`.
   *
   * @param {Function} fetch called with a URL (and optionally a cache flag);
   *   must return a promise with `.spread` that yields `[meta, html]`.
   * @param {Object} fic fic record to populate.
   * @returns {Promise} resolves once `fic` has been updated.
   */
  getFicMetadata (fetch, fic) {
    fic.link = this.link
    fic.publisher = this.publisherName
    fic.includeTOC = true
    // computed once: threadmarkUrl() may push a warning on every call
    const threadmarks = this.threadmarkUrl()
    return fetch(threadmarks).spread((meta, html) => {
      const $ = cheerio.load(html)
      const base = $('base').attr('href') || threadmarks
      const tat = this.detagTitle(this.scrapeTitle($))
      fic.title = tat.title
      if (!fic.tags || fic.tags.length === 0) {
        fic.tags = tat.tags
      }
      let chapters = $('li.threadmarkItem')
      if (chapters.length === 0) chapters = $('li.primaryContent') // qq
      let leastRecent
      let mostRecent
      chapters.each((ii, chapter) => {
        const $chapter = $(chapter)
        const $link = $chapter.find('a')
        const name = $link.text().trim()
        const link = this.normalizeLink($link.attr('href'), base)
        const created = this.dateTime($chapter.find('.DateTime'))
        if (!leastRecent || created < leastRecent) leastRecent = created
        if (!mostRecent || created > mostRecent) mostRecent = created
        fic.chapters.addChapter({name, link, created})
      })
      fic.created = leastRecent
      fic.modified = mostRecent
      return this.getChapter(link => fetch(link, false), this.link).then((chapter) => {
        fic.author = chapter.author
        fic.authorUrl = chapter.authorUrl
        const $content = cheerio.load(chapter.content)
        fic.description = firstLine($content.text())
      })
    })
  }

  /**
   * Builds fic metadata by scraping the post at `this.link`, treating thread
   * and post links found inside it as the chapter list. Every field is only
   * set when still empty, because this may run after getFicMetadata.
   *
   * @param {Function} fetch same contract as for getFicMetadata.
   * @param {Object} fic fic record to populate.
   * @returns {Promise} resolves once `fic` has been updated.
   */
  scrapeFicMetadata (fetch, fic) {
    if (!fic.link) fic.link = this.link
    if (!fic.publisher) fic.publisher = this.publisherName
    if (fic.includeTOC == null) fic.includeTOC = true
    return this.getChapter(fetch, this.link).then(chapter => {
      const $ = cheerio.load(chapter.raw)
      if (!fic.link) fic.link = this.normalizeLink(chapter.finalUrl)
      if (!fic.created) fic.created = this.dateTime($('.DateTime'))
      if (!fic.title) fic.title = chapter.ficTitle
      // an empty tag list counts as "no tags", as it does in getFicMetadata
      if (!fic.tags || fic.tags.length === 0) fic.tags = chapter.ficTags
      if (!fic.author) fic.author = chapter.author
      if (!fic.authorUrl) fic.authorUrl = chapter.authorUrl
      const $content = cheerio.load(chapter.content)
      if (!fic.description) fic.description = firstLine($content.text())
      const links = $content('a')
      const indexLink = this.normalizeLink(chapter.finalUrl)
      if (links.length === 0) {
        // no links at all: the post itself is the only chapter
        fic.addChapter({name: chapter.title || fic.title, link: indexLink, created: chapter.created})
      } else {
        // the post acts as a hand-written table of contents
        fic.addChapter({name: 'Table of Contents', link: indexLink, created: chapter.created})
        fic.includeTOC = false
      }
      links.each((_, link) => {
        const $link = $content(link)
        const rawHref = $link.attr('href')
        if (!rawHref) return // anchors without a target cannot be chapters
        const href = this.normalizeLink(rawHref, chapter.base)
        let name = $link.text().trim()
        if (name === '↑') return // don't add links to quoted text as chapters
        // if the name is itself a link, try to find a label just before it
        if (/^https?:[/][/]/.test(name) || / \| Page \d+$/.test(name)) {
          name = nameFromPrecedingText($content, link, name)
        }
        if (/^[/](?:threads|posts|s|art)[/]|^[/]index.php[?]topic/.test(url.parse(href).path)) {
          fic.addChapter({name, link: href})
        }
      })
      if (fic.chapters.length === 0) return
      const lastChapter = fic.chapters.slice(-1)[0]
      if (!fic.modified && lastChapter && lastChapter.created) {
        fic.modified = lastChapter.created
      }
      if (fic.modified || !lastChapter) return
      // the last chapter's date is unknown, so fetch it to learn when the
      // fic was last updated
      return fic.getChapter(link => fetch(link, false), lastChapter.fetchFrom || lastChapter.link).then((chapter) => {
        fic.modified = chapter.created
      })
    })
  }

  /**
   * Fetches one post and returns its cleaned-up content and metadata.
   *
   * The post is selected by the `#post-…` fragment of `chapter` (or of the
   * final URL after redirects); without a fragment the first post on the page
   * is used. When a cached page has neither the post nor an error panel, the
   * fetch is retried once with `noCache` set.
   *
   * @param {Function} fetch called as `fetch(chapter, noCache)`; must return
   *   a promise with `.spread` that yields `[meta, html]`.
   * @param {string} chapter URL of the post.
   * @param {boolean} [noCache] passed through to `fetch`.
   * @returns {Promise<Object>} `{ficTitle, ficTags, chapterLink, finalUrl,
   *   base, author, authorUrl, created, raw, content}`; `authorUrl` is
   *   undefined when the post has no author link.
   * @throws {Error} (as a rejection) when the page is a 404, shows an error
   *   panel, or does not contain the post.
   */
  getChapter (fetch, chapter, noCache) {
    return fetch(chapter, noCache).catch(err => {
      if (err.meta && err.meta.status === 404) {
        throw new Error('No chapter found at ' + chapter)
      }
      throw err
    }).spread((meta, html) => {
      const chapterHash = url.parse(chapter).hash
      const parsed = url.parse(meta.finalUrl)
      // an explicit #post fragment on the requested URL wins over whatever
      // the final (possibly redirected) URL carries
      const id = /^#post/.test(chapterHash)
        ? chapterHash
        : (parsed.hash || chapterHash || '')
      let finalUrl = meta.finalUrl
      if (id) {
        parsed.hash = id
        finalUrl = url.format(parsed)
      }
      const $ = cheerio.load(html)
      const $message = id.length > 1
        ? $('li.message#' + id.slice(1).replace(/[)]$/, ''))
        : $('li.message').first()
      const $content = $message.find('article')
      if ($content.length === 0) {
        const $error = $('div.errorPanel')
        if ($error.length !== 0) {
          throw new Error('Error fetching ' + chapter + ': ' + $error.text().trim())
        }
        if (noCache || !meta.fromCache) {
          throw new Error('No chapter found at ' + chapter)
        }
        return this.getChapter(fetch, chapter, true)
      }
      const tat = this.detagTitle(this.scrapeTitle($))
      // content-warning accordion, at least on qq
      const $contentWarning = $content.find('dl.adv_accordion')
      if ($contentWarning.length) {
        const label = $contentWarning.find('dt').html()
        const value = $contentWarning.find('dd').html()
        $contentWarning.replaceWith(`<div><h3>${label}</h3>${value}</div>`)
      }
      annotateQuotes($, $content)
      annotateSpoilers($, $content)
      const base = $('base').attr('href') || finalUrl
      const $author = $message.find('a.username').first()
      const authorHref = $author.attr('href')
      // a post without an author link yields no author URL instead of
      // making url.resolve throw
      const authorUrl = authorHref ? url.resolve(base, authorHref) : undefined
      const authorName = $author.text()
      const messageDate = this.dateTime($message.find('a.datePermalink .DateTime'))
      annotateColors($, $content, chapter)
      $content.find('div.messageTextEndMarker').remove()
      return {
        ficTitle: tat.title,
        ficTags: tat.tags,
        chapterLink: chapter,
        finalUrl: finalUrl,
        base: base,
        author: authorName,
        authorUrl: authorUrl,
        created: messageDate,
        raw: html,
        content: $content.html().trim()
          // content is blockquoted, for some reason
          .replace(/^\s*<blockquote[^>]*>([\s\S]+)<[/]blockquote>\s*$/, '$1')
          // drop the centered banner paragraph (an SV Halloween event, per
          // the original note)
          .replace(/^<p style="padding: 5px 0px; font-weight: bold; font-style: oblique; text-align: center; font-size: 12pt">.*?<[/]p>/g, '')
          // trim the lines
          .replace(/^\s+|\s+$/mg, '')
      }
    })
  }

  /**
   * @returns {Object} the base class's sanitize-html configuration with `img`
   *   tags routed through cleanImages.
   */
  sanitizeHtmlConfig () {
    const config = super.sanitizeHtmlConfig()
    config.transformTags.img = (tagName, attribs) => this.cleanImages(tagName, attribs)
    return config
  }

  /**
   * Tag transformer for `img` elements.
   *
   * Smilie sprites become a `span` holding an emoji (or the image's `alt`
   * text for unmapped smilies). Images with no `src`, or whose `src` starts
   * with `http`, become an empty `span`. Anything else is passed through.
   *
   * @param {string} tagName
   * @param {Object} attribs attributes of the tag.
   * @returns {Object} `{tagName, text}` or `{tagName, attribs}`.
   */
  cleanImages (tagName, attribs) {
    if (attribs.class) {
      const classes = attribs.class.trim().split(/\s+/)
      if (classes.some(this.andMatches(/^mceSmilieSprite$/))) {
        const smilie = classes.filter(this.andMatches(/^mceSmilie\d+$/))[0]
        const text = smilieEmoji.has(smilie) ? smilieEmoji.get(smilie) : attribs.alt
        return {tagName: 'span', text: text}
      }
    }
    if (!attribs.src || /^http/.test(attribs.src)) {
      return {tagName: 'span', text: ''}
    }
    return {tagName: tagName, attribs: attribs}
  }

  /**
   * @param {RegExp} pattern
   * @returns {Function} predicate telling whether its argument matches
   *   `pattern`.
   */
  andMatches (pattern) {
    return (item) => pattern.test(item)
  }

  /**
   * @returns {string} URL of the threadmarks page for the thread in
   *   `this.raw`. When `this.raw` is not a thread URL a warning is recorded
   *   and the URL is returned re-formatted but otherwise unchanged.
   */
  threadmarkUrl () {
    const threadUrl = url.parse(this.raw)
    const path = threadUrl.pathname || threadUrl.path
    const threadMatch = /^([/]threads[/][^/]+\.\d+)(?:[/].*)?$/
    if (threadMatch.test(path)) {
      threadUrl.hash = ''
      threadUrl.pathname = path.replace(threadMatch, '$1/threadmarks')
    } else {
      // one string per warning; the parsed URL object is not pushed as a
      // separate entry
      this.warnings.push(`This does not appear to be a thread Url, can't find threadmarks: ${this.raw}`)
    }
    return url.format(threadUrl)
  }

  /**
   * Alias of dateTime, kept for callers that use this name.
   *
   * @param {Object} elem cheerio selection of a date element.
   * @returns {Date|undefined}
   */
  scrapeDateTime (elem) {
    return this.dateTime(elem)
  }

  /**
   * Reads the thread title from the page, without any "Threadmarks for: "
   * prefix.
   *
   * @param {Function} $ cheerio document.
   * @returns {string} the title; empty when the page has neither source.
   */
  scrapeTitle ($) {
    const stripPrefix = title => title.replace(/Threadmarks for: /i, '')
    // sv, sb
    const ogTitle = $('meta[property="og:title"]').attr('content')
    if (typeof ogTitle === 'string') return stripPrefix(ogTitle)
    // qq
    return stripPrefix($('div.titleBar h1').text().replace(/^\[\w+\] /, ''))
  }

  /**
   * Splits the first bracketed or parenthesised group out of a title and
   * turns it into tags (separated by `/` or `,`).
   *
   * @param {string} title raw thread title; anything that is not a string is
   *   treated as an empty title.
   * @returns {{title: string, tags: string[]}}
   */
  detagTitle (title) {
    if (typeof title !== 'string') return {title: '', tags: []}
    const tagExp = /[\[(](.*?)[\])]/
    const tagMatch = title.match(tagExp)
    let tags = []
    if (tagMatch) {
      title = title.replace(tagExp, '').trim()
      // drop empty entries such as the ones produced by "[]" or "[a, ]"
      tags = tagMatch[1].split(/[/,]/).map(tag => tag.trim()).filter(tag => tag !== '')
    }
    return {title, tags}
  }

  /**
   * Canonicalises a link: upgrades http to https (except `index.php` links),
   * resolves it against `base` when given, and rewrites thread-anchored and
   * `goto/post?id=` post links into the `/posts/<id>` form.
   *
   * @param {string} href link to normalise.
   * @param {string} [base] URL to resolve relative links against.
   * @returns {string}
   */
  normalizeLink (href, base) {
    // force ssl
    if (!/index.php/.test(href)) href = href.replace(/^http:/, 'https:')
    // resolve base url
    if (base) href = url.resolve(base, href)
    // normalize post urls
    return href
      .replace(/[/]threads[/][^/]+[/](?:page-\d+)?#post-(\d+)$/, '/posts/$1')
      .replace(/([/]posts[/][^/]+)[/]$/, '$1')
      .replace(/[/]goto[/]post[?]id=(\d+).*?$/, '/posts/$1')
  }

  /**
   * Parses the timestamp carried by a date element, either from its
   * `data-datestring` / `data-timestring` attributes or from a `title` of the
   * form "<date> at <time>".
   *
   * @param {Object} elem cheerio selection of a date element.
   * @returns {Date|undefined} undefined when neither attribute is present.
   */
  dateTime (elem) {
    const dateString = elem.attr('data-datestring')
    if (dateString) {
      return new Date(dateString + ' ' + elem.attr('data-timestring'))
    }
    const title = elem.attr('title')
    if (title) {
      return new Date(title.replace(/ at/, ''))
    }
  }
}
module.exports = Xenforo