// wtf_wikipedia
// Version:
// parse wikiscript into json
// 95 lines (91 loc) • 2.62 kB
// JavaScript
const parse_interwiki = require('./interwiki')
//namespaces whose links are skipped by internal_links: category/image/file/media
//and their translations, with an optional leading ':' (case-insensitive)
const ignore_links =
/^:?(category|catégorie|kategorie|categoría|categoria|categorie|kategoria|تصنيف|image|file|fichier|datei|media):/i
//external link - "[protocol://address]" with an optional label
//captures: 1) the protocol  2) '://' plus 4-1500 characters, stopping at ']', '|' or a space
//3) the optional label, including the '|' or space that starts it
const external_link = /\[(https?|news|ftp|mailto|gopher|irc)(:\/\/[^\]| ]{4,1500})([| ].*?)?\]/g
//internal link - "[[...]]" holding up to 160 characters (captured lazily),
//plus any run of letters written directly after the closing brackets
const link_reg = /\[\[(.{0,160}?)\]\]([a-z]+)?/gi //allow dangling suffixes - "[[flanders]]s"
//find every external link ("[http://site.com label]") in str, and append
//a {type, site, text, raw} object for each one onto links.
//links is mutated, and handed back as the return value.
const external_links = function (links, str) {
  //replace() is only a convenient way to visit each match - the string it builds is thrown away
  str.replace(external_link, (whole, scheme, address, label) => {
    //the label group is optional, so it may be undefined
    const caption = label ? label : ''
    const found = {
      type: 'external',
      site: scheme + address,
      text: caption.trim(),
      raw: whole,
    }
    links.push(found)
    return caption
  })
  return links
}
//find every internal link in str - [[page]], [[page|text]], [[page#anchor]], [[page]]suffix -
//and append an object for each one onto links.
//each object has 'page' and 'raw', plus 'anchor', 'text' and 'type' when they apply.
//links into the ignore_links namespaces (category, file, image..) are skipped.
//links is mutated, and handed back as the return value.
const internal_links = function (links, str) {
  //replace() is only a convenient way to visit each match - the string it builds is thrown away
  str.replace(link_reg, function (raw, s, suffix) {
    let txt = null
    let link = s
    if (s.includes('|')) {
      //replacement link [[link|text]]
      //a 1-character minimum, so one-letter pages like [[C|the language]] are split too
      link = s.replace(/(.{1,100})\|.{0,200}/, '$1')
      txt = s.replace(/.{1,100}?\|/, '')
      //handle funky case of [[toronto|]] - nothing after the pipe, so the page is the text
      //(replace() always gives back a string, so the empty string is the signal here)
      if (txt === '') {
        link = link.replace(/\|$/, '')
        txt = link
      }
    }
    //kill off non-wikipedia namespaces
    if (ignore_links.test(link)) {
      return raw
    }
    let obj = {
      page: link,
      raw: raw,
    }
    //remove anchors from end [[toronto#history]]
    obj.page = obj.page.replace(/#(.*)/, (a, b) => {
      obj.anchor = b
      return ''
    })
    //grab any fr:Paris parts
    //NOTE(review): assumes parse_interwiki returns the link object, with 'wiki' set for interwiki links - confirm in ./interwiki
    obj = parse_interwiki(obj)
    if (obj.wiki) {
      obj.type = 'interwiki'
    }
    //only keep the text when it says something different than the page
    if (txt !== null && txt !== obj.page) {
      obj.text = txt
    }
    //finally, support dangling suffixes - "[[flanders]]s"
    if (suffix) {
      obj.text = obj.text || obj.page
      obj.text += suffix.trim()
    }
    //a page that doesn't begin with A-Z keeps its as-written form as the text
    //(the page itself is left untouched)
    if (obj.page && /^[A-Z]/.test(obj.page) === false && !obj.text) {
      obj.text = obj.page
    }
    links.push(obj)
    return raw
  })
  return links
}
//collect every link in the text - external ones first, then internal ones.
//returns the array of link objects, or undefined when the text has no links
const parse_links = function (str) {
  const found = internal_links(external_links([], str), str)
  return found.length > 0 ? found : undefined
}
//parse_links is the only thing this module exposes
module.exports = parse_links