// wtf_wikipedia: parse wikiscript into json
// (listing metadata: JavaScript, 538 lines, 497 loc, 15.1 kB)
const sectionMap = require('./_sectionMap')
const toJSON = require('./toJson')
const isDisambig = require('./isDisambig')
const setDefaults = require('../_lib/setDefaults')
const Image = require('../image/Image')
const redirects = require('./redirects')
const preProcess = require('./preProcess')
// parsers the Document constructor runs over the wiki text:
// - `section` is called with the document and its result becomes the section list
// - `categories` is called with the wiki text and yields a
//   [categories, remainingWiki] pair
const parse = {
section: require('../02-section'),
categories: require('./categories'),
}
// baseline rendering options; text() and json() pass these to setDefaults()
// together with whatever options the caller supplied
const defaults = {
tables: true,
lists: true,
paragraphs: true,
}
/**
 * Resolve an optional numeric clue against a list of results.
 *
 * Without a numeric clue the list itself is returned. With one, the result is
 * a one-element array holding the item at that index, or an empty array when
 * nothing lives at that index (so callers never receive `[undefined]`).
 *
 * @private
 * @param {Array} arr The full list of results
 * @param {*} [clue] The index of the wanted item
 * @returns {Array} The full list, a one-element list, or an empty list
 */
const selectByClue = function (arr, clue) {
  if (typeof clue !== 'number') {
    return arr
  }
  const item = arr[clue]
  return item === undefined ? [] : [item]
}

/**
 * The document class is the main entry point of wtf_wikipedia.
 * This class represents one wikipedia article.
 * From here you can reach its sections, paragraphs, infoboxes, and so on.
 *
 * @class
 */
class Document {
  /**
   * The constructor for the document class.
   * It stores the given metadata on hidden (non-enumerable) `_`-prefixed
   * properties and then parses the wiki text:
   * - a redirect page only gets its target and categories extracted
   * - any other page is pre-processed, has its categories pulled out,
   *   and is then split into sections
   *
   * @param {string} [wiki] The wiki text
   * @param {object} [options] The options for the parser
   * @param {number} [options.pageID] The page id (`options.id` is accepted too)
   * @param {string|number} [options.namespace] The namespace (`options.ns` is accepted too)
   * @param {string} [options.lang] The language (`options.language` is accepted too)
   * @param {string} [options.domain] The domain of the wiki
   * @param {string} [options.title] The title of the page
   * @param {string} [options.wikidata] The wikidata id of the page
   */
  constructor(wiki, options) {
    options = options || {}
    // a namespace of 0 (or '') is a real value, so only fall through on null/undefined
    let namespace = options.namespace
    if (namespace === undefined || namespace === null) {
      namespace = options.ns
    }
    if (namespace === undefined) {
      namespace = null
    }
    let props = {
      pageID: options.pageID || options.id || null,
      namespace: namespace,
      lang: options.lang || options.language || null,
      domain: options.domain || null,
      title: options.title || null,
      type: 'page',
      redirectTo: null,
      wikidata: options.wikidata || null,
      wiki: wiki || '',
      categories: [],
      sections: [],
      coordinates: [],
    }
    // keep internal state out of enumeration, but still writable
    Object.keys(props).forEach((k) => {
      Object.defineProperty(this, '_' + k, {
        enumerable: false,
        writable: true,
        value: props[k],
      })
    })
    const isRedirect = redirects.isRedirect(this._wiki) === true
    if (isRedirect) {
      //page is just a redirect - remember where it points
      this._type = 'redirect'
      this._redirectTo = redirects.parse(this._wiki)
    } else {
      //give ourselves a little head-start
      this._wiki = preProcess(this._wiki)
    }
    //pull-out [[category:whatevers]]
    const [categories, newWiki] = parse.categories(this._wiki)
    this._categories = categories
    this._wiki = newWiki
    //redirects have no sections to parse
    if (isRedirect) {
      return
    }
    //parse all the headings, and their texts/sentences
    this._sections = parse.section(this)
  }

  /**
   * Getter and setter for the title.
   * If a string is given, this acts as a setter: it stores and returns that value.
   * If no string is given, the stored title is returned when there is one.
   * Otherwise the title is guessed from whatever `bold()` returns for the
   * first sentence of the page.
   *
   * @param {string} [str] The title that will be set
   * @returns {null|string} The title found or given
   */
  title(str) {
    //use like a setter
    if (str !== undefined) {
      this._title = str
      return str
    }
    //if we have it already
    if (this._title) {
      return this._title
    }
    //guess the title of this page from first sentence bolding
    let guess = null
    let sen = this.sentence()
    if (sen) {
      guess = sen.bold()
    }
    return guess
  }

  /**
   * If a pageID is given, it is stored first.
   * Returns the stored pageID, or null when there is none.
   *
   * @param {number} [id] The pageID that will be set
   * @returns {number|null} The given or found pageID
   */
  pageID(id) {
    if (id !== undefined) {
      this._pageID = id
    }
    return this._pageID || null
  }

  /**
   * If a WikidataID is given, it is stored first.
   * Returns the stored WikidataID, or null when there is none.
   *
   * @param {string} [id] The WikidataID that will be set
   * @returns {string|null} The given or found WikidataID
   */
  wikidata(id) {
    if (id !== undefined) {
      this._wikidata = id
    }
    return this._wikidata || null
  }

  /**
   * If a domain is given, it is stored first.
   * Returns the stored domain, or null when there is none.
   *
   * @param {string} [str] The domain that will be set
   * @returns {string|null} The given or found domain
   */
  domain(str) {
    if (str !== undefined) {
      this._domain = str
    }
    return this._domain || null
  }

  /**
   * If a language is given, it is stored first.
   * Returns the stored language, or null when there is none.
   *
   * @param {string} [lang] The language that will be set
   * @returns {string|null} The given or found language
   */
  language(lang) {
    if (lang !== undefined) {
      this._lang = lang
    }
    return this._lang || null
  }

  /**
   * Gets the url of the page.
   * If the language or domain is not available we substitute 'en' and 'wikipedia.org'.
   * Spaces in the title become underscores and the title is URI-encoded, then
   * the url is built as `https://${lang}.${domain}/wiki/${title}`.
   *
   * @returns {string|null} The url of the page, or null when no title is known
   */
  url() {
    let title = this.title()
    if (!title) {
      return null
    }
    let lang = this.language() || 'en'
    let domain = this.domain() || 'wikipedia.org'
    //replace blank to underscore
    title = title.replace(/ /g, '_')
    title = encodeURIComponent(title)
    return `https://${lang}.${domain}/wiki/${title}`
  }

  /**
   * If a namespace is given, it is stored first.
   * Returns the stored namespace, or null when there is none.
   * Falsy-but-present values such as 0 are returned as they are.
   *
   * @param {string|number} [ns] The namespace that will be set
   * @returns {string|number|null} The given or found namespace
   */
  namespace(ns) {
    if (ns !== undefined) {
      this._namespace = ns
    }
    const stored = this._namespace
    if (stored === undefined || stored === null) {
      return null
    }
    return stored
  }

  /**
   * Returns if the page is a redirect
   *
   * @returns {boolean} Is the page a redirect
   */
  isRedirect() {
    return this._type === 'redirect'
  }

  /**
   * Returns what the redirect parser extracted for this page,
   * or null when the page is not a redirect
   *
   * @returns {null|object} The redirected page
   */
  redirectTo() {
    return this._redirectTo
  }

  /**
   * This function finds out if a page is a disambiguation page
   *
   * @returns {boolean} Whether the page is a disambiguation page
   */
  isDisambiguation() {
    return isDisambig(this)
  }

  /**
   * Returns the categories of the page.
   *
   * If the clue is a number, the result is a one-element array with the
   * category at that index (empty when the index does not exist).
   * Otherwise all categories are returned.
   *
   * @param {number} [clue] The index of the wanted category
   * @returns {string[]} The selected category (wrapped in an array) or all categories
   */
  categories(clue) {
    return selectByClue(this._categories || [], clue)
  }

  /**
   * Returns the sections of the document.
   *
   * If the clue is a string, it returns every section whose title matches it
   * (case-insensitive; the clue is trimmed first).
   * If the clue is a number, it returns a one-element array with the section
   * at that index (empty when the index does not exist).
   * Otherwise it returns all the sections.
   *
   * @param {number | string} [clue] A title of a section or an index of a wanted section
   * @returns {object[]} An array of sections
   */
  sections(clue) {
    let arr = this._sections || []
    arr.forEach((sec) => {
      // link-up parent and child
      sec._doc = this
    })
    //grab specific sections, by their title
    if (typeof clue === 'string') {
      let str = clue.toLowerCase().trim()
      return arr.filter((s) => s.title().toLowerCase() === str)
    }
    return selectByClue(arr, clue)
  }

  /**
   * Returns the paragraphs in the document, collected section by section.
   *
   * If the clue is a number, it returns a one-element array with the paragraph
   * at that index (empty when the index does not exist).
   * Otherwise it returns all paragraphs.
   *
   * @param {number} [clue] given index of a paragraph
   * @returns {object[]} the selected paragraph (wrapped in an array) or all paragraphs
   */
  paragraphs(clue) {
    let arr = this.sections().reduce((all, sec) => all.concat(sec.paragraphs()), [])
    return selectByClue(arr, clue)
  }

  /**
   * Returns the sentences in the document, collected section by section.
   *
   * If the clue is a number, it returns a one-element array with the sentence
   * at that index (empty when the index does not exist).
   * Otherwise it returns all sentences.
   *
   * @param {number} [clue] given index of a sentence
   * @returns {object[]} the selected sentence (wrapped in an array) or all sentences
   */
  sentences(clue) {
    let arr = this.sections().reduce((all, sec) => all.concat(sec.sentences()), [])
    return selectByClue(arr, clue)
  }

  /**
   * Collects the images of the whole page: the images of every section,
   * the image of each infobox (placed in front), and the images listed in
   * 'gallery' templates (appended at the end).
   *
   * If the clue is a number, it returns a one-element array with the image
   * at that index (empty when the index does not exist).
   *
   * @param {number} [clue] the index of the wanted image
   * @returns {Image[]} the selected image (wrapped in an array) or all images
   */
  images(clue) {
    let arr = sectionMap(this, 'images', null)
    //grab image from infobox, first
    this.infoboxes().forEach((info) => {
      let img = info.image()
      if (img) {
        arr.unshift(img) //put it at the top
      }
    })
    //look for 'gallery' templates, too
    this.templates().forEach((obj) => {
      if (obj.data.template !== 'gallery') {
        return
      }
      let gallery = obj.data.images || []
      gallery.forEach((img) => {
        if (!(img instanceof Image)) {
          // build the Image from a copy, so reading images never alters the template's data
          let data = Object.assign({}, img, {
            language: this.language(),
            domain: this.domain(),
          })
          img = new Image(data)
        }
        arr.push(img)
      })
    })
    return selectByClue(arr, clue)
  }

  /**
   * Collects the links of every section, via sectionMap.
   * The clue is forwarded to sectionMap unchanged.
   *
   * @param {number} [clue] the index of the wanted link
   * @returns {object[]} the result of sectionMap for 'links'
   */
  links(clue) {
    return sectionMap(this, 'links', clue)
  }

  /**
   * Collects the inter wiki links of every section, via sectionMap.
   * The clue is forwarded to sectionMap unchanged.
   *
   * @param {number} [clue] the index of the wanted inter wiki link
   * @returns {object[]} the result of sectionMap for 'interwiki'
   */
  interwiki(clue) {
    return sectionMap(this, 'interwiki', clue)
  }

  /**
   * Collects the lists of every section, via sectionMap.
   * The clue is forwarded to sectionMap unchanged.
   *
   * @param {number} [clue] The index of the wanted list
   * @returns {object[]} the result of sectionMap for 'lists'
   */
  lists(clue) {
    return sectionMap(this, 'lists', clue)
  }

  /**
   * Collects the tables of every section, via sectionMap.
   * The clue is forwarded to sectionMap unchanged.
   *
   * @param {number} [clue] The index of the wanted table
   * @returns {object[]} the result of sectionMap for 'tables'
   */
  tables(clue) {
    return sectionMap(this, 'tables', clue)
  }

  /**
   * Collects the templates of every section, via sectionMap.
   * The clue is forwarded to sectionMap unchanged.
   *
   * @param {number} [clue] The index of the wanted template
   * @returns {object[]} the result of sectionMap for 'templates'
   */
  templates(clue) {
    return sectionMap(this, 'templates', clue)
  }

  /**
   * Collects the references of every section, via sectionMap.
   * The clue is forwarded to sectionMap unchanged.
   *
   * @param {number} [clue] The index of the wanted reference
   * @returns {object[]} the result of sectionMap for 'references'
   */
  references(clue) {
    return sectionMap(this, 'references', clue)
  }

  /**
   * Another name for references()
   *
   * @param {number} [clue] The index of the wanted reference
   * @returns {object[]} whatever references() returns for that clue
   */
  citations(clue) {
    return this.references(clue)
  }

  /**
   * Collects the coordinates of every section, via sectionMap.
   * The clue is forwarded to sectionMap unchanged.
   *
   * @param {number} [clue] the index of the wanted coordinate
   * @returns {object[]} the result of sectionMap for 'coordinates'
   */
  coordinates(clue) {
    return sectionMap(this, 'coordinates', clue)
  }

  /**
   * Collects the infoboxes of every section, via sectionMap, and sorts the
   * result so that the infobox with the most data keys comes first.
   * The clue is forwarded to sectionMap, so any selection happens before sorting.
   *
   * @param {number} [clue] the index of the infobox you want to select
   * @returns {object[]} the infoboxes, biggest first
   */
  infoboxes(clue) {
    let arr = sectionMap(this, 'infoboxes', clue)
    let size = (box) => Object.keys(box.data).length
    //biggest-first; returning 0 for equal sizes keeps the comparator consistent
    return arr.sort((a, b) => size(b) - size(a))
  }

  /**
   * return a plain text version of the wiki article:
   * the text of every section, joined by blank lines.
   * A redirect page has no text.
   *
   * @param {object} [options] the options for the parser
   * @returns {string} the plain text version of the article
   */
  text(options) {
    options = setDefaults(options, defaults)
    //nah, skip these.
    if (this.isRedirect() === true) {
      return ''
    }
    let arr = this.sections().map((sec) => sec.text(options))
    return arr.join('\n\n')
  }

  /**
   * return a json version of the Document class
   *
   * @param {object} [options] options for the rendering
   * @returns {object} this document as json
   */
  json(options) {
    options = setDefaults(options, defaults)
    return toJSON(this, options)
  }

  /**
   * return the wiki markup held by this document, as left by the constructor
   * (pre-processing and category extraction have already been applied to it)
   *
   * @returns {string} markup text
   */
  wikitext() {
    return this._wiki || ''
  }

  /**
   * prints the title of every section, indented by its depth
   *
   * @returns {Document} the document itself
   */
  debug() {
    console.log('\n')
    this.sections().forEach((sec) => {
      let indent = ' - '
      for (let i = 0; i < sec.depth(); i += 1) {
        indent = ' -' + indent
      }
      console.log(indent + (sec.title() || '(Intro)'))
    })
    return this
  }
}
// singular aliases: every plural accessor listed here gets a singular twin
// on the prototype, which returns the first result of the plural call
// (or null when that call produced nothing usable)
const singular = {
  categories: 'category',
  sections: 'section',
  paragraphs: 'paragraph',
  sentences: 'sentence',
  images: 'image',
  links: 'link',
  // interwiki
  lists: 'list',
  tables: 'table',
  templates: 'template',
  references: 'reference',
  citations: 'citation',
  coordinates: 'coordinate',
  infoboxes: 'infobox',
}
for (const plural of Object.keys(singular)) {
  Document.prototype[singular[plural]] = function (clue) {
    const found = this[plural](clue)
    return found[0] || null
  }
}
// alternate method names: each key is installed on the prototype as a
// second name for the method named by its value
const methodAliases = {
  lang: 'language',
  ns: 'namespace',
  plaintext: 'text',
  isDisambig: 'isDisambiguation',
  citations: 'references',
  redirectsTo: 'redirectTo',
  redirect: 'redirectTo',
  redirects: 'redirectTo',
}
Object.keys(methodAliases).forEach((alias) => {
  Document.prototype[alias] = Document.prototype[methodAliases[alias]]
})
module.exports = Document