UNPKG

mwn

Version:

JavaScript & TypeScript MediaWiki bot framework for Node.js

459 lines 16.4 kB
"use strict";
/**
 * Class for some basic wikitext parsing, involving
 * links, files, categories, templates and simple tables
 * and sections.
 *
 * For more advanced and sophisticated wikitext parsing, use
 * mwparserfromhell <https://github.com/earwig/mwparserfromhell>
 * implemented in python (which you can use within node.js using
 * the child_process interface). However, mwparserfromhell doesn't
 * recognize localised namespaces and wiki-specific configs.
 *
 * This class is for methods for parsing wikitext, for the
 * static methods for creating wikitext, see static_utils.js.
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.Unbinder = exports.Parameter = exports.Template = void 0;
exports.parseTemplates = parseTemplates;
exports.parseTable = parseTable;
exports.parseSections = parseSections;
exports.default = default_1;
// Adapted from https://en.wikipedia.org/wiki/MediaWiki:Gadget-libExtraUtil.js
// by Evad37 (cc-by-sa-3.0/GFDL)
// TODO: expand from evad37/xfdcloser
/**
 * Represents the wikitext of template transclusion. Used by {@link parseTemplates}.
 */
class Template {
    /**
     * @param {String} wikitext Wikitext of a template transclusion,
     * starting with '{{' and ending with '}}'.
     */
    constructor(wikitext) {
        this.wikitext = wikitext;
        this.parameters = [];
    }
    /**
     * Append a parameter to this template.
     * @param {string|number} name - parameter name, or position for unnamed parameters
     * @param {string} val - parameter value
     * @param {string} wikitext - raw text of the parameter, without the leading pipe
     */
    addParam(name, val, wikitext) {
        this.parameters.push(new Parameter(name, val, wikitext));
    }
    /**
     * Find a parameter by name.
     * @param {string|number} paramName
     * @returns {Parameter|undefined} the first matching parameter, if any
     */
    getParam(paramName) {
        // Loose equality is intentional: it lets the number 1 match a
        // parameter explicitly named '1' (as in `|1=val`) and vice versa.
        return this.parameters.find((p) => {
            return p.name == paramName; // == is intentional
        });
    }
    /**
     * Get the value of a parameter.
     * @param {string|number} paramName
     * @returns {string|null} the value, or null if there is no such parameter
     */
    getValue(paramName) {
        const param = this.getParam(paramName);
        return param ? param.value : null;
    }
    /**
     * Set the template name: trimmed, with the first character upper-cased.
     * @param {string} name
     */
    setName(name) {
        name = name.trim();
        this.name = name[0] ? name[0].toUpperCase() + name.slice(1) : name;
    }
}
exports.Template = Template;
/**
 * Represents a template parameter
 */
class Parameter {
    /**
     * @param {string|number} name
     * @param {string} val
     * @param {string} wikitext - raw parameter text; stored with a leading '|'
     */
    constructor(name, val, wikitext) {
        this.name = name;
        this.value = val;
        this.wikitext = '|' + wikitext;
    }
}
exports.Parameter = Parameter;
// parseTemplates() and processTemplateText() are adapted from
// https://en.wikipedia.org/wiki/MediaWiki:Gadget-libExtraUtil.js written by Evad37
// which was in turn adapted from https://en.wikipedia.org/wiki/User:SD0001/parseAllTemplates.js
// written by me. (cc-by-sa/GFDL)
/**
 * Parse the template transclusions in a piece of wikitext.
 * See {@link MwnWikitext.parseTemplates}
 *
 * @param {string} wikitext
 * @param {Object} [config]
 * @param {boolean} [config.recursive] - also return templates nested inside other templates
 * @param {Function} [config.namePredicate] - called with the template name; templates for
 * which it returns a falsy value are skipped
 * @param {Function} [config.templatePredicate] - called with the parsed Template; templates
 * for which it returns a falsy value are skipped
 * @param {number} [config.count] - stop as soon as this many top-level templates are collected
 * @returns {Template[]}
 */
function parseTemplates(wikitext, config) {
    config = config || {
        recursive: false,
        namePredicate: null,
        templatePredicate: null,
        count: null,
    };
    const result = [];
    const n = wikitext.length;
    // number of unclosed braces
    let numUnclosed = 0;
    // are we inside a comment, or between nowiki tags, or in a {{{parameter}}}?
    let inComment = false;
    let inNowiki = false;
    let inParameter = false;
    let startIdx, endIdx;
    for (let i = 0; i < n; i++) {
        if (!inComment && !inNowiki && !inParameter) {
            if (wikitext[i] === '{' && wikitext[i + 1] === '{' && wikitext[i + 2] === '{' && wikitext[i + 3] !== '{') {
                inParameter = true;
                i += 2;
            }
            else if (wikitext[i] === '{' && wikitext[i + 1] === '{') {
                if (numUnclosed === 0) {
                    startIdx = i + 2;
                }
                numUnclosed += 2;
                i++;
            }
            else if (wikitext[i] === '}' && wikitext[i + 1] === '}') {
                if (numUnclosed === 2) {
                    endIdx = i;
                    const templateWikitext = wikitext.slice(startIdx, endIdx); // without braces
                    const processed = processTemplateText(templateWikitext, config.namePredicate, config.templatePredicate);
                    if (processed) {
                        result.push(processed);
                    }
                    if (config.count && result.length === config.count) {
                        return result;
                    }
                }
                // Only count a closing pair when something is open. A stray '}}'
                // outside any template must not push the counter below zero,
                // otherwise every template after it would go undetected.
                if (numUnclosed > 0) {
                    numUnclosed -= 2;
                }
                i++;
            }
            else if (wikitext[i] === '|' && numUnclosed > 2) {
                // swap out pipes in nested templates with \x01 character
                wikitext = strReplaceAt(wikitext, i, '\x01');
            }
            else if (/^<!--/.test(wikitext.slice(i, i + 4))) {
                inComment = true;
                i += 3;
            }
            else if (/^<nowiki ?>/.test(wikitext.slice(i, i + 9))) {
                inNowiki = true;
                i += 7;
            }
        }
        else {
            // we are in a comment or nowiki or {{{parameter}}}
            if (wikitext[i] === '|') {
                // swap out pipes with \x01 character
                wikitext = strReplaceAt(wikitext, i, '\x01');
            }
            else if (/^-->/.test(wikitext.slice(i, i + 3))) {
                inComment = false;
                i += 2;
            }
            else if (/^<\/nowiki ?>/.test(wikitext.slice(i, i + 10))) {
                inNowiki = false;
                i += 8;
            }
            else if (wikitext[i] === '}' && wikitext[i + 1] === '}' && wikitext[i + 2] === '}') {
                inParameter = false;
                i += 2;
            }
        }
    }
    if (config.recursive) {
        // Re-run the parser on the inside of every template that itself
        // contains a '{{...}}' pair, and append whatever is found.
        const subtemplates = result
            .map((template) => {
            return template.wikitext.slice(2, -2);
        })
            .filter((templateWikitext) => {
            return /\{\{.*\}\}/s.test(templateWikitext);
        })
            .map((templateWikitext) => {
            return parseTemplates(templateWikitext, config);
        });
        return result.concat(...subtemplates);
    }
    return result;
}
/**
 * Build a Template object from the inner text of a transclusion.
 *
 * @param {string} text - template wikitext without braces, with the pipes in
 * nested templates replaced by \x01
 * @param {Function} [namePredicate]
 * @param {Function} [templatePredicate]
 * @returns {Template|null} null if either predicate rejects the template
 */
function processTemplateText(text, namePredicate, templatePredicate) {
    // eslint-disable-next-line no-control-regex
    const template = new Template('{{' + text.replace(/\x01/g, '|') + '}}');
    // swap out pipe in links with \x01 control character
    // [[File: ]] can have multiple pipes, so might need multiple passes
    while (/(\[\[[^\]]*?)\|(.*?\]\])/.test(text)) {
        text = text.replace(/(\[\[[^\]]*?)\|(.*?\]\])/g, '$1\x01$2');
    }
    const [name, ...parameterChunks] = text.split('|').map((chunk) => {
        // change '\x01' control characters back to pipes
        // eslint-disable-next-line no-control-regex
        return chunk.replace(/\x01/g, '|');
    });
    template.setName(name);
    if (namePredicate && !namePredicate(template.name)) {
        return null;
    }
    let unnamedIdx = 1;
    parameterChunks.forEach(function (chunk) {
        const indexOfEqualTo = chunk.indexOf('=');
        const indexOfOpenBraces = chunk.indexOf('{{');
        const isWithoutEquals = !chunk.includes('=');
        // an '=' that only appears after '{{' belongs to a nested template
        const hasBracesBeforeEquals = chunk.includes('{{') && indexOfOpenBraces < indexOfEqualTo;
        const isUnnamedParam = isWithoutEquals || hasBracesBeforeEquals;
        let pName, pNum, pVal;
        if (isUnnamedParam) {
            // Get the next number not already used by either an unnamed parameter,
            // or by a named parameter like `|1=val`
            while (template.getParam(unnamedIdx)) {
                unnamedIdx++;
            }
            pNum = unnamedIdx;
            pVal = chunk.trim();
        }
        else {
            pName = chunk.slice(0, indexOfEqualTo).trim();
            pVal = chunk.slice(indexOfEqualTo + 1).trim();
        }
        template.addParam(pName || pNum, pVal, chunk);
    });
    if (templatePredicate && !templatePredicate(template)) {
        return null;
    }
    return template;
}
/**
 * Parse a simple wikitable into an array of row objects keyed by column header.
 * See {@link MwnWikitextStatic.parseTable}
 *
 * @param {string} text - table wikitext, starting with '{|' and ending with '|}'
 * @returns {Object[]} one object per row, mapping header text to cell text
 * @throws {Error} if the text does not start/end like a table, or if a row's
 * cell count differs from the number of header columns
 */
function parseTable(text) {
    text = text.trim();
    // Index of the first pipe that is not inside {{...}} or [[...]],
    // or -1 if there is none.
    const indexOfRawPipe = function (text) {
        // number of unclosed brackets
        let tlevel = 0;
        let llevel = 0;
        const n = text.length;
        for (let i = 0; i < n; i++) {
            if (text[i] === '{' && text[i + 1] === '{') {
                tlevel++;
                i++;
            }
            else if (text[i] === '[' && text[i + 1] === '[') {
                llevel++;
                i++;
            }
            else if (text[i] === '}' && text[i + 1] === '}') {
                tlevel--;
                i++;
            }
            else if (text[i] === ']' && text[i + 1] === ']') {
                llevel--;
                i++;
            }
            else if (text[i] === '|' && tlevel === 0 && llevel === 0) {
                return i;
            }
        }
        return -1;
    };
    if (!text.startsWith('{|') || !text.endsWith('|}')) {
        throw new Error('failed to parse table. Unexpected starting or ending');
    }
    // remove front matter and final matter
    // including table attributes and caption, and unnecessary |- at the top
    text = text.replace(/^\{\|.*$((\n\|-)?\n\|\+.*$)?(\n\|-)?/m, '').replace(/^\|\}$/m, '');
    const [header, ...rows] = text.split(/^\|-/m).map((r) => r.trim());
    // remove cell attributes, extracts data
    // (with no raw pipe the index is -1, so the whole cell is kept)
    const extractData = (cell) => {
        return cell.slice(indexOfRawPipe(cell) + 1).trim();
    };
    // XXX: handle the case where there is no header row
    let cols = header.split('\n').map((e) => e.replace(/^!/, ''));
    if (cols.length === 1) {
        // non-multilined table?
        cols = cols[0].split('!!');
    }
    cols = cols.map(extractData);
    const numcols = cols.length;
    const output = new Array(rows.length);
    rows.forEach((row, idx) => {
        let cells = row.split(/^\|/m).slice(1); // slice(1) removes the emptiness or the row styles if present
        if (cells.length === 1) {
            // non-multilined
            // cells are separated by ||
            cells = cells[0].replace(/^\|/, '').split('||');
        }
        cells = cells.map(extractData);
        if (cells.length !== numcols) {
            throw new Error(`failed to parse table: found ${cells.length} cells on row ${idx}, expected ${numcols}`);
        }
        output[idx] = {}; // output[idx] represents a row
        for (let i = 0; i < numcols; i++) {
            output[idx][cols[i]] = cells[i];
        }
    });
    return output;
}
/**
 * Split wikitext into sections at its `== heading ==` lines.
 * See {@link MwnWikitext.parseSections}
 *
 * @param {string} text
 * @returns {Object[]} objects with `level`, `header`, `index` and `content`.
 * The first entry is the text before the first heading; it has level 1 and a
 * null header. `content` runs from the section's own heading up to the next heading.
 */
function parseSections(text) {
    const rgx = /^(=+)(.*?)\1/gm;
    const sections = [
        {
            level: 1,
            header: null,
            index: 0,
        },
    ];
    let match;
    while ((match = rgx.exec(text))) {
        // eslint-disable-line no-cond-assign
        sections.push({
            level: match[1].length,
            header: match[2].trim(),
            index: match.index,
        });
    }
    const n = sections.length;
    for (let i = 0; i < n - 1; i++) {
        sections[i].content = text.slice(sections[i].index, sections[i + 1].index);
    }
    sections[n - 1].content = text.slice(sections[n - 1].index);
    return sections;
}
// Attribution: https://en.wikipedia.org/wiki/MediaWiki:Gadget-morebits.js (cc-by-sa 3.0/GFDL)
class Unbinder {
    /**
     * @param {string} text - the text to operate on
     */
    constructor(text) {
        this.text = text;
    }
    /**
     * Temporarily hide a part of the string while processing the rest of it.
     *
     * eg.  let u = new Unbinder("Hello world <!-- world --> world");
     *      u.unbind('<!--', '-->');
     *      u.text = u.text.replace(/world/g, 'earth');
     *      u.rebind(); // gives "Hello earth <!-- world --> earth"
     *
     * Text within the 'unbinded' part (in this case, the HTML comment) remains intact
     * unbind() can be called multiple times to unbind multiple parts of the string.
     *
     * Both arguments are inserted into a regular expression as-is, so regex
     * metacharacters in them must be escaped by the caller.
     *
     * @param {string} prefix
     * @param {string} postfix
     */
    unbind(prefix, postfix) {
        if (!this.unbinder) {
            this.unbinder = {
                counter: 0,
                history: {},
                prefix: '%UNIQ::' + Math.random() + '::',
                postfix: '::UNIQ%',
            };
        }
        const re = new RegExp(prefix + '([\\s\\S]*?)' + postfix, 'g');
        this.text = this.text.replace(re, (match) => {
            const current = this.unbinder.prefix + this.unbinder.counter + this.unbinder.postfix;
            this.unbinder.history[current] = match;
            ++this.unbinder.counter;
            return current;
        });
    }
    /**
     * Rebind after unbinding.
     * @returns {string} the text with every hidden part restored
     */
    rebind() {
        if (!this.unbinder) {
            // unbind() was never called: nothing to restore
            return this.text;
        }
        let content = this.text;
        // Restore the most recently hidden parts first: a later unbind() may
        // have captured placeholders created by an earlier one, and those only
        // become visible again once the later capture has been put back.
        const entries = Object.entries(this.unbinder.history).reverse();
        for (const [placeholder, original] of entries) {
            // A function replacer keeps '$&', '$$', '$1' etc. in the original
            // text literal instead of treating them as replacement patterns.
            content = content.replace(placeholder, () => original);
        }
        this.text = content;
        return this.text;
    }
    /** Get the updated text */
    getText() {
        return this.text;
    }
}
exports.Unbinder = Unbinder;
/**
 * Create the Wikitext class bound to a bot instance.
 *
 * @param {Object} bot - must provide `Title.newFromText()` (used by parseLinks)
 * and `parseWikitext()` (used by apiParse)
 * @returns {Function} the Wikitext class
 */
function default_1(bot) {
    class Wikitext extends Unbinder {
        /**
         * @param {string} wikitext
         * @throws {Error} if wikitext is not a string
         */
        constructor(wikitext) {
            if (typeof wikitext !== 'string') {
                throw new Error('non-string constructor for wikitext class');
            }
            super(wikitext);
        }
        /** @inheritDoc */
        parseLinks() {
            this.links = [];
            this.files = [];
            this.categories = [];
            const n = this.text.length;
            // files can have links in captions; use a stack to handle the nesting
            const stack = new Stack();
            for (let i = 0; i < n; i++) {
                if (this.text[i] === '[' && this.text[i + 1] === '[') {
                    stack.push({
                        startIdx: i,
                    });
                    i++;
                }
                else if (this.text[i] === ']' && this.text[i + 1] === ']' && stack.top()) {
                    stack.top().endIdx = i + 1;
                    processLink(this, stack.top().startIdx, stack.top().endIdx);
                    stack.pop();
                    i++; // necessary to handle cases like [[File:ImageName|thumb|A [[hill]]]]
                }
            }
        }
        /** @inheritDoc */
        parseTemplates(config) {
            return (this.templates = parseTemplates(this.text, config));
        }
        /** @inheritDoc */
        removeEntity(entity) {
            this.text = this.text.replace(entity.wikitext, '');
        }
        /** @inheritDoc */
        apiParse(options) {
            return bot.parseWikitext(this.text, options);
        }
        /** @inheritDoc */
        parseSections() {
            return (this.sections = parseSections(this.text));
        }
    }
    Wikitext.parseTemplates = parseTemplates;
    Wikitext.parseTable = parseTable;
    Wikitext.parseSections = parseSections;
    /**** Private members *****/
    /**
     * Classify the link at text[startIdx..endIdx] and record it on `self`
     * as a file, a category or a plain link.
     */
    function processLink(self, startIdx, endIdx) {
        const linktext = self.text.slice(startIdx, endIdx + 1);
        let [target, displaytext] = linktext.slice(2, -2).split('|');
        let noSortkey = false;
        if (!displaytext) {
            displaytext = target[0] === ':' ? target.slice(1) : target;
            noSortkey = true;
        }
        const title = bot.Title.newFromText(target);
        if (!title) {
            return;
        }
        // a leading colon makes file/category links behave as ordinary links
        if (target[0] !== ':') {
            if (title.namespace === 6) {
                const pipeIdx = linktext.indexOf('|');
                self.files.push({
                    wikitext: linktext,
                    target: title,
                    // no pipe means no props; slicing from index 0 here would
                    // wrongly yield '[[File:Name'
                    props: pipeIdx === -1 ? '' : linktext.slice(pipeIdx + 1, -2),
                });
                return;
            }
            else if (title.namespace === 14) {
                self.categories.push({
                    wikitext: linktext,
                    target: title,
                    sortkey: noSortkey ? '' : displaytext,
                });
                return;
            }
        }
        self.links.push({
            wikitext: linktext,
            target: title,
            displaytext: displaytext,
        });
    }
    return Wikitext;
}
/** Array with a helper for peeking at the last element. */
class Stack extends Array {
    /** @returns {*} the last element, or undefined if the stack is empty */
    top() {
        return this[this.length - 1];
    }
}
/**
 * Return a copy of `string` with the single character at `index` replaced by `char`.
 */
function strReplaceAt(string, index, char) {
    return string.slice(0, index) + char + string.slice(index + 1);
}
//# sourceMappingURL=wikitext.js.map