// mwn
// Version:
// JavaScript & TypeScript MediaWiki bot framework for Node.js
// 459 lines • 16.4 kB
// JavaScript
"use strict";
/**
* Class for some basic wikitext parsing, involving
* links, files, categories, templates and simple tables
* and sections.
*
* For more advanced and sophisticated wikitext parsing, use
* mwparserfromhell <https://github.com/earwig/mwparserfromhell>
* implemented in python (which you can use within node.js using
* the child_process interface). However, mwparserfromhell doesn't
* recognize localised namespaces and wiki-specific configs.
*
* This class holds the methods for parsing wikitext. For the
* static methods for creating wikitext, see static_utils.js.
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.Unbinder = exports.Parameter = exports.Template = void 0;
exports.parseTemplates = parseTemplates;
exports.parseTable = parseTable;
exports.parseSections = parseSections;
exports.default = default_1;
// Adapted from https://en.wikipedia.org/wiki/MediaWiki:Gadget-libExtraUtil.js
// by Evad37 (cc-by-sa-3.0/GFDL)
// TODO: expand from evad37/xfdcloser
/**
* Represents the wikitext of template transclusion. Used by {@link parseTemplates}.
*/
class Template {
    /**
     * @param {String} wikitext Wikitext of a template transclusion,
     * starting with '{{' and ending with '}}'.
     */
    constructor(wikitext) {
        this.wikitext = wikitext;
        this.parameters = [];
    }
    /**
     * Append a parameter to this template.
     * @param {string|number} name parameter name, or position for unnamed parameters
     * @param {string} val parameter value
     * @param {string} wikitext raw wikitext of the parameter, without the leading pipe
     */
    addParam(name, val, wikitext) {
        const param = new Parameter(name, val, wikitext);
        this.parameters.push(param);
    }
    /**
     * Look up a parameter by name.
     * @param {string|number} paramName
     * @returns {Parameter|undefined} the first matching parameter, if any
     */
    getParam(paramName) {
        for (const candidate of this.parameters) {
            // Loose equality is intentional: numeric and string forms of a
            // positional name (1 and '1') must refer to the same parameter.
            if (candidate.name == paramName) {
                return candidate;
            }
        }
        return undefined;
    }
    /**
     * Get the value of a parameter.
     * @param {string|number} paramName
     * @returns {string|null} the value, or null when there is no such parameter
     */
    getValue(paramName) {
        const found = this.getParam(paramName);
        if (!found) {
            return null;
        }
        return found.value;
    }
    /**
     * Set the template name: surrounding whitespace is removed and the first
     * character is upper-cased.
     * @param {string} name
     */
    setName(name) {
        const trimmed = name.trim();
        const initial = trimmed[0];
        this.name = initial ? initial.toUpperCase() + trimmed.slice(1) : trimmed;
    }
}
exports.Template = Template;
/**
* Represents a template parameter
*/
class Parameter {
    /**
     * @param {string|number} name parameter name, or position for unnamed parameters
     * @param {string} val parameter value
     * @param {string} wikitext raw wikitext of the parameter without its leading
     * pipe; the pipe is prepended when stored
     */
    constructor(name, val, wikitext) {
        Object.assign(this, {
            name,
            value: val,
            wikitext: '|' + wikitext,
        });
    }
}
exports.Parameter = Parameter;
// parseTemplates() and processTemplateText() are adapted from
// https://en.wikipedia.org/wiki/MediaWiki:Gadget-libExtraUtil.js written by Evad37
// which was in turn adapted from https://en.wikipedia.org/wiki/User:SD0001/parseAllTemplates.js
// written by me. (cc-by-sa/GFDL)
/**
 * Extract the template transclusions found in a piece of wikitext.
 * See {@link MwnWikitext.parseTemplates}.
 *
 * The text is scanned once, character by character. Content inside HTML
 * comments, <nowiki> tags and {{{parameter}}} placeholders is not scanned for
 * templates. Pipes that must not split the outer template's parameters are
 * temporarily swapped for the \x01 control character; the swap is
 * length-preserving, so indices into the text stay valid.
 *
 * @param {string} wikitext
 * @param {Object} [config]
 * @param {boolean} [config.recursive=false] also parse the templates nested
 * inside the top-level templates that were found
 * @param {Function} [config.namePredicate] called with the template name;
 * templates for which it returns a falsy value are left out
 * @param {Function} [config.templatePredicate] called with the parsed Template;
 * templates for which it returns a falsy value are left out
 * @param {number} [config.count] stop scanning and return as soon as this many
 * templates have been collected
 * @returns {Template[]}
 */
function parseTemplates(wikitext, config) {
    config = config || {
        recursive: false,
        namePredicate: null,
        templatePredicate: null,
        count: null,
    };
    const result = [];
    const n = wikitext.length;
    // number of unclosed braces
    let numUnclosed = 0;
    // are we inside a comment, or between nowiki tags, or in a {{{parameter}}}?
    let inComment = false;
    let inNowiki = false;
    let inParameter = false;
    let startIdx;
    let endIdx;
    for (let i = 0; i < n; i++) {
        if (!inComment && !inNowiki && !inParameter) {
            if (wikitext[i] === '{' && wikitext[i + 1] === '{' && wikitext[i + 2] === '{' && wikitext[i + 3] !== '{') {
                // exactly three braces: start of a {{{parameter}}}
                inParameter = true;
                i += 2;
            }
            else if (wikitext[i] === '{' && wikitext[i + 1] === '{') {
                if (numUnclosed === 0) {
                    // start of a top-level template; remember where its content begins
                    startIdx = i + 2;
                }
                numUnclosed += 2;
                i++;
            }
            else if (wikitext[i] === '}' && wikitext[i + 1] === '}') {
                if (numUnclosed === 2) {
                    // this closes a top-level template
                    endIdx = i;
                    const templateWikitext = wikitext.slice(startIdx, endIdx); // without braces
                    const processed = processTemplateText(templateWikitext, config.namePredicate, config.templatePredicate);
                    if (processed) {
                        result.push(processed);
                    }
                    if (config.count && result.length === config.count) {
                        return result;
                    }
                }
                // A stray "}}" with no template open is just text. Never let the
                // counter go negative: otherwise the next "{{" would only bring it
                // back to 0 and every following template would be missed.
                if (numUnclosed > 0) {
                    numUnclosed -= 2;
                }
                i++;
            }
            else if (wikitext[i] === '|' && numUnclosed > 2) {
                // swap out pipes in nested templates with \x01 character
                wikitext = strReplaceAt(wikitext, i, '\x01');
            }
            else if (/^<!--/.test(wikitext.slice(i, i + 4))) {
                inComment = true;
                i += 3;
            }
            else if (/^<nowiki ?>/.test(wikitext.slice(i, i + 9))) {
                inNowiki = true;
                i += 7;
            }
        }
        else {
            // we are in a comment or nowiki or {{{parameter}}}
            if (wikitext[i] === '|') {
                // swap out pipes with \x01 character
                wikitext = strReplaceAt(wikitext, i, '\x01');
            }
            else if (/^-->/.test(wikitext.slice(i, i + 3))) {
                inComment = false;
                i += 2;
            }
            else if (/^<\/nowiki ?>/.test(wikitext.slice(i, i + 10))) {
                inNowiki = false;
                i += 8;
            }
            else if (wikitext[i] === '}' && wikitext[i + 1] === '}' && wikitext[i + 2] === '}') {
                inParameter = false;
                i += 2;
            }
        }
    }
    if (config.recursive) {
        // Re-run the parser on the inside of every template that itself
        // contains a template, and append whatever is found.
        const subtemplates = result
            .map((template) => {
                return template.wikitext.slice(2, -2);
            })
            .filter((templateWikitext) => {
                return /\{\{.*\}\}/s.test(templateWikitext);
            })
            .map((templateWikitext) => {
                return parseTemplates(templateWikitext, config);
            });
        return result.concat(...subtemplates);
    }
    return result;
}
/**
 * Build a Template object from the inside of a single template transclusion.
 *
 * @param {string} text - template wikitext without braces, with the pipes in
 * nested templates replaced by \x01
 * @param {Function} [namePredicate] - receives the template name; a falsy
 * return value rejects the template
 * @param {Function} [templatePredicate] - receives the finished Template; a
 * falsy return value rejects the template
 * @returns {Template} the parsed template, or null if a predicate rejected it
 */
function processTemplateText(text, namePredicate, templatePredicate) {
    // eslint-disable-next-line no-control-regex
    const template = new Template('{{' + text.replace(/\x01/g, '|') + '}}');
    // Hide the pipes that sit inside [[links]] behind the \x01 control character.
    // [[File: ]] links can carry several pipes and each pass hides one per link,
    // so keep going until a pass changes nothing.
    let masked = text;
    for (;;) {
        const next = masked.replace(/(\[\[[^\]]*?)\|(.*?\]\])/g, '$1\x01$2');
        if (next === masked) {
            break;
        }
        masked = next;
    }
    // Only the pipes separating the template's own parameters are left to split
    // on; afterwards every hidden pipe is put back.
    // eslint-disable-next-line no-control-regex
    const unmask = (chunk) => chunk.replace(/\x01/g, '|');
    const pieces = masked.split('|');
    const name = unmask(pieces[0]);
    const parameterChunks = pieces.slice(1).map((chunk) => unmask(chunk));
    template.setName(name);
    if (namePredicate && !namePredicate(template.name)) {
        return null;
    }
    let nextPosition = 1;
    for (const chunk of parameterChunks) {
        const equalsAt = chunk.indexOf('=');
        const bracesAt = chunk.indexOf('{{');
        // A chunk is positional when it has no '=' at all, or when its first '='
        // only appears after the start of a nested template.
        const isPositional = equalsAt === -1 || (bracesAt !== -1 && bracesAt < equalsAt);
        if (isPositional) {
            // Get the next number not already used by either an unnamed parameter,
            // or by a named parameter like `|1=val`
            while (template.getParam(nextPosition)) {
                nextPosition++;
            }
            template.addParam(nextPosition, chunk.trim(), chunk);
        }
        else {
            const paramName = chunk.slice(0, equalsAt).trim();
            const paramValue = chunk.slice(equalsAt + 1).trim();
            // an empty name (chunk like "=value") is stored as undefined
            template.addParam(paramName || undefined, paramValue, chunk);
        }
    }
    if (templatePredicate && !templatePredicate(template)) {
        return null;
    }
    return template;
}
/**
 * Parse a simple wikitext table into an array of row objects keyed by the
 * header cell texts. See {@link MwnWikitextStatic.parseTable}.
 *
 * @param {string} text wikitext of the table, from '{|' to '|}'
 * @returns {Object[]} one object per data row, mapping column header to cell text
 * @throws {Error} if the text does not start with '{|' and end with '|}', or if
 * a row does not have as many cells as the header row
 */
function parseTable(text) {
    const source = text.trim();
    if (!(source.startsWith('{|') && source.endsWith('|}'))) {
        throw new Error('failed to parse table. Unexpected starting or ending');
    }
    // Position of the first pipe that is outside any {{...}} or [[...]],
    // or -1 when there is none.
    const rawPipeIndex = (str) => {
        let braceDepth = 0;
        let bracketDepth = 0;
        let pos = 0;
        while (pos < str.length) {
            const pair = str.slice(pos, pos + 2);
            if (pair === '{{') {
                braceDepth++;
                pos += 2;
            }
            else if (pair === '[[') {
                bracketDepth++;
                pos += 2;
            }
            else if (pair === '}}') {
                braceDepth--;
                pos += 2;
            }
            else if (pair === ']]') {
                bracketDepth--;
                pos += 2;
            }
            else if (str[pos] === '|' && braceDepth === 0 && bracketDepth === 0) {
                return pos;
            }
            else {
                pos++;
            }
        }
        return -1;
    };
    // remove cell attributes (everything up to the first raw pipe), extracts data
    const cellData = (cell) => cell.slice(rawPipeIndex(cell) + 1).trim();
    // remove front matter and final matter
    // including table attributes and caption, and unnecessary |- at the top
    const body = source
        .replace(/^\{\|.*$((\n\|-)?\n\|\+.*$)?(\n\|-)?/m, '')
        .replace(/^\|\}$/m, '');
    const [header, ...rows] = body.split(/^\|-/m).map((part) => part.trim());
    // XXX: handle the case where there is no header row
    let headerCells = header.split('\n').map((line) => line.replace(/^!/, ''));
    if (headerCells.length === 1) {
        // header written on a single line: cells are separated by !!
        headerCells = headerCells[0].split('!!');
    }
    const cols = headerCells.map((cell) => cellData(cell));
    const numcols = cols.length;
    return rows.map((row, idx) => {
        // slice(1) removes the emptiness or the row styles if present
        let rawCells = row.split(/^\|/m).slice(1);
        if (rawCells.length === 1) {
            // row written on a single line: cells are separated by ||
            rawCells = rawCells[0].replace(/^\|/, '').split('||');
        }
        const cells = rawCells.map((cell) => cellData(cell));
        if (cells.length !== numcols) {
            throw new Error(`failed to parse table: found ${cells.length} cells on row ${idx}, expected ${numcols}`);
        }
        const record = {};
        cols.forEach((col, i) => {
            record[col] = cells[i];
        });
        return record;
    });
}
/**
 * Split wikitext into sections at its headings.
 * See {@link MwnWikitext.parseSections}.
 *
 * The first entry always describes the text before the first heading (level 1,
 * header null, index 0). Each further entry corresponds to a line beginning
 * with one or more '=' signs that is closed by the same number of '=' signs.
 *
 * @param {string} text
 * @returns {Array<{level: number, header: (string|null), index: number, content: string}>}
 * the sections in order of appearance; `content` runs from the section's own
 * heading up to the start of the next heading
 */
function parseSections(text) {
    const headingRgx = /^(=+)(.*?)\1/gm;
    const sections = [{ level: 1, header: null, index: 0 }];
    for (let found = headingRgx.exec(text); found !== null; found = headingRgx.exec(text)) {
        const [, equalSigns, title] = found;
        sections.push({
            level: equalSigns.length,
            header: title.trim(),
            index: found.index,
        });
    }
    sections.forEach((section, position) => {
        const following = sections[position + 1];
        // the last section extends to the end of the text
        section.content = following
            ? text.slice(section.index, following.index)
            : text.slice(section.index);
    });
    return sections;
}
// Attribution: https://en.wikipedia.org/wiki/MediaWiki:Gadget-morebits.js (cc-by-sa 3.0/GFDL)
class Unbinder {
constructor(text) {
this.text = text;
}
/**
* Temporarily hide a part of the string while processing the rest of it.
*
* eg. let u = new bot.Wikitext("Hello world <!-- world --> world");
* u.unbind('<!--','-->');
* u.content = u.content.replace(/world/g, 'earth');
* u.rebind(); // gives "Hello earth <!-- world --> earth"
*
* Text within the 'unbinded' part (in this case, the HTML comment) remains intact
* unbind() can be called multiple times to unbind multiple parts of the string.
*
* @param {string} prefix
* @param {string} postfix
*/
unbind(prefix, postfix) {
if (!this.unbinder) {
this.unbinder = {
counter: 0,
history: {},
prefix: '%UNIQ::' + Math.random() + '::',
postfix: '::UNIQ%',
};
}
let re = new RegExp(prefix + '([\\s\\S]*?)' + postfix, 'g');
this.text = this.text.replace(re, (match) => {
let current = this.unbinder.prefix + this.unbinder.counter + this.unbinder.postfix;
this.unbinder.history[current] = match;
++this.unbinder.counter;
return current;
});
}
/**
* Rebind after unbinding.
*/
rebind() {
let content = this.text;
for (let [current, replacement] of Object.entries(this.unbinder.history)) {
content = content.replace(current, replacement);
}
this.text = content;
return this.text;
}
/** Get the updated text */
getText() {
return this.text;
}
}
exports.Unbinder = Unbinder;
/**
 * Create the Wikitext class bound to a bot instance.
 *
 * @param bot the bot object; `bot.Title.newFromText` is used to resolve link
 * targets and `bot.parseWikitext` to parse text through the API
 * @returns the Wikitext class
 */
function default_1(bot) {
    class Wikitext extends Unbinder {
        /**
         * @param {string} wikitext
         * @throws {Error} if the argument is not a string
         */
        constructor(wikitext) {
            if (typeof wikitext !== 'string') {
                throw new Error('non-string constructor for wikitext class');
            }
            super(wikitext);
        }
        /**
         * Parse the [[...]] links in the text, filling `this.links`,
         * `this.files` and `this.categories`. Nothing is returned.
         */
        parseLinks() {
            this.links = [];
            this.files = [];
            this.categories = [];
            const n = this.text.length;
            // files can have links in captions; use a stack to handle the nesting
            const stack = new Stack();
            for (let i = 0; i < n; i++) {
                if (this.text[i] === '[' && this.text[i + 1] === '[') {
                    stack.push({
                        startIdx: i,
                    });
                    i++;
                }
                else if (this.text[i] === ']' && this.text[i + 1] === ']' && stack.top()) {
                    stack.top().endIdx = i + 1;
                    processLink(this, stack.top().startIdx, stack.top().endIdx);
                    stack.pop();
                    i++; // necessary to handle cases like [[File:ImageName|thumb|A [[hill]]]]
                }
            }
        }
        /**
         * Parse the templates in the text and store them in `this.templates`.
         * @param {Object} [config] passed through to parseTemplates()
         * @returns {Template[]}
         */
        parseTemplates(config) {
            return (this.templates = parseTemplates(this.text, config));
        }
        /**
         * Remove the first occurrence of an entity's wikitext from the text.
         * @param {{wikitext: string}} entity a parsed link, file, category or template
         */
        removeEntity(entity) {
            this.text = this.text.replace(entity.wikitext, '');
        }
        /**
         * Parse the text through the bot.
         * @param {Object} [options] passed through to bot.parseWikitext()
         * @returns whatever bot.parseWikitext() returns
         */
        apiParse(options) {
            return bot.parseWikitext(this.text, options);
        }
        /**
         * Split the text into sections and store them in `this.sections`.
         * @returns {Object[]}
         */
        parseSections() {
            return (this.sections = parseSections(this.text));
        }
    }
    Wikitext.parseTemplates = parseTemplates;
    Wikitext.parseTable = parseTable;
    Wikitext.parseSections = parseSections;
    /**** Private members *****/
    /**
     * Classify a single [[...]] link and record it on `self`.
     * @param {Wikitext} self
     * @param {number} startIdx index of the first '[' of the link
     * @param {number} endIdx index of the last ']' of the link
     */
    function processLink(self, startIdx, endIdx) {
        const linktext = self.text.slice(startIdx, endIdx + 1);
        let [target, displaytext] = linktext.slice(2, -2).split('|');
        let noSortkey = false;
        if (!displaytext) {
            // no (or empty) text after the pipe: display the target itself,
            // minus a leading colon
            displaytext = target[0] === ':' ? target.slice(1) : target;
            noSortkey = true;
        }
        const title = bot.Title.newFromText(target);
        if (!title) {
            // target could not be turned into a title; skip the link
            return;
        }
        // A leading colon ([[:File:X]], [[:Category:Y]]) makes an ordinary link
        // instead of embedding the file or categorising the page.
        if (target[0] !== ':') {
            if (title.namespace === 6) {
                // Everything between the first pipe and the closing brackets.
                // A file link without any pipe has no props at all.
                const firstPipe = linktext.indexOf('|');
                self.files.push({
                    wikitext: linktext,
                    target: title,
                    props: firstPipe === -1 ? '' : linktext.slice(firstPipe + 1, -2),
                });
                return;
            }
            else if (title.namespace === 14) {
                self.categories.push({
                    wikitext: linktext,
                    target: title,
                    sortkey: noSortkey ? '' : displaytext,
                });
                return;
            }
        }
        self.links.push({
            wikitext: linktext,
            target: title,
            displaytext: displaytext,
        });
    }
    return Wikitext;
}
/**
 * Array with a convenience accessor for its last element.
 */
class Stack extends Array {
    /**
     * @returns the most recently pushed element, or undefined if the stack is empty
     */
    top() {
        const lastIndex = this.length - 1;
        return this[lastIndex];
    }
}
/**
 * Return a copy of `string` in which the single character at `index` is
 * replaced by `char`.
 * @param {string} string
 * @param {number} index
 * @param {string} char
 * @returns {string}
 */
function strReplaceAt(string, index, char) {
    const before = string.slice(0, index);
    const after = string.slice(index + 1);
    return before + char + after;
}
//# sourceMappingURL=wikitext.js.map