rst2mdown
Version:
A conversion utility for converting reStructuredText markup to markdown
580 lines (500 loc) • 12.8 kB
JavaScript
String.prototype.repeat = function(num){
// summary:
// Duplicates a string *n* times
return new Array(num + 1).join(this);
};
var headingLevels;
// All the block RegEx are designed to "consume" from the start of the line, where they will be used in
// order to until what remains to be "consumed" is the most non-specific text.
var block = {
// Consumes any "blank" newlines, this is usually because the preceding text has already been consumed
newline: /^\n+/,
// Consumes section titles where the seprator is above or blow:
//
// =======
// Title 1
// =======
//
// RegEx:
// ^ - Match start of line
// (=|-|`|:|\.|'|"|~|^|_|\*|\+|#){3,} - Capture
// =|-|`|:|\.|'|"|~|^|_|\*|\+|# - A marker
// {3,} - Repeated at least 3 times
// * - Any number of trailing spaces
// \n - One new line
// * - Any number of leading spaces
// ([^\n]+) - Capture
// [^\n]+ - One or more characters not a new line
// \n - A new line
// \1+ - One or more characters captured above
// * - Any trailing spaces
// \n* - Any trailing newlines
sectionAboveBelow: /^(=|-|`|:|\.|'|"|~|^|_|\*|\+|#){3,} *\n *([^\n]+)\n\1+ *\n*/,
// Consumes section titles where the seperator is only below the text:
//
// Title 1
// -------
//
// RegEx:
// ^ - Match start of line
// ([^\n]+) - Capture
// [^\n]+ - One or more characters not a new line
// \n - One new line
// (=|-|`|:|\.|'|"|~|\^|_|\*|\+|#){3,} - Capture
// =|-|`|:|\.|'|"|~|^|_|\*|\+|# - A marker
// {3,} - Repeated at least 3 times
// * - Any trailing spaces
// \n* - Any trailing newlines
sectionBelow: /^([^\n]+)\n(=|-|`|:|\.|'|"|~|\^|_|\*|\+|#){3,} *\n*/,
// Consumes any text where it has been indented by two or more spaces:
//
// Not captured
// Captured
// Not captured
//
// RegEx:
// Start of Line
// Followed by at least two spaces
// Followed by at least 1 charecture not a new line
// Followed by as many lines that are indented by at least two spaces
// Consuming any trailing newlines
blockquote: /^( {2,}[^\n]+(\n {2,}[^\n]+)*\n*)+/,
// Consumes text which is considered a transistion:
//
// ----
//
// RegEx:
// Start of line
// Followed by at least 4 of =-`:.'"~^_*+#
// Followed by any trailing spaces and then a newline
// Followed by any leading spaces and then another newline or the end of the line
transistion: /^(?:=|-|`|:|\.|'|"|~|\^|_|\*|\+|#){4,} *\n *(?:\n+|$)/,
directive: /^\.\. +(.+) *:: +(?:\|([^\|])\| +)?([^\n]*)?(?:\n+|$)/,
// Consumes text which appears to be a list (either ordered or unordered):
//
list: /^( *)(bull) [^\0]+?(?:hr|\n{2,}(?! )(?!\1bull )\n*|\s*$)/,
definition: /^([^\n]+)\n+( {2,}[^\n]+(\n {2,}[^\n]+)*\n*)+/,
definitionblocks: /( {2,}[^\n]+(\n {2,}[^\n]+)*\n*)+/,
optionlist: /^(?:-|-{2}|\/)[^\n]+(\n|$)+(?: {2,}[^\n]+(?:\n {2,}[^\n]+)*\n*)*/,
fieldlist: /^:([^:]+): +[^\n]+\n(?:( {2,})[^\n]+\n(?:\2[^\n]+\n)*)*/,
// Consumes any text that isn't a newline:
//
// anything
//
// RegEx:
// ^ - Match start of line
// [^\n]+ - One or more charecters that are not a new line
text: /^[^\n]+/,
// This is used to be subtituted in other RexEx and represents (most) valid bullets in reStructuredText
//
// *
// 1.
// (a)
// B)
//
// RegEx:
// Non capturing
// One of *+-•‣⁃ or
// Possibly a (
// Followed by one or more of 0-9 A-Z or a-z
// Followed by a . or a )
//
// Note: This does match some odd things, but they are unlikely to occur, for example "(1bA." is also valid
// Note: reStructuredText allows roman numerals, this does not match those
bullet: /(?:[*+-•‣⁃]|[\(]?[0-9A-Za-z]+[\.\)])/,
item: /^( *)(bull) [^\n]*(?:\n(?!\1bull )[^\n]*)*/
};
var listhr = /\n+(?=(?: *[=-`:\.~'"\^_*+#]){3,} *(?:\n+|$))/,
listbullet = /^ *([*+-•‣⁃]|[\(]?[0-9A-Za-z]+[\.\)]) +/; // Does not match roman numberals as list bullets
block.item = replace(block.item, "gm")
(/bull/g, block.bullet)
();
block.list = replace(block.list)
(/bull/g, block.bullet)
("hr", listhr)
();
block.directives = {
admonitions: [
"attention",
"caution",
"danger",
"error",
"hint",
"important",
"note",
"tip",
"warning",
"admonition"],
images: [
"image",
"figure"],
body: [
"topic",
"sidebar",
"line-block",
"parsed-literal",
"code",
"math",
"rubric",
"epigraph",
"highlights",
"pull-quote",
"compound",
"container"],
tables: [
"table",
"csv-table",
"list-table"],
parts: [
"contents",
"sectnum",
"section-autonumbering",
"header",
"footer"],
references: [
"taget-nodes",
"footnotes",
"citations"],
html: [ "meta" ],
substitution: [
"replace",
"unicode",
"date"],
misc: [ "include" ]
};
block.lexer = function(rst){
var tokens = [];
headingLevels = [];
rst = rst.replace(/\r\n|\r/g, "\n").replace(/\t/g, " ");
return block.token(rst, tokens, true);
};
block.token = function(rst, tokens, top){
var m,
l,
i,
d,
item,
space,
loose,
term,
next;
function getDepth(c, top){
var i = headingLevels.indexOf(top ? c + c : c);
if(~i){
return i + 1;
}else{
headingLevels.push(top ? c + c : c);
return headingLevels.length;
}
}
while(rst){
if(m = block.newline.exec(rst)){
rst = rst.substring(m[0].length);
if(m[0].length > 1){
tokens.push({
type: "space"
});
}
}
if(m = block.sectionAboveBelow.exec(rst)){
rst = rst.substring(m[0].length);
tokens.push({
type: "section",
depth: getDepth(m[1], true),
text: m[2]
});
continue;
}
if(m = block.sectionBelow.exec(rst)){
rst = rst.substring(m[0].length);
tokens.push({
type: "section",
depth: getDepth(m[2]),
text: m[1]
});
continue;
}
if(m = block.blockquote.exec(rst)){
rst = rst.substring(m[0].length);
tokens.push({
type: "blockquote_start"
});
space = m[0].match(/^ */)[0].length;
m = m[0].replace(new RegExp("^ {" + space + "}", "gm"), "");
block.token(m, tokens, top);
tokens.push({
type: "blockquote_end"
});
continue;
}
if(m = block.transistion.exec(rst)){
rst = rst.substring(m[0].length);
tokens.push({
type: "transistion"
});
continue;
}
if(m = block.directive.exec(rst)){
rst = rst.substring(m[0].length);
tokens.push({
type: "directive_start",
directive: m[1],
element: m[2]
});
tokens.push({
type: "directive_stop"
});
}
if(m = block.fieldlist.exec(rst)){
rst = rst.substring(m[0].length);
tokens.push({
type: "fieldlist_start",
field: m[1]
});
m = m[0].substring(m[0].match(/:[^:]+: +/)[0].length);
space = m.match(/\n */);
if(space && (space[0].length > 1)){
space = space[0].length - 1;
m = m.replace(new RegExp("^ {" + space + "}", "gm"), "");
}
block.token(m, tokens, top);
tokens.push({
type: "fieldlist_end"
});
continue;
}
if(m = block.list.exec(rst)){
rst = rst.substring(m[0].length);
tokens.push({
type: "list_start",
ordered: /^[0-9A-Za-z]+/.test(m[2])
});
m = m[0].match(block.item);
next = false;
l = m.length;
i = 0;
for(; i < l; i++){
item = m[i];
space = item.length;
item = item.replace(listbullet, "");
if(~item.indexOf("\n ")){
space -= item.length;
item = item.replace(new RegExp("^ {1," + space + "}", "gm"), "");
}
loose = next || /\n\n(!\s*$)/.test(item);
if(1 !== l - 1){
next = item[item.length-1] === "\n";
if(!loose) loose = next;
}
tokens.push({
type: loose ? "loose_item_start" : "list_item_start"
});
block.token(item, tokens);
tokens.push({
type: "list_item_end"
});
}
tokens.push({
type: "list_end"
});
continue;
}
if(m = block.optionlist.exec(rst)){
rst = rst.substring(m[0].length);
d = m[0].match(/^[^\n]*?(?= {2,}|\n|$)/)[0];
console.log(m);
tokens.push({
type: "option_start",
options: d.split(/ *, */)
});
m = m[0].substring(d.length).replace(/^ */, "").replace(/\n*$/, "");
space = m.match(/\n */);
if(space && (space[0].length > 1)){
space = space[0].length - 1;
m = m.replace(new RegExp("^ {" + space + "}", "gm"), "");
}
block.token(m, tokens, top);
tokens.push({
type: "option_end"
});
continue;
}
if(m = block.definition.exec(rst)){
rst = rst.substring(m[0].length);
d = m[1].split(/ *: */);
term = d.shift();
tokens.push({
type: "definition_start",
term: term,
classifiers: d
});
m = m[0].match(block.definitionblocks);
tokens.push({
type: "definition",
text: m[0].replace(/^ */, "").replace(/\n*$/, "").split(/ *\n+ {2,}/)
});
tokens.push({
type: "definition_end"
});
continue;
}
if(m = block.text.exec(rst)){
rst = rst.substring(m[0].length);
tokens.push({
type: "text",
text: m[0]
});
}
}
return tokens;
};
var inline = {};
inline.lexer = function(src){
};
var markdown = function(tokens){
var doc = "",
lists = [],
blockDepth = 0,
indent = 0,
listItem,
noSpace,
supressNewline,
priorblock,
heading = "######";
function space(up){
return blockDepth ? " > ".repeat(blockDepth) : "" +
(indent ? " ".repeat(indent) : "") +
(lists.length ? (lists[lists.length - 1].ordered ? (" ".repeat(up ? (lists.length - 1) : lists.length)) :
(" ".repeat(up ? (lists.length - 1) : lists.length))) : "");
}
function docadd(txt, up){
if(noSpace){
doc += txt;
noSpace = false;
}else{
doc += space(up) + txt.replace(/\n(?=.)/, "\n" + space(up));
}
}
tokens.forEach(function(token){
if(priorblock === "option" && token.type !== "option_start"){
docadd("\n </tbody>\n</table>\n");
priorblock = "";
}
switch(token.type){
case "space":
docadd("\n");
break;
case "text":
docadd(token.text + "\n");
break;
case "section":
docadd(heading.substring(0, token.depth) + " " + token.text + " " +
heading.substring(0, token.depth) + "\n");
docadd("\n");
priorblock = "section";
break;
case "transistion":
docadd("---\n");
docadd("\n");
priorblock = "transistion";
break;
case "list_start":
lists.push({
ordered: token.ordered,
count: 1
});
break;
case "list_end":
doc += !supressNewline ? "\n" : "";
lists.pop();
priorblock = "list";
break;
case "loose_item_start":
case "list_item_start":
// Note, markdown, but standard and GFM do not allow alpha ordered lists, so each ordered list is
// output as a numeric.
docadd(lists[lists.length - 1].ordered ? lists[lists.length - 1].count + ". " : "- ", true);
noSpace = true;
lists[lists.length - 1].count++;
break;
case "list_item_end":
break;
case "blockquote_start":
blockDepth++;
break;
case "blockquote_end":
blockDepth--;
priorblock = "blockquote";
break;
case "definition_start":
if(options.definitionListAsHTML){
docadd("<dl>\n");
docadd(" <dt>" + token.term + (token.classifiers.length ? " : " + token.classifiers.join(" : ") : "") + "</dt>\n");
}else{
docadd("***" + token.term + (token.classifiers.length ? " : " + token.classifiers.join(" : ") : "") + "***\n");
}
indent++;
break;
case "definition_end":
priorblock = "definition";
indent--;
if(options.definitionListAsHTML){
docadd("</dl>\n");
}
break;
case "definition":
if(options.definitionListAsHTML){
docadd("<dd>" +
(token.text.length > 1 ?
("\n <p>" + token.text.join("</p>\n <p>") + "</p>\n ") :
token.text[0]) +
"</dd>\n");
}else{
token.text.forEach(function(d){
docadd(d + "\n\n");
});
}
break;
case "fieldlist_start":
docadd(":" + token.field + ":");
indent++;
supressNewline = true;
break;
case "fieldlist_end":
supressNewline = false;
indent--;
priorblock = "field";
break;
case "option_start":
if(priorblock !== "option"){
docadd("<table>\n <tbody>\n");
}
docadd(" <tr>\n <td>`" + token.options.join("`, `") + "`</td>\n <td>");
break;
case "option_end":
priorblock = "option";
break;
}
});
return doc;
};
function replace(regex, opt){
regex = regex.source;
opt = opt || '';
return function self(name, val){
if (!name) return new RegExp(regex, opt);
val = val.source || val;
val = val.replace(/(^|[^\[])\^/g, '$1');
regex = regex.replace(name, val);
return self;
};
}
var options = {
definitionListAsHTML: true
};
var rst2mdown = module.exports = function(rst){
return markdown(block.lexer(rst));
};
rst2mdown.lexer = block.lexer;
rst2mdown.inline = inline.lexer;
rst2mdown.markdown = rst2mdown;