@tricoteuses/arbre-de-la-loi
Version:
Generate ASTs from the French bills & laws; manipulate & export them to Markdown, etc.
213 lines (212 loc) • 9.15 kB
JavaScript
/**
 * Kinds of nodes found in the tree built from a bill or law.
 *
 * The mapping works both ways: `NodeType.ARTICLE === 9` and
 * `NodeType[9] === "ARTICLE"`. The numeric value of each kind is its position
 * in the list below.
 */
export const NodeType = {};
for (const [index, name] of [
  "TEXTE",
  "TOME",
  "PARTIE",
  "LIVRE",
  "TITRE",
  "SOUS_TITRE",
  "CHAPITRE",
  "SECTION",
  "SOUS_SECTION",
  "ARTICLE",
  // Nodes that are always direct children of "TEXTE":
  "ANNEXE",
  "ETAT",
  // Nodes that are always root nodes:
  "EXPOSE_DES_MOTIFS",
  "AUTRE",
].entries()) {
  NodeType[name] = index;
  NodeType[index] = name;
}
/**
 * Turn a raw Markdown line into header text.
 *
 * When the whole line is wrapped in a single pair of `**` markers (bold
 * generated by Turndown), the markers are removed; any other line is returned
 * unchanged.
 *
 * @param {string} line - Raw Markdown line.
 * @returns {string} The line without its enclosing bold markers.
 */
function headerFromLine(line) {
  const wholeLineBold = /^\*\*(.+)\*\*$/;
  return line.replace(wholeLineBold, "$1");
}
/**
 * Reduce a Markdown line to the canonical form the heading patterns expect:
 * no diacritics, no bold markers, no "(nouveau)" / "(Pour coordination)" /
 * "(Supprimé[s])" mention, no hyphens, commas, dots or ellipses, words joined
 * by "_", everything in upper case.
 *
 * @param {string} line - Raw Markdown line.
 * @returns {string} Simplified line, used for pattern matching only.
 */
function simplifyLine(line) {
  return (
    line
      .normalize("NFD")
      .replace(/[\u0300-\u036f]/g, "")
      .replace(/\*\*/g, "") // Remove Markdown bold (added by Turndown).
      .replace(/\(nouveau\)/, "")
      .replace(/\(Pour coordination\)/, "")
      // Diacritics are already gone at this point, so the mention must be
      // matched without its accent (the accented pattern could never match).
      .replace(/\(Supprimes?\)/, "")
      // Non-breaking space, written as an escape because the literal character
      // is indistinguishable from a plain space.
      .replace(/\u00a0/g, " ")
      .replace(/[\-,.…]/g, "")
      .trim()
      .replace(/ {1,}/g, "_")
      .toUpperCase()
  );
}

/**
 * Return the node type attached to the first pattern matching the line.
 *
 * @param {string} lineSimplified - Line as returned by `simplifyLine`.
 * @param {Array<[RegExp, number]>} patterns - Ordered [pattern, type] pairs.
 * @returns {number|undefined} Matching node type, or `undefined` when no
 *   pattern matches (`0` is a valid type, so callers compare to `undefined`).
 */
function findHeadingType(lineSimplified, patterns) {
  for (const [regExp, headingType] of patterns) {
    if (regExp.test(lineSimplified)) {
      return headingType;
    }
  }
  return undefined;
}

/**
 * Parse the Markdown of a bill or law into a list of root nodes.
 *
 * Every node has the shape
 * `{ headers: string[], type: NodeType, children?: node[], alineas?: string[] }`.
 * The first root node is always an "Autre" node (type `NodeType.AUTRE`), which
 * receives the lines found before any recognised heading.
 *
 * @param {string} markdown - Markdown text, processed line by line.
 * @param {string[]} texteHeaders - Headers given to the TEXTE root node that
 *   is created when a division heading (article, chapitre, …) is met while the
 *   current root is still the initial "Autre" node. The array is stored as is,
 *   not copied.
 * @returns {object[]} Root nodes, in order of appearance.
 */
export function parseTexteMarkdown(markdown, texteHeaders) {
  // The pattern tables are built once per call instead of once per line.
  const rootHeadingPatterns = [
    // "Exposé des motifs" without content:
    // * http://www.assemblee-nationale.fr/15/textes/0702.asp
    // "Exposé des motifs" with content:
    // * http://www.assemblee-nationale.fr/15/textes/1326.asp
    // * http://www.assemblee-nationale.fr/15/textes/1610.asp
    [/^EXPOSE_DES_MOTIFS$/, NodeType.EXPOSE_DES_MOTIFS],
    // "Projet de loi" or "Proposition de loi":
    // * http://www.assemblee-nationale.fr/15/textes/0232.asp
    // * http://www.assemblee-nationale.fr/15/textes/0626.asp
    // * http://www.assemblee-nationale.fr/15/textes/0676.asp
    [/^(PROJET|PROPOSITION)_DE_LOI(_|$)/, NodeType.TEXTE],
    [/^TEXTE_(DE_LA_PROPOSITION|DU_PROJET)_DE_LOI(_|$)/, NodeType.TEXTE],
  ];
  const divisionHeadingPatterns = [
    [/^ARTICLES?_/, NodeType.ARTICLE],
    [/^CHAPITRE_/, NodeType.CHAPITRE],
    [/^LIVRE_/, NodeType.LIVRE],
    [/^PARTIE_/, NodeType.PARTIE],
    [/^(PREMIERE|SECONDE|DEUXIEME|TROISIEME|QUATRIEME)_PARTIE(_|$)/, NodeType.PARTIE],
    [/^SECTION_/, NodeType.SECTION],
    [/^SOUSSECTION_/, NodeType.SOUS_SECTION],
    [/^SOUSTITRE_/, NodeType.SOUS_TITRE],
    [/^TITRE_/, NodeType.TITRE],
    [/^TOME_/, NodeType.TOME],
  ];
  const texteChildHeadingPatterns = [
    [/^ETAT_/, NodeType.ETAT],
    [/^(RAPPORT_)?ANNEXE(_|$)/, NodeType.ANNEXE],
  ];

  const rootNodes = [
    {
      headers: ["Autre"],
      type: NodeType.AUTRE,
    },
  ];
  // Path from the current root node down to the node currently being filled.
  const breadcrumb = [rootNodes[0]];

  for (const line of markdown.split("\n")) {
    let currentRootNode = rootNodes[rootNodes.length - 1];
    const lineSimplified = simplifyLine(line);

    // 1. Headings that open a new root node.
    const rootType = findHeadingType(lineSimplified, rootHeadingPatterns);
    if (rootType !== undefined) {
      currentRootNode = {
        headers: [headerFromLine(line)],
        type: rootType,
      };
      rootNodes.push(currentRootNode);
      breadcrumb[0] = currentRootNode;
      breadcrumb.length = 1;
      continue;
    }

    if ([NodeType.AUTRE, NodeType.TEXTE].includes(currentRootNode.type)) {
      // 2. Division headings (article, chapitre, titre, …).
      const divisionType = findHeadingType(lineSimplified, divisionHeadingPatterns);
      if (divisionType !== undefined) {
        const node = {
          headers: [headerFromLine(line)],
          type: divisionType,
        };
        if (currentRootNode.type !== NodeType.TEXTE) {
          // A division met outside of any TEXTE implicitly opens one.
          currentRootNode = {
            headers: texteHeaders,
            type: NodeType.TEXTE,
          };
          rootNodes.push(currentRootNode);
          breadcrumb[0] = currentRootNode;
          breadcrumb.length = 1;
        }
        // Try to retrieve a node of the same type in breadcrumb and make the
        // new node its sibling. The match is never at depth 0 because
        // breadcrumb[0] is a TEXTE node and TEXTE is not a division type.
        const siblingDepth = breadcrumb.findIndex(
          (breadcrumbNode) => breadcrumbNode.type === divisionType,
        );
        if (siblingDepth !== -1) {
          breadcrumb[siblingDepth - 1].children.push(node);
          breadcrumb[siblingDepth] = node;
          breadcrumb.length = siblingDepth + 1;
        } else {
          // Create a new level in breadcrumb for the new node.
          // NOTE(review): the new node is nested under the deepest open node
          // whatever their respective ranks (e.g. a TITRE following a leading
          // ARTICLE becomes a child of that article) — confirm this is wanted.
          const parentNode = breadcrumb[breadcrumb.length - 1];
          console.assert(parentNode.children === undefined);
          parentNode.children = [node];
          breadcrumb.push(node);
        }
        continue;
      }

      if (currentRootNode.type === NodeType.TEXTE) {
        // 3. Headings that are always direct children of the TEXTE node.
        const texteChildType = findHeadingType(lineSimplified, texteChildHeadingPatterns);
        if (texteChildType !== undefined) {
          const node = {
            headers: [headerFromLine(line)],
            type: texteChildType,
          };
          if (currentRootNode.children === undefined) {
            currentRootNode.children = [node];
            console.assert(breadcrumb.length === 1);
            breadcrumb.push(node);
          } else {
            currentRootNode.children.push(node);
            console.assert(breadcrumb.length >= 2);
            breadcrumb[1] = node;
            breadcrumb.length = 2;
          }
          continue;
        }

        // 4. Plain line inside a TEXTE: second header or alinea.
        const node = breadcrumb[breadcrumb.length - 1];
        if (
          node.alineas === undefined &&
          node.headers.length === 1 &&
          ![NodeType.ARTICLE, NodeType.EXPOSE_DES_MOTIFS].includes(node.type)
        ) {
          // Blank lines between the first and second title are dropped.
          if (line.trim()) {
            // Second title of node
            node.headers.push(headerFromLine(line));
          }
          continue;
        }
        if (node.alineas === undefined) {
          node.alineas = [];
        }
        // Remove (optional) "pastille" (aka alinea number) from alinea.
        node.alineas.push(line.replace(/^\(\d+\)\s*/, ""));
        continue;
      }
    }

    // 5. The currentRootNode is not of type TEXTE: keep the line verbatim.
    const node = breadcrumb[breadcrumb.length - 1];
    if (node.alineas === undefined) {
      node.alineas = [];
    }
    node.alineas.push(line);
  }
  return rootNodes;
}
/**
 * Parse the Markdown of a bill or law and re-emit only its TEXTE parts as
 * cleanly structured Markdown.
 *
 * Non-breaking hyphens are replaced by plain hyphens before parsing. The
 * stringified TEXTE root nodes are joined by a blank line, then runs of spaces
 * are collapsed to one space, runs of three or more line breaks to exactly
 * two, and all leading and trailing line breaks are removed.
 *
 * @param {string} markdown - Markdown text to restructure.
 * @param {string[]} texteHeaders - Forwarded to `parseTexteMarkdown`.
 * @returns {string} Restructured Markdown, without leading or trailing line
 *   break.
 */
export function restructureTexteMarkdown(markdown, texteHeaders) {
  // Replace non-breaking hyphen with normal hyphen.
  const normalizedMarkdown = markdown.replace(/\u{2011}/gu, "-");
  return (
    parseTexteMarkdown(normalizedMarkdown, texteHeaders)
      .filter((node) => node.type === NodeType.TEXTE)
      .map((node) => stringifyTree(node))
      .join("\n\n")
      // Replace multiple spaces with a single space.
      .replace(/ {2,}/g, " ")
      // Replace sequences of more than 2 \n with exactly 2 \n.
      .replace(/\n{3,}/g, "\n\n")
      // Remove leading line breaks.
      .replace(/^\n+/, "")
      // Remove trailing line breaks, including a single one (the previous
      // pattern required at least two and so left a lone final "\n").
      .replace(/\n+$/, "")
  );
}
/**
 * Append the Markdown lines of a node, then of its descendants, to `lines`.
 *
 * Each header becomes an ATX heading made of `depth + 1` "#" characters,
 * preceded by a blank line (unless `lines` is empty or already ends with one)
 * and followed by a blank line. Alineas are appended with their trailing
 * whitespace removed. Children are rendered one level deeper.
 *
 * @param {object} node - Node with optional `headers`, `alineas` and
 *   `children` arrays.
 * @param {number} depth - Depth of the node; the root is at depth 0.
 * @param {string[]} lines - Output array, mutated in place.
 * @returns {void}
 */
function stringifyNodeToLines(node, depth, lines) {
  // Headers are guarded like alineas and children: a TEXTE node built without
  // headers must not make the whole export throw.
  if (node.headers !== undefined) {
    const headingMarker = "#".repeat(depth + 1);
    for (const header of node.headers) {
      if (lines.length > 0 && lines[lines.length - 1] !== "") {
        lines.push("");
      }
      lines.push(`${headingMarker} ${header.trim()}`);
      lines.push("");
    }
  }
  if (node.alineas !== undefined) {
    for (const alinea of node.alineas) {
      // trimEnd is the standard name of the deprecated trimRight alias.
      lines.push(alinea.trimEnd());
    }
  }
  if (node.children !== undefined) {
    for (const child of node.children) {
      stringifyNodeToLines(child, depth + 1, lines);
    }
  }
}
/**
 * Render a node and all its descendants as a Markdown string.
 *
 * The node is treated as the root (depth 0); the collected lines are joined
 * with "\n".
 *
 * @param {object} node - Root of the tree to render.
 * @returns {string} Markdown text of the tree.
 */
export function stringifyTree(node) {
  const collectedLines = [];
  const rootDepth = 0;
  stringifyNodeToLines(node, rootDepth, collectedLines);
  return collectedLines.join("\n");
}