// @tricoteuses/arbre-de-la-loi
// Version:
// Generate ASTs from French bills & laws; manipulate & export them to Markdown, etc.
// 478 lines (477 loc) • 20.5 kB
// JavaScript
import hastTotString from "hast-util-to-string";
import { DivisionType, divisionTypes, DocumentNodeType, DocumentType, UnstructuredType, } from "./documents";
import { assertNeverHastNode, HastType, } from "./hast";
/**
 * Build the list of documents found in the hast tree of a page.
 *
 * The list is seeded with a single document of unknown type, and the
 * breadcrumb initially points at that document. Both are handed to
 * `convertHtmlPage`, which fills them while walking the tree.
 *
 * @param node Root hast node of the page.
 * @param options Conversion options, forwarded untouched.
 * @returns The array of documents (always contains at least the seed document).
 */
export function documentsFromAssembleeHast(node, options = {}) {
  const seedDocument = { type: UnstructuredType.Unknown };
  const documents = [seedDocument];
  convertHtmlPage(node, documents, { node: seedDocument }, options);
  return documents;
}
/**
 * Walk the direct children of the `<body>` element.
 *
 * `<div>` children are converted with `convertBodyDiv` (which may move the
 * breadcrumb), `<br>` children are ignored, and anything else only triggers
 * a warning.
 *
 * @param node The `<body>` hast element.
 * @param documents Documents found so far, passed through to `convertBodyDiv`.
 * @param breadcrumb Breadcrumb locating the node currently being filled.
 * @param options Conversion options, forwarded untouched.
 * @returns The breadcrumb after the last child has been handled.
 */
function convertBody(node, documents, breadcrumb, options = {}) {
  console.assert(node.tagName === "body", `convertBody: Unexpected node tag: ${node.tagName}`);
  let current = breadcrumb;
  for (const child of node.children) {
    if (child.type === HastType.Comment) {
      warnUnexpectedNode(child, node);
    } else if (child.type === HastType.Element) {
      if (child.tagName === "div") {
        current = convertBodyDiv(child, documents, current, options);
      } else if (child.tagName !== "br") {
        // Only <br> is silently tolerated next to the <div>s.
        warnUnexpectedElement(child, node);
      }
    } else if (child.type === HastType.Text) {
      warnWhenNonEmptyText(child, node);
    } else {
      assertNeverHastNode(child);
    }
  }
  return current;
}
/**
 * Dispatch the content of one `<div>` of the page body into the document tree.
 *
 * Every minimal block element of the div is classified, in this order, as:
 * 1. a document heading ("PROJET DE LOI", "EXPOSE DES MOTIFS", ...), which
 *    either types the current (still unknown) document or starts a new one;
 * 2. a division heading ("ARTICLE", "TITRE", ...), inserted in the current
 *    bill (a new bill is created when the current document is not a bill);
 * 3. inside a bill: an annex heading or a division-level statement of reasons;
 * 4. inside a bill: a second headline of the current division, or a plain line;
 * 5. otherwise: a plain line of the node designated by the breadcrumb.
 *
 * @param node The `<div>` hast element.
 * @param documents Documents found so far; new documents are appended to it.
 * @param breadcrumb Linked list (`node`, optional `parent`) locating the node
 *   currently being filled.
 * @param _options Unused.
 * @returns The breadcrumb after the last element has been handled.
 */
function convertBodyDiv(node, documents, breadcrumb, _options = {}) {
  console.assert(node.tagName === "div", `convertBodyDiv: Unexpected node tag: ${node.tagName}`);
  let document = documents[documents.length - 1];
  iterLines: for (const element of iterMinimalBlockElements(node)) {
    // If one of the lines (simplified) contained in the element
    // starts with "PROJET DE LOI", etc, assume that the whole element
    // is a heading/preamble of the document and put it in the lines
    // of the document.
    for (const lineSimplified of iterMinimalLinesSimplified(element)) {
      if (!lineSimplified) {
        continue;
      }
      for (const [regExp, documentType] of [
        // "Exposé des motifs" without content:
        // * http://www.assemblee-nationale.fr/15/textes/0702.asp
        // "Exposé des motifs" with content:
        // * http://www.assemblee-nationale.fr/15/textes/1326.asp
        // * http://www.assemblee-nationale.fr/15/textes/1610.asp
        // "GENERAL_" must be optional, otherwise the plain heading
        // "EXPOSE DES MOTIFS" never opens a statement of reasons.
        [/^EXPOSE_(GENERAL_)?DES_MOTIFS$/, DocumentType.BillStatementOfReasons],
        // "Projet de loi" or "Proposition de loi":
        // * http://www.assemblee-nationale.fr/15/textes/0232.asp
        // * http://www.assemblee-nationale.fr/15/textes/0626.asp
        // * http://www.assemblee-nationale.fr/15/textes/0676.asp
        [/^(PROJET|PROPOSITION)_DE_LOI(_|$)/, DocumentType.Bill],
        [/^TEXTE_(DE_LA_PROPOSITION|DU_PROJET)_DE_LOI(_|$)/, DocumentType.Bill],
      ]) {
        if (lineSimplified.match(regExp) !== null) {
          if (documentType === DocumentType.BillStatementOfReasons &&
            breadcrumb.node.type === DivisionType.Article) {
            // The title "Exposé des motifs" occurs in an article.
            // => There is not a single document for the statement of reasons,
            // but a different one for each article.
            // So, it should be handled not as a BillStatementOfReasons, but
            // as a DivisionStatementOfReasons.
            continue;
          }
          if (breadcrumb.node === document &&
            document.type === DocumentNodeType.Unknown) {
            // Type of current document is unknown and we are still in preamble (aka in lines).
            // Change document type.
            document.type = documentType;
            if (document.lines === undefined) {
              document.lines = [];
            }
            document.lines.push(element);
            if (document.children === undefined) {
              document.children = [];
            }
          }
          else {
            // Create a new document.
            document = {
              type: documentType,
              lines: [element],
              children: [],
            };
            documents.push(document);
            breadcrumb = {
              node: document,
            };
          }
          continue iterLines;
        }
      }
    }
    const text = hastTotString(element);
    const textSimplified = simplifyText(text);
    if (textSimplified &&
      [DocumentType.Bill, UnstructuredType.Unknown].includes(document.type)) {
      for (const [regExp, divisionType] of [
        [/^ARTICLES?_/, DivisionType.Article],
        [/^CHAPITRE_/, DivisionType.Chapter],
        [/^LIVRE_/, DivisionType.Book],
        [/^PARTIE_/, DivisionType.Part],
        [
          /^(PREMIERE|SECONDE|DEUXIEME|TROISIEME|QUATRIEME)_PARTIE(_|$)/,
          DivisionType.Part,
        ],
        [/^SECTION_/, DivisionType.Section],
        [/^SOUSSECTION_/, DivisionType.Subsection],
        [/^SOUSTITRE_/, DivisionType.Subtitle],
        [/^TITRE_/, DivisionType.Title],
        [/^TOME_/, DivisionType.Tome],
      ]) {
        if (textSimplified.match(regExp) !== null) {
          if (document.type !== DocumentType.Bill) {
            // Current document was not a bill => Create a new bill.
            document = {
              type: DocumentType.Bill,
              children: [],
            };
            documents.push(document);
            breadcrumb = { node: document };
          }
          // Document is a bill.
          // Create a new division.
          const division = {
            type: divisionType,
            headlines: [element],
            children: [],
          };
          // Try to retrieve a division of the same type in breadcrumb and
          // append new division next to it.
          for (let ancestorBreadcrumb = breadcrumb; ancestorBreadcrumb !== undefined; ancestorBreadcrumb = ancestorBreadcrumb.parent) {
            if (ancestorBreadcrumb.node.type === division.type) {
              ancestorBreadcrumb.parent.node.children.push(division);
              breadcrumb = {
                node: division,
                parent: ancestorBreadcrumb.parent,
              };
              continue iterLines;
            }
          }
          // The type of the division doesn't exist in the breadcrumb.
          // When the new division must be above the current node, climb the
          // breadcrumb hierarchy.
          for (let ancestorBreadcrumb = breadcrumb; ancestorBreadcrumb !== undefined; ancestorBreadcrumb = ancestorBreadcrumb?.parent) {
            if (![
              DivisionType.Article,
              UnstructuredType.DivisionStatementOfReasons,
            ].includes(ancestorBreadcrumb.node.type)) {
              breadcrumb = ancestorBreadcrumb;
              break;
            }
            // An article can not contain a sub-division (except for
            // a division-related statement of reason).
            // Neither can the statement of reasons of a division.
            continue;
          }
          // Create a new level in breadcrumb for the new division.
          if (breadcrumb.node.children === undefined) {
            breadcrumb.node.children = [];
          }
          breadcrumb.node.children.push(division);
          breadcrumb = { node: division, parent: breadcrumb };
          continue iterLines;
        }
      }
      if (textSimplified && document.type === DocumentType.Bill) {
        for (const [regExp, unstructuredType] of [
          [/^ETAT_/, UnstructuredType.Annex],
          [/^(RAPPORT_)?ANNEXE(_|$)/, UnstructuredType.Annex],
        ]) {
          if (textSimplified.match(regExp) !== null) {
            // Add a new "état" or "rapport" to the current bill (as a direct child of bill).
            const unstructured = {
              type: unstructuredType,
              headlines: [element],
            };
            let documentBreadcrumb = breadcrumb;
            while (documentBreadcrumb.parent !== undefined) {
              documentBreadcrumb = documentBreadcrumb.parent;
            }
            documentBreadcrumb.node.children.push(unstructured);
            breadcrumb = { node: unstructured, parent: documentBreadcrumb };
            continue iterLines;
          }
        }
        for (const [regExp, unstructuredType] of [
          [
            /^EXPOSE_DES_MOTIFS(_|$)/,
            UnstructuredType.DivisionStatementOfReasons,
          ],
        ]) {
          if (textSimplified.match(regExp) !== null) {
            // Add a new "exposé des motifs" as a child of the current node.
            const unstructured = {
              type: unstructuredType,
              headlines: [element],
            };
            // The current node may be an annex or another statement of
            // reasons, which are created without a `children` array.
            if (breadcrumb.node.children === undefined) {
              breadcrumb.node.children = [];
            }
            breadcrumb.node.children.push(unstructured);
            breadcrumb = { node: unstructured, parent: breadcrumb };
            continue iterLines;
          }
        }
        const division = breadcrumb.node;
        // Only serialize the division when the invariant is actually broken:
        // building the message on every line would be needlessly expensive.
        if (division.children?.length) {
          console.assert(false, `The current division of breadcrumb should never have children: ${JSON.stringify(division, null, 2)}`);
        }
        if (divisionTypes.has(division.type) &&
          division.type !== DocumentNodeType.Article &&
          division.headlines !== undefined &&
          division.headlines.length === 1 &&
          !division.lines?.length) {
          // The current division has no subtitle (and can have one)
          // => Add element as subtitle.
          if (!text.match(/^[ \t\n\f\r\xa0]*$/)) {
            // Second title of division
            division.headlines.push(element);
            continue iterLines;
          }
        }
        // Otherwise, assume that element is a line or an alinea.
        if (division.lines === undefined) {
          division.lines = [];
        }
        // TODO: Remove (optional) "pastille" (aka alinea number) from alinea.
        division.lines.push(element);
        continue iterLines;
      }
    }
    // The Document is not a bill.
    const node = breadcrumb.node;
    if (node.lines === undefined) {
      node.lines = [];
    }
    node.lines.push(element);
  }
  return breadcrumb;
}
// function convertBodyDivP(
// node: HastElement,
// documents: Array<Document | Unknown>,
// breadcrumb: Breadcrumb,
// options: Options = {},
// ): Breadcrumb {
// console.assert(
// node.tagName === "p",
// `convertBodyDivP: Unexpected node tag: ${node.tagName}`,
// )
// // for (const child of node.children) {
// // switch (child.type) {
// // case HastType.Comment:
// // warnUnexpectedNode(child, node)
// // break
// // case HastType.Element:
// // switch (child.tagName) {
// // // case "br":
// // // break
// // // case "div":
// // // breadcrumb = convertDiv(child, bill, options)
// // // break
// // default:
// // warnUnexpectedElement(child, node)
// // }
// // break
// // case HastType.Text:
// // warnWhenNonEmptyText(child, node)
// // break
// // default:
// // assertNeverHastNode(child)
// // }
// // }
// return breadcrumb
// }
/**
 * Walk the direct children of the `<html>` element.
 *
 * `<head>` is skipped, `<body>` is converted with `convertBody`, and any
 * other child only triggers a warning.
 *
 * @param node The `<html>` hast element.
 * @param documents Documents found so far, passed through to `convertBody`.
 * @param breadcrumb Breadcrumb locating the node currently being filled.
 * @param options Conversion options, forwarded untouched.
 * @returns The breadcrumb after the last child has been handled.
 */
function convertHtml(node, documents, breadcrumb, options = {}) {
  console.assert(node.tagName === "html", `convertHtml: Unexpected node tag: ${node.tagName}`);
  let current = breadcrumb;
  for (const child of node.children) {
    if (child.type === HastType.Comment) {
      warnUnexpectedNode(child, node);
      continue;
    }
    if (child.type === HastType.Element) {
      const { tagName } = child;
      if (tagName === "body") {
        current = convertBody(child, documents, current, options);
      } else if (tagName !== "head") {
        warnUnexpectedElement(child, node);
      }
      continue;
    }
    if (child.type === HastType.Text) {
      warnWhenNonEmptyText(child, node);
      continue;
    }
    assertNeverHastNode(child);
  }
  return current;
}
/**
 * Walk the direct children of the root node of a page.
 *
 * The doctype is ignored, the `<html>` element is converted with
 * `convertHtml`, and any other child only triggers a warning.
 *
 * @param node Root hast node of the page.
 * @param documents Documents found so far, passed through to `convertHtml`.
 * @param breadcrumb Breadcrumb locating the node currently being filled.
 * @param options Conversion options, forwarded untouched.
 * @returns The breadcrumb after the last child has been handled.
 */
function convertHtmlPage(node, documents, breadcrumb, options = {}) {
  let current = breadcrumb;
  for (const child of node.children) {
    const { type } = child;
    if (type === HastType.Comment) {
      warnUnexpectedNode(child, node);
    } else if (type === HastType.Doctype) {
      // Nothing to extract from the doctype.
    } else if (type === HastType.Element) {
      if (child.tagName === "html") {
        current = convertHtml(child, documents, current, options);
      } else {
        warnUnexpectedElement(child, node);
      }
    } else if (type === HastType.Text) {
      warnWhenNonEmptyText(child, node);
    } else {
      assertNeverHastNode(child);
    }
  }
  return current;
}
/**
 * Yield the smallest block-level elements found under `node`.
 *
 * `<div>` and `<p>` children are explored recursively; `<br>`, `<hr>` and
 * `<table>` are yielded as they are. When inline content (a non-block
 * element or non-blank text) shows up before any block child, `node` itself
 * is the minimal block: it is yielded and the iteration stops. Inline
 * content found after a block child is yielded on its own, text being
 * wrapped in a synthetic `<span>` element.
 *
 * @param node A hast node with a `children` array.
 * @yields Hast elements, in document order.
 */
function* iterMinimalBlockElements(node) {
  const blankText = /^[ \t\n\f\r\xa0]*$/;
  let sawBlock = false;
  for (const child of node.children) {
    if (child.type === HastType.Comment) {
      // Ignore comment.
      continue;
    }
    if (child.type === HastType.Element) {
      const { tagName } = child;
      if (tagName === "br" || tagName === "hr" || tagName === "table") {
        sawBlock = true;
        yield child;
      } else if (tagName === "div" || tagName === "p") {
        sawBlock = true;
        yield* iterMinimalBlockElements(child);
      } else if (sawBlock) {
        // Inline element mixed with blocks: handle it as a block of its own.
        yield child;
      } else {
        // Inline element first: the parent is the line.
        yield node;
        return;
      }
      continue;
    }
    if (child.type === HastType.Text) {
      if (child.value.match(blankText) !== null) {
        // Ignore empty text node.
        continue;
      }
      if (!sawBlock) {
        // Text first: the parent is the line.
        yield node;
        return;
      }
      // Text mixed with blocks: wrap it so that it can be handled as an element.
      yield { type: HastType.Element, tagName: "span", children: [child] };
      continue;
    }
    assertNeverHastNode(child);
  }
}
/**
 * Yield the simplified text of each minimal line found under `node`.
 *
 * Container elements (`div`, `p`, `table`, `tbody`, `tr`) are explored
 * recursively. As soon as a node directly holds another kind of element or
 * non-blank text, that node is considered to be one line: the simplified
 * form of its whole text is yielded and the iteration over its children stops.
 *
 * @param node A hast node with a `children` array.
 * @yields Strings produced by `simplifyText`.
 */
function* iterMinimalLinesSimplified(node) {
  const blankText = /^[ \t\n\f\r\xa0]*$/;
  const containerTags = ["div", "p", "table", "tbody", "tr"];
  for (const child of node.children) {
    if (child.type === HastType.Comment) {
      // Ignore comment.
      continue;
    }
    if (child.type === HastType.Element) {
      if (containerTags.includes(child.tagName)) {
        yield* iterMinimalLinesSimplified(child);
        continue;
      }
      // Element is not a block => the line is the parent block.
      yield simplifyText(hastTotString(node));
      return;
    }
    if (child.type === HastType.Text) {
      if (child.value.match(blankText) !== null) {
        // Ignore empty text node.
        continue;
      }
      // Text node is not empty => the line is the parent block.
      yield simplifyText(hastTotString(node));
      return;
    }
    assertNeverHastNode(child);
  }
}
/**
 * Normalize a heading so that it can be matched by the heading regular
 * expressions of this module.
 *
 * Diacritics are removed, the first "(nouveau)", "(Pour coordination)" and
 * "(Supprimé)"/"(Supprimés)" markers are dropped, underscores and whitespace
 * become single spaces, the punctuation characters `-`, `,`, `.` and `…` are
 * deleted, and the trimmed result is upper-cased with spaces replaced by
 * underscores (e.g. "Sous-section 2" gives "SOUSSECTION_2").
 *
 * @param text Raw text of an element.
 * @returns The simplified text (possibly an empty string).
 */
function simplifyText(text) {
  return text
    .normalize("NFD")
    .replace(/[\u0300-\u036f]/g, "")
    .replace(/\(nouveau\)/, "")
    .replace(/\(Pour coordination\)/, "")
    // Diacritics have already been stripped above, so the marker now reads
    // "(Supprime)" or "(Supprimes)": an accented pattern could never match.
    .replace(/\(Supprimes?\)/, "")
    .replace(/[_\t\n\f\r\xa0]/g, " ")
    .replace(/[\-,.…]/g, "")
    .replace(/ {2,}/g, " ")
    .replace(/^\s+/, "") // Trim begin of line.
    .replace(/\s+$/, "") // Trim end of line.
    .replace(/ /g, "_")
    .toUpperCase();
}
/**
 * Warn on the console that an element was found where none was expected.
 *
 * The parent is described by its tag name when it is an element and by its
 * node type when it is the root; any other kind of parent is handed to
 * `assertNeverHastNode`.
 *
 * @param node The unexpected hast element.
 * @param parent The hast node containing it.
 */
function warnUnexpectedElement(node, parent) {
  if (parent.type === HastType.Element) {
    console.warn(`Unexpected element of tag "${node.tagName}" in node of tag: ${parent.tagName}`);
  } else if (parent.type === HastType.Root) {
    console.warn(`Unexpected element of tag "${node.tagName}" in node of type: ${parent.type}`);
  } else {
    assertNeverHastNode(parent);
  }
}
/**
 * Warn on the console that a node of an unexpected type was found.
 *
 * The parent is described by its tag name when it is an element and by its
 * node type when it is the root; any other kind of parent is handed to
 * `assertNeverHastNode`.
 *
 * @param node The unexpected hast node.
 * @param parent The hast node containing it.
 */
function warnUnexpectedNode(node, parent) {
  if (parent.type === HastType.Element) {
    console.warn(`Unexpected node of type "${node.type}" in node of tag: ${parent.tagName}`);
  } else if (parent.type === HastType.Root) {
    console.warn(`Unexpected node of type "${node.type}" in node of type: ${parent.type}`);
  } else {
    assertNeverHastNode(parent);
  }
}
/**
 * Warn on the console when a text node holds something other than blanks.
 *
 * Blank means: only HTML space characters
 * (see <https://html.spec.whatwg.org/#space-character>) and no-break spaces.
 * The parent is described by its tag name when it is an element and by its
 * node type when it is the root; any other kind of parent is handed to
 * `assertNeverHastNode`.
 *
 * @param node The hast text node to check.
 * @param parent The hast node containing it.
 */
function warnWhenNonEmptyText(node, parent) {
  if (node.value.match(/^[ \t\n\f\r\xa0]*$/)) {
    // Blank text between tags is expected.
    return;
  }
  if (parent.type === HastType.Element) {
    console.warn(`Unexpected non empty text "${node.value}" in node of tag: ${parent.tagName}`);
  } else if (parent.type === HastType.Root) {
    console.warn(`Unexpected non empty text "${node.value}" in node of type: ${parent.type}`);
  } else {
    assertNeverHastNode(parent);
  }
}