UNPKG

@tricoteuses/arbre-de-la-loi

Version:

Generate ASTs from the French bills & laws; manipulate & export them to Markdown, etc.

478 lines (477 loc) 20.5 kB
import hastTotString from "hast-util-to-string";
import {
  DivisionType,
  divisionTypes,
  DocumentNodeType,
  DocumentType,
  UnstructuredType,
} from "./documents";
import { assertNeverHastNode, HastType } from "./hast";

/**
 * Matches a text made only of space characters (or an empty text).
 * See <https://html.spec.whatwg.org/#space-character>; the no-break space
 * (`\xa0`) is accepted too.
 */
const BLANK_TEXT_REGEXP = /^[ \t\n\f\r\xa0]*$/;

/**
 * Headings (in the form produced by `simplifyText`) that start a new document,
 * paired with the type of that document. Order matters: first match wins.
 */
const DOCUMENT_HEADING_PATTERNS = [
  // "Exposé des motifs" without content:
  // * http://www.assemblee-nationale.fr/15/textes/0702.asp
  // "Exposé des motifs" with content:
  // * http://www.assemblee-nationale.fr/15/textes/1326.asp
  // * http://www.assemblee-nationale.fr/15/textes/1610.asp
  // The "GENERAL_" part is optional, so that both "Exposé des motifs" and
  // "Exposé général des motifs" are recognized.
  [/^EXPOSE_(GENERAL_)?DES_MOTIFS$/, DocumentType.BillStatementOfReasons],
  // "Projet de loi" or "Proposition de loi":
  // * http://www.assemblee-nationale.fr/15/textes/0232.asp
  // * http://www.assemblee-nationale.fr/15/textes/0626.asp
  // * http://www.assemblee-nationale.fr/15/textes/0676.asp
  [/^(PROJET|PROPOSITION)_DE_LOI(_|$)/, DocumentType.Bill],
  [/^TEXTE_(DE_LA_PROPOSITION|DU_PROJET)_DE_LOI(_|$)/, DocumentType.Bill],
];

/**
 * Headings (in the form produced by `simplifyText`) that start a new division
 * of a bill, paired with the type of that division. Order matters: first
 * match wins.
 */
const DIVISION_HEADING_PATTERNS = [
  [/^ARTICLES?_/, DivisionType.Article],
  [/^CHAPITRE_/, DivisionType.Chapter],
  [/^LIVRE_/, DivisionType.Book],
  [/^PARTIE_/, DivisionType.Part],
  [
    /^(PREMIERE|SECONDE|DEUXIEME|TROISIEME|QUATRIEME)_PARTIE(_|$)/,
    DivisionType.Part,
  ],
  [/^SECTION_/, DivisionType.Section],
  [/^SOUSSECTION_/, DivisionType.Subsection],
  [/^SOUSTITRE_/, DivisionType.Subtitle],
  [/^TITRE_/, DivisionType.Title],
  [/^TOME_/, DivisionType.Tome],
];

/**
 * Headings (in the form produced by `simplifyText`) of an "état", "annexe" or
 * "rapport annexé", attached as direct children of the current bill.
 */
const ANNEX_HEADING_PATTERNS = [
  [/^ETAT_/, UnstructuredType.Annex],
  [/^(RAPPORT_)?ANNEXE(_|$)/, UnstructuredType.Annex],
];

/**
 * Headings (in the form produced by `simplifyText`) of an "exposé des motifs"
 * attached to the current node of a bill.
 */
const DIVISION_STATEMENT_OF_REASONS_PATTERNS = [
  [/^EXPOSE_DES_MOTIFS(_|$)/, UnstructuredType.DivisionStatementOfReasons],
];

/**
 * Convert the hast tree of an HTML page into a list of documents.
 *
 * The returned array always starts with a document created with the type
 * `UnstructuredType.Unknown`; that type is replaced and/or other documents
 * are appended when document headings or division headings are recognized.
 *
 * @param node Root node of the hast tree (its `children` are iterated).
 * @param options Passed down to the converters; no converter of this module
 *   currently reads it.
 * @returns The array of documents found in the page.
 */
export function documentsFromAssembleeHast(node, options = {}) {
  const document = {
    type: UnstructuredType.Unknown,
  };
  const documents = [document];
  const breadcrumb = { node: document };
  convertHtmlPage(node, documents, breadcrumb, options);
  return documents;
}

/**
 * Convert a `body` element: every `div` child is handed to `convertBodyDiv`,
 * `br` children are ignored, anything else (but blank texts) is reported
 * with a warning.
 *
 * @param node The `body` element.
 * @param documents Documents found so far (may be extended).
 * @param breadcrumb Current position in the tree of documents & divisions.
 * @param options Passed down to `convertBodyDiv`.
 * @returns The breadcrumb after conversion.
 */
function convertBody(node, documents, breadcrumb, options = {}) {
  console.assert(
    node.tagName === "body",
    `convertBody: Unexpected node tag: ${node.tagName}`,
  );
  for (const child of node.children) {
    switch (child.type) {
      case HastType.Comment:
        warnUnexpectedNode(child, node);
        break;
      case HastType.Element:
        switch (child.tagName) {
          case "br":
            break;
          case "div":
            breadcrumb = convertBodyDiv(child, documents, breadcrumb, options);
            break;
          default:
            warnUnexpectedElement(child, node);
        }
        break;
      case HastType.Text:
        warnWhenNonEmptyText(child, node);
        break;
      default:
        assertNeverHastNode(child);
    }
  }
  return breadcrumb;
}

/**
 * Convert a `div` of the body: split it into minimal block elements and
 * dispatch each of them, in this order, as:
 *
 * 1. the heading of a document (a new document is created or the type of the
 *    current unknown document is changed);
 * 2. the heading of a division of a bill (article, chapter, etc);
 * 3. the heading of an annex or of a division-related statement of reasons;
 * 4. the second headline of the current division;
 * 5. a plain line of the current node.
 *
 * @param node The `div` element.
 * @param documents Documents found so far; new documents are pushed to it.
 * @param breadcrumb Current position: `{ node, parent? }` where `parent` is
 *   the breadcrumb of the containing node.
 * @param _options Unused.
 * @returns The breadcrumb after conversion.
 */
function convertBodyDiv(node, documents, breadcrumb, _options = {}) {
  console.assert(
    node.tagName === "div",
    `convertBodyDiv: Unexpected node tag: ${node.tagName}`,
  );
  let document = documents[documents.length - 1];

  iterLines: for (const element of iterMinimalBlockElements(node)) {
    // If one of the lines (simplified) contained in the element
    // starts with "PROJET DE LOI", etc, assume that the whole element
    // is a heading/preamble of the document and put it in the lines
    // of the document.
    for (const lineSimplified of iterMinimalLinesSimplified(element)) {
      if (!lineSimplified) {
        continue;
      }
      for (const [regExp, documentType] of DOCUMENT_HEADING_PATTERNS) {
        if (lineSimplified.match(regExp) === null) {
          continue;
        }
        if (
          documentType === DocumentType.BillStatementOfReasons &&
          breadcrumb.node.type === DivisionType.Article
        ) {
          // The title "Exposé des motifs" occurs in an article.
          // => There is not a single document for the statement of reasons,
          // but a different one for each article.
          // So, it should be handled not as a BillStatementOfReasons, but
          // as a DivisionStatementOfReasons.
          continue;
        }
        if (
          breadcrumb.node === document &&
          document.type === DocumentNodeType.Unknown
        ) {
          // Type of current document is unknown and we are still in preamble
          // (aka in lines). Change document type.
          document.type = documentType;
          if (document.lines === undefined) {
            document.lines = [];
          }
          document.lines.push(element);
          if (document.children === undefined) {
            document.children = [];
          }
        } else {
          // Create a new document.
          document = {
            type: documentType,
            lines: [element],
            children: [],
          };
          documents.push(document);
          breadcrumb = { node: document };
        }
        continue iterLines;
      }
    }

    const text = hastTotString(element);
    const textSimplified = simplifyText(text);

    if (
      textSimplified &&
      [DocumentType.Bill, UnstructuredType.Unknown].includes(document.type)
    ) {
      for (const [regExp, divisionType] of DIVISION_HEADING_PATTERNS) {
        if (textSimplified.match(regExp) === null) {
          continue;
        }
        if (document.type !== DocumentType.Bill) {
          // Current document was not a bill => Create a new bill.
          document = {
            type: DocumentType.Bill,
            children: [],
          };
          documents.push(document);
          breadcrumb = { node: document };
        }

        // Document is a bill.
        // Create a new division.
        const division = {
          type: divisionType,
          headlines: [element],
          children: [],
        };

        // Try to retrieve a division of the same type in breadcrumb and
        // append new division next to it.
        for (
          let ancestorBreadcrumb = breadcrumb;
          ancestorBreadcrumb !== undefined;
          ancestorBreadcrumb = ancestorBreadcrumb.parent
        ) {
          if (ancestorBreadcrumb.node.type === division.type) {
            ancestorBreadcrumb.parent.node.children.push(division);
            breadcrumb = {
              node: division,
              parent: ancestorBreadcrumb.parent,
            };
            continue iterLines;
          }
        }

        // The type of the division doesn't exist in the breadcrumb.
        // When the new division must be above the current node, climb the
        // breadcrumb hierarchy: an article can not contain a sub-division
        // (except for a division-related statement of reasons), and neither
        // can the statement of reasons of a division.
        for (
          let ancestorBreadcrumb = breadcrumb;
          ancestorBreadcrumb !== undefined;
          ancestorBreadcrumb = ancestorBreadcrumb.parent
        ) {
          if (
            ![
              DivisionType.Article,
              UnstructuredType.DivisionStatementOfReasons,
            ].includes(ancestorBreadcrumb.node.type)
          ) {
            breadcrumb = ancestorBreadcrumb;
            break;
          }
        }

        // Create a new level in breadcrumb for the new division.
        if (breadcrumb.node.children === undefined) {
          breadcrumb.node.children = [];
        }
        breadcrumb.node.children.push(division);
        breadcrumb = { node: division, parent: breadcrumb };
        continue iterLines;
      }

      if (textSimplified && document.type === DocumentType.Bill) {
        for (const [regExp, unstructuredType] of ANNEX_HEADING_PATTERNS) {
          if (textSimplified.match(regExp) === null) {
            continue;
          }
          // Add a new "état" or "rapport" to the current bill (as a direct
          // child of bill).
          const unstructured = {
            type: unstructuredType,
            headlines: [element],
          };
          let documentBreadcrumb = breadcrumb;
          while (documentBreadcrumb.parent !== undefined) {
            documentBreadcrumb = documentBreadcrumb.parent;
          }
          documentBreadcrumb.node.children.push(unstructured);
          breadcrumb = { node: unstructured, parent: documentBreadcrumb };
          continue iterLines;
        }

        for (const [
          regExp,
          unstructuredType,
        ] of DIVISION_STATEMENT_OF_REASONS_PATTERNS) {
          if (textSimplified.match(regExp) === null) {
            continue;
          }
          // Add a new "exposé des motifs" as a child of the current node.
          const unstructured = {
            type: unstructuredType,
            headlines: [element],
          };
          // The current node may have been created without children (annexes
          // and statements of reasons are), so create the array when needed
          // instead of failing with a TypeError.
          if (breadcrumb.node.children === undefined) {
            breadcrumb.node.children = [];
          }
          breadcrumb.node.children.push(unstructured);
          breadcrumb = { node: unstructured, parent: breadcrumb };
          continue iterLines;
        }

        const division = breadcrumb.node;
        if (division.children?.length) {
          // Build the (costly) message only when the assertion fails.
          console.assert(
            false,
            `The current division of breadcrumb should never have children: ${JSON.stringify(division, null, 2)}`,
          );
        }
        if (
          divisionTypes.has(division.type) &&
          division.type !== DocumentNodeType.Article &&
          division.headlines !== undefined &&
          division.headlines.length === 1 &&
          !division.lines?.length &&
          !isBlankText(text)
        ) {
          // The current division has no subtitle (and can have one)
          // => Add element as subtitle (second title of division).
          division.headlines.push(element);
          continue iterLines;
        }

        // Otherwise, assume that element is a line or an alinea.
        if (division.lines === undefined) {
          division.lines = [];
        }
        // TODO: Remove (optional) "pastille" (aka alinea number) from alinea.
        division.lines.push(element);
        continue iterLines;
      }
    }

    // The document is not a bill (or the text of the element is empty).
    const currentNode = breadcrumb.node;
    if (currentNode.lines === undefined) {
      currentNode.lines = [];
    }
    currentNode.lines.push(element);
  }

  return breadcrumb;
}

/**
 * Convert an `html` element: `head` is ignored, `body` is handed to
 * `convertBody`, anything else (but blank texts) is reported with a warning.
 *
 * @param node The `html` element.
 * @param documents Documents found so far (may be extended).
 * @param breadcrumb Current position in the tree of documents & divisions.
 * @param options Passed down to `convertBody`.
 * @returns The breadcrumb after conversion.
 */
function convertHtml(node, documents, breadcrumb, options = {}) {
  console.assert(
    node.tagName === "html",
    `convertHtml: Unexpected node tag: ${node.tagName}`,
  );
  for (const child of node.children) {
    switch (child.type) {
      case HastType.Comment:
        warnUnexpectedNode(child, node);
        break;
      case HastType.Element:
        switch (child.tagName) {
          case "head":
            break;
          case "body":
            breadcrumb = convertBody(child, documents, breadcrumb, options);
            break;
          default:
            warnUnexpectedElement(child, node);
        }
        break;
      case HastType.Text:
        warnWhenNonEmptyText(child, node);
        break;
      default:
        assertNeverHastNode(child);
    }
  }
  return breadcrumb;
}

/**
 * Convert the root of an HTML page: the doctype is ignored, the `html`
 * element is handed to `convertHtml`, anything else (but blank texts) is
 * reported with a warning.
 *
 * @param node The root node of the page.
 * @param documents Documents found so far (may be extended).
 * @param breadcrumb Current position in the tree of documents & divisions.
 * @param options Passed down to `convertHtml`.
 * @returns The breadcrumb after conversion.
 */
function convertHtmlPage(node, documents, breadcrumb, options = {}) {
  for (const child of node.children) {
    switch (child.type) {
      case HastType.Comment:
        warnUnexpectedNode(child, node);
        break;
      case HastType.Doctype:
        break;
      case HastType.Element:
        if (child.tagName === "html") {
          breadcrumb = convertHtml(child, documents, breadcrumb, options);
        } else {
          warnUnexpectedElement(child, node);
        }
        break;
      case HastType.Text:
        warnWhenNonEmptyText(child, node);
        break;
      default:
        assertNeverHastNode(child);
    }
  }
  return breadcrumb;
}

/**
 * Tell whether a text is empty or made only of space characters.
 *
 * @param text The text to test.
 * @returns `true` when the text matches `BLANK_TEXT_REGEXP`.
 */
function isBlankText(text) {
  return BLANK_TEXT_REGEXP.test(text);
}

/**
 * Iterate over the smallest block elements contained in a node.
 *
 * `div` and `p` children are explored recursively; `br`, `hr` and `table`
 * children are yielded as is. When a non-block element or a non-blank text is
 * met before any block child, the node itself is yielded and the iteration
 * stops. When it is met after a block child, it is yielded as if it was a
 * block (a text is wrapped in a synthetic `span` element).
 *
 * @param node The element whose `children` are explored.
 * @yields Elements to be handled as lines.
 */
function* iterMinimalBlockElements(node) {
  let blockElementsEncountered = false;
  for (const child of node.children) {
    switch (child.type) {
      case HastType.Comment:
        // Ignore comment.
        break;
      case HastType.Element:
        switch (child.tagName) {
          case "br":
          case "hr":
          case "table":
            blockElementsEncountered = true;
            yield child;
            break;
          case "div":
          case "p":
            blockElementsEncountered = true;
            yield* iterMinimalBlockElements(child);
            break;
          default:
            // Element is not a block.
            if (!blockElementsEncountered) {
              // => Assume that the line is the parent block and halt
              // iterations.
              yield node;
              return;
            }
            // Handle the element as if it is a block element.
            yield child;
        }
        break;
      case HastType.Text:
        if (isBlankText(child.value)) {
          // Ignore empty text node.
          break;
        }
        // Text node is not empty.
        if (!blockElementsEncountered) {
          // Assume that the line is the parent block and halt iterations.
          yield node;
          return;
        }
        // Handle the text as if it is a block element.
        yield { type: HastType.Element, tagName: "span", children: [child] };
        break;
      default:
        assertNeverHastNode(child);
    }
  }
}

/**
 * Iterate over the simplified texts (see `simplifyText`) of the smallest
 * lines contained in a node.
 *
 * `div`, `p`, `table`, `tbody` and `tr` children are explored recursively.
 * As soon as another element or a non-blank text is met, the simplified text
 * of the whole node is yielded and the iteration of this node stops.
 *
 * @param node The element whose `children` are explored.
 * @yields Simplified texts of lines.
 */
function* iterMinimalLinesSimplified(node) {
  for (const child of node.children) {
    switch (child.type) {
      case HastType.Comment:
        // Ignore comment.
        break;
      case HastType.Element:
        switch (child.tagName) {
          case "div":
          case "p":
          case "table":
          case "tbody":
          case "tr":
            yield* iterMinimalLinesSimplified(child);
            break;
          default:
            // Element is not a block.
            // => Assume that the line is the parent block and halt
            // iterations.
            yield simplifyText(hastTotString(node));
            return;
        }
        break;
      case HastType.Text:
        if (!isBlankText(child.value)) {
          // Text node is not empty => Assume that the line is the parent
          // block and halt iterations.
          yield simplifyText(hastTotString(node));
          return;
        }
        // Ignore empty text node.
        break;
      default:
        assertNeverHastNode(child);
    }
  }
}

/**
 * Normalize a text so that it can be matched against the heading patterns:
 * accents are removed, the first "(nouveau)", "(Pour coordination)" and
 * "(Supprimé)"/"(Supprimés)" markers are dropped, `-,.…` are deleted, runs of
 * spaces are collapsed & trimmed, remaining spaces become `_` and the result
 * is upper-cased.
 *
 * @param text The text to simplify.
 * @returns The simplified text, for example `"ARTICLE_1ER"` for
 *   `"Article 1er (nouveau)"`.
 */
function simplifyText(text) {
  return (
    text
      .normalize("NFD")
      .replace(/[\u0300-\u036f]/g, "")
      .replace(/\(nouveau\)/, "")
      .replace(/\(Pour coordination\)/, "")
      // Accents have already been removed above, so the marker must be
      // matched in its unaccented form.
      .replace(/\(Supprimes?\)/, "")
      .replace(/[_\t\n\f\r\xa0]/g, " ")
      .replace(/[\-,.…]/g, "")
      .replace(/ {2,}/g, " ")
      .replace(/^\s+/, "") // Trim begin of line.
      .replace(/\s+$/, "") // Trim end of line.
      .replace(/ /g, "_")
      .toUpperCase()
  );
}

/**
 * Describe a parent node for a warning message.
 *
 * @param parent The parent node (an element or the root).
 * @returns The end of the warning message, or `undefined` when the type of
 *   the parent is unexpected (after having called `assertNeverHastNode`).
 */
function describeParent(parent) {
  switch (parent.type) {
    case HastType.Element:
      return `in node of tag: ${parent.tagName}`;
    case HastType.Root:
      return `in node of type: ${parent.type}`;
    default:
      assertNeverHastNode(parent);
      return undefined;
  }
}

/**
 * Warn that an element was not expected in its parent.
 *
 * @param node The unexpected element.
 * @param parent The node containing the element.
 */
function warnUnexpectedElement(node, parent) {
  const where = describeParent(parent);
  if (where !== undefined) {
    console.warn(`Unexpected element of tag "${node.tagName}" ${where}`);
  }
}

/**
 * Warn that a node was not expected in its parent.
 *
 * @param node The unexpected node.
 * @param parent The node containing the unexpected node.
 */
function warnUnexpectedNode(node, parent) {
  const where = describeParent(parent);
  if (where !== undefined) {
    console.warn(`Unexpected node of type "${node.type}" ${where}`);
  }
}

/**
 * Warn that a text node was not expected in its parent, unless the text is
 * empty or made only of space characters.
 *
 * @param node The text node.
 * @param parent The node containing the text node.
 */
function warnWhenNonEmptyText(node, parent) {
  if (isBlankText(node.value)) {
    return;
  }
  const where = describeParent(parent);
  if (where !== undefined) {
    console.warn(`Unexpected non empty text "${node.value}" ${where}`);
  }
}