UNPKG

@tricoteuses/senat

Version:

Handle French Sénat's open data

240 lines (239 loc) 9.4 kB
import { JSDOM } from "jsdom";
import { AKN_IDENTIFICATION_STRUCTURE_REGEXP } from "../scripts/datautil";
import { DivisionType } from "../types/texte";

// Matches the "EXPOSÉ DES MOTIFS" title once the text has been upper-cased.
// `.{1,2}` tolerates whatever one or two characters stand for the accented "É".
// The regexp has no `g`/`y` flag, so sharing one instance is stateless.
const EXPOSE_DES_MOTIFS_REGEXP = new RegExp("EXPOS.{1,2}[\\n\\s]DES[\\n\\s]MOTIFS");

/**
 * Extracts the trimmed text and inner HTML of a heading node.
 *
 * @param {Element} headingNode - A `num` or `heading` element.
 * @returns {{text: (string|null), html: (string|null)}}
 */
function buildHeading(headingNode) {
  return {
    text: headingNode.textContent?.trim() ?? null,
    html: headingNode.innerHTML?.trim() ?? null,
  };
}

/**
 * Builds the division record of a structural node (tome, part, ..., article).
 *
 * The headings are, in order, the first `num` descendant and the first
 * `heading` descendant, when present. When the `data:mention` attribute
 * contains "(nouveau)", that marker is appended to the first heading.
 * Article divisions additionally get an empty `alineas` array.
 *
 * @param {Element} node - The division element.
 * @param {number} index - Position of the division in the flattened list.
 * @returns {object} The division (`index`, `eId`, `tag`, `level`, `headings`
 *   and, for articles, `alineas`).
 */
function buildDivision(node, index) {
  const tag = node.nodeName;
  const mention = node.getAttribute("data:mention") ?? null;
  const headings = [node.querySelector("num"), node.querySelector("heading")]
    .filter((headingNode) => headingNode !== null)
    .map((headingNode) => buildHeading(headingNode));

  if (mention?.includes("(nouveau)") && headings.length > 0) {
    const [firstHeading] = headings;
    headings[0] = {
      text: firstHeading.text != null ? `${firstHeading.text} (nouveau)` : "(nouveau)",
      html: firstHeading.html != null ? `${firstHeading.html} (nouveau)` : "(nouveau)",
    };
  }

  const division = {
    index,
    eId: node.getAttribute("eId"),
    tag,
    level: DivisionType[tag],
    headings,
  };
  if (tag === "article") {
    division.alineas = [];
  }
  return division;
}

/**
 * Builds an alinea record from an `alinea` element and one of its `content`
 * children.
 *
 * @param {Element} contentNode - The `content` child holding the text.
 * @param {Element} alineaNode - The enclosing `alinea` element.
 * @returns {object} The alinea (`eId`, `heading`, `text`, `html`, `pastille`).
 */
function buildAlinea(contentNode, alineaNode) {
  return {
    eId: alineaNode.getAttribute("eId"),
    heading: {
      text: alineaNode.querySelector("num")?.textContent ?? null,
    },
    text: contentNode.textContent?.trim() ?? null,
    html: contentNode.innerHTML?.trim() ?? null,
    pastille: alineaNode.getAttribute("data:pastille") ?? null,
  };
}

/**
 * Builds a placeholder article used to host alineas found outside any article.
 *
 * @param {number} index - Position of the division in the flattened list.
 * @returns {object} An article division with no identifier, headings or alineas.
 */
function buildEmptyArticle(index) {
  return {
    index,
    eId: "",
    tag: "article",
    level: DivisionType["article"],
    headings: [],
    alineas: [],
  };
}

/**
 * Flattens the body of a texte into an ordered list of divisions, attaching
 * each alinea to the most recently encountered article.
 *
 * @param {Element} texteContentRoot - Root element to walk (the `body`).
 * @returns {object[]} Divisions in document order.
 */
function splitTexte(texteContentRoot) {
  const divisions = [];
  // Keyed by the division object rather than by eId: eIds may be missing or
  // duplicated, which would attribute one article's mention to another.
  const mentionByArticle = new Map();
  let divisionIndex = 0;
  let lastArticle = null;

  const visit = (node) => {
    // `content` nodes are consumed through their parent alinea, never walked.
    if (node.nodeName === "content") {
      return;
    }
    switch (node.nodeName) {
      case "tome":
      case "part":
      case "book":
      case "title":
      case "subtitle":
      case "chapter":
      case "section":
      case "subsection":
      case "paragraph":
        divisions.push(buildDivision(node, divisionIndex++));
        break;
      case "article": {
        const article = buildDivision(node, divisionIndex++);
        const mention = node.getAttribute("data:mention");
        if (mention) {
          mentionByArticle.set(article, mention);
        }
        divisions.push(article);
        lastArticle = article;
        break;
      }
      case "alinea":
        // Find direct content children programmatically
        // because `:scope` selector does not work
        // https://github.com/jsdom/jsdom/issues/2998
        for (const alineaChildNode of Array.from(node.childNodes)) {
          if (alineaChildNode.nodeName !== "content") {
            continue;
          }
          // Hypothesis: alineas should always be enclosed in articles.
          // When that does not hold, host them in a placeholder article.
          if (!lastArticle) {
            lastArticle = buildEmptyArticle(divisionIndex++);
            divisions.push(lastArticle);
          }
          lastArticle.alineas.push(buildAlinea(alineaChildNode, node));
        }
        break;
      default:
        break;
    }
    if (node.hasChildNodes()) {
      node.childNodes.forEach((childNode) => visit(childNode));
    }
  };
  visit(texteContentRoot);

  // An article flagged "(Supprimé)" that has no alinea gets a synthetic one
  // carrying that marker.
  for (const [article, mention] of mentionByArticle) {
    if (mention.includes("(Supprimé)") && article.alineas.length === 0) {
      article.alineas.push({
        eId: "",
        heading: { text: null },
        text: "(Supprimé)",
        html: "(Supprimé)",
        pastille: null,
      });
    }
  }
  return divisions;
}

/**
 * Extracts the metadata and the divisions of a texte from its parsed XML
 * document.
 *
 * Type, session, number and version come from the named groups of
 * `AKN_IDENTIFICATION_STRUCTURE_REGEXP` applied to the `FRBRExpression FRBRuri`
 * value; every field that cannot be found is `null`.
 *
 * @param {Document} document - Parsed XML document of the texte.
 * @returns {object} The texte (`titre`, `titre_court`, `signet_dossier`,
 *   `url_dossier_senat`, `url_dossier_assemblee`, `type`, `session`, `numero`,
 *   `date_presentation`, `date_depot`, `date_publication_xml`, `version`,
 *   `divisions`).
 */
export function transformTexte(document) {
  const metaElement = document.querySelector("meta");
  const preambleElement = document.querySelector("preamble");
  const bodyElement = document.querySelector("body");

  const metaAttribute = (selector, attribute) =>
    metaElement?.querySelector(selector)?.getAttribute(attribute);
  const toDate = (value) => (value ? new Date(value) : null);

  const identification = metaAttribute("FRBRExpression FRBRuri", "value") ?? "";
  const identificationParts = AKN_IDENTIFICATION_STRUCTURE_REGEXP.exec(identification)?.groups;
  const sessionYears = identificationParts?.["session"]?.split("-") || null;
  const numTexte = identificationParts?.["numTexte"];

  return {
    titre: preambleElement?.querySelector("docTitle")?.textContent || null,
    titre_court: metaAttribute("FRBRalias[name='intitule-court']", "value") || null,
    signet_dossier: metaAttribute("FRBRalias[name='signet-dossier-legislatif-senat']", "value") || null,
    url_dossier_senat: metaAttribute("FRBRalias[name='url-senat']", "value") || null,
    url_dossier_assemblee: metaAttribute("FRBRalias[name='url-AN']", "value") || null,
    type: identificationParts?.["type"] || null,
    // Only the first year of a "YYYY-YYYY" session is kept.
    session: sessionYears && sessionYears.length > 0 ? sessionYears[0] : null,
    // Explicit radix so the number is always read as decimal.
    numero: numTexte ? Number.parseInt(numTexte, 10) : null,
    date_presentation: toDate(metaAttribute("FRBRdate[name='#presentation']", "date")),
    date_depot: toDate(metaAttribute("FRBRdate[name='#depot']", "date")),
    date_publication_xml: toDate(metaAttribute("FRBRdate[name='#publication-xml']", "date")),
    version: identificationParts?.["version"] || null,
    divisions: bodyElement ? splitTexte(bodyElement) : [],
  };
}

/**
 * Extracts the "exposé des motifs" from a parsed HTML document.
 *
 * Returns the content of the first `section` whose first or second paragraph
 * is the "EXPOSÉ DES MOTIFS" title. The first paragraph (and the second one
 * when it is the title) is removed from the document before the content is
 * read, so the passed document is mutated.
 *
 * @param {Document} document - Parsed HTML document.
 * @returns {{text: (string|null), html: (string|null)}|null} The section
 *   content, or `null` when no section matches.
 */
export function transformExposeDesMotifs(document) {
  const isTitle = (paragraph) => {
    const content = paragraph?.textContent;
    return Boolean(content) && EXPOSE_DES_MOTIFS_REGEXP.test(content.toUpperCase());
  };

  for (const sectionElement of document.querySelectorAll("section")) {
    const firstParagraph = sectionElement.querySelector("p:first-of-type");
    if (!firstParagraph) {
      continue;
    }
    if (!isTitle(firstParagraph)) {
      const secondParagraph = sectionElement.querySelector("p:nth-of-type(2)");
      if (!isTitle(secondParagraph)) {
        continue;
      }
      secondParagraph.remove();
    }
    firstParagraph.remove();
    return {
      text: sectionElement.textContent?.trim() ?? null,
      html: sectionElement.innerHTML?.trim() ?? null,
    };
  }
  return null;
}

/**
 * Applies `transform` to the document of a JSDOM instance and always closes
 * the window afterwards. The transforms in this module return plain data
 * (strings, numbers, dates), so nothing references the DOM once it is closed.
 *
 * @param {JSDOM} dom - The JSDOM instance to read and then shut down.
 * @param {function(Document): *} transform - Extraction function.
 * @returns {*} Whatever `transform` returns.
 */
function transformAndClose(dom, transform) {
  try {
    return transform(dom.window.document);
  } finally {
    dom.window.close();
  }
}

/**
 * Parses the XML source of a texte.
 *
 * @param {string} texteXml - XML source.
 * @returns {object|null} The texte, or `null` (after logging) on failure.
 */
export function parseTexte(texteXml) {
  try {
    const dom = new JSDOM(texteXml, { contentType: "text/xml" });
    return transformAndClose(dom, transformTexte);
  } catch (error) {
    console.error(`Could not parse texte with error ${error}`);
  }
  return null;
}

// Prevent from memory leak
// https://github.com/jsdom/jsdom/issues/2583#issuecomment-559520814
/**
 * Parses a texte from an XML file.
 *
 * @param {string} xmlFilePath - Path of the XML file.
 * @returns {Promise<object|null>} The texte, or `null` (after logging) on failure.
 */
export async function parseTexteFromFile(xmlFilePath) {
  try {
    const dom = await JSDOM.fromFile(xmlFilePath, { contentType: "text/xml" });
    return transformAndClose(dom, transformTexte);
  } catch (error) {
    console.error(`Could not parse texte with error ${error}`);
  }
  return null;
}

/**
 * Parses the HTML source of an exposé des motifs.
 *
 * @param {string} exposeDesMotifsHtml - HTML source.
 * @returns {{text: (string|null), html: (string|null)}|null} The exposé, or
 *   `null` when none is found or (after logging) on failure.
 */
export function parseExposeDesMotifs(exposeDesMotifsHtml) {
  try {
    const dom = new JSDOM(exposeDesMotifsHtml, { contentType: "text/html" });
    return transformAndClose(dom, transformExposeDesMotifs);
  } catch (error) {
    console.error(`Could not parse exposé des motifs with error ${error}`);
  }
  return null;
}

// Prevent from memory leak
// https://github.com/jsdom/jsdom/issues/2583#issuecomment-559520814
/**
 * Parses an exposé des motifs from an HTML file.
 *
 * @param {string} htmlFilePath - Path of the HTML file.
 * @returns {Promise<{text: (string|null), html: (string|null)}|null>} The
 *   exposé, or `null` when none is found or (after logging) on failure.
 */
export async function parseExposeDesMotifsFromFile(htmlFilePath) {
  try {
    const dom = await JSDOM.fromFile(htmlFilePath, { contentType: "text/html" });
    return transformAndClose(dom, transformExposeDesMotifs);
  } catch (error) {
    console.error(`Could not parse exposé des motifs with error ${error}`);
  }
  return null;
}