@tricoteuses/senat
Version:
Handle French Sénat's open data
240 lines (239 loc) • 9.4 kB
JavaScript
import { JSDOM } from "jsdom";
import { AKN_IDENTIFICATION_STRUCTURE_REGEXP } from "../scripts/datautil";
import { DivisionType, } from "../types/texte";
/**
 * Builds the flat record describing one structural division of a texte.
 *
 * The first `num` and first `heading` elements found under `node` become the
 * entries of `headings` (in that order, each with trimmed `text` and `html`).
 * When the `data:mention` attribute contains "(nouveau)", that marker is
 * appended to the first heading. Articles additionally get an empty
 * `alineas` array.
 *
 * @param {Element} node - Division element (its `nodeName` becomes `tag`).
 * @param {number} index - Position of the division in the flattened list.
 * @returns {object} `{ index, eId, tag, level, headings }`, plus `alineas`
 *   when `tag` is "article".
 */
function buildDivision(node, index) {
  const NEW_MARKER = "(nouveau)";
  const describe = (element) => ({
    text: element.textContent?.trim() ?? null,
    html: element.innerHTML?.trim() ?? null,
  });
  const withMarker = (value) => (value == null ? NEW_MARKER : `${value} ${NEW_MARKER}`);

  const eId = node.getAttribute("eId");
  const tag = node.nodeName;
  const level = DivisionType[tag];
  const numElement = node.querySelector("num");
  const headingElement = node.querySelector("heading");
  const mention = node.getAttribute("data:mention") ?? null;

  const headings = [];
  for (const element of [numElement, headingElement]) {
    if (element) {
      headings.push(describe(element));
    }
  }

  const isFlaggedNew = mention !== null && mention.includes(NEW_MARKER);
  if (isFlaggedNew && headings.length > 0) {
    const [first] = headings;
    headings[0] = {
      text: withMarker(first.text),
      html: withMarker(first.html),
    };
  }

  const base = { index, eId, tag, level, headings };
  // Only articles carry alineas; other divisions do not get the property.
  return tag === "article" ? { ...base, alineas: [] } : base;
}
/**
 * Builds the flat record for one alinea from its `content` child.
 *
 * Identification data (`eId`, the `num` text used as heading, the
 * `data:pastille` attribute) is read from the alinea element; the trimmed
 * text and HTML are read from the content element. The heading text is kept
 * untrimmed.
 *
 * @param {Element} contentNode - The `content` element holding the alinea body.
 * @param {Element} alineaNode - The enclosing `alinea` element.
 * @returns {{eId: (string|null), heading: {text: (string|null)}, text: (string|null), html: (string|null), pastille: (string|null)}}
 */
function buildAlinea(contentNode, alineaNode) {
  const identifier = alineaNode.getAttribute("eId");
  const numElement = alineaNode.querySelector("num");
  const rawPastille = alineaNode.getAttribute("data:pastille");
  const rawText = contentNode.textContent;
  const rawHtml = contentNode.innerHTML;
  return {
    eId: identifier,
    heading: { text: numElement?.textContent ?? null },
    text: rawText?.trim() ?? null,
    html: rawHtml?.trim() ?? null,
    pastille: rawPastille ?? null,
  };
}
/**
 * Creates a placeholder article division with no identifier, no headings
 * and no alineas.
 *
 * @param {number} index - Position of the placeholder in the flattened list.
 * @returns {object} Article division with an empty `eId` and fresh empty
 *   `headings` / `alineas` arrays.
 */
function buildEmptyArticle(index) {
  const tag = "article";
  const placeholder = {
    index,
    eId: "",
    tag,
    level: DivisionType[tag],
    headings: [],
    alineas: [],
  };
  return placeholder;
}
/**
 * Flattens the element tree under `texteContentRoot` into an ordered list of
 * divisions (depth-first, document order).
 *
 * - `tome`, `part`, `book`, `title`, `subtitle`, `chapter`, `section`,
 *   `subsection`, `paragraph` and `article` elements each yield one division.
 * - Every direct `content` child of an `alinea` element is attached, as an
 *   alinea, to the most recently encountered article. If no article has been
 *   seen yet, an empty placeholder article is created to hold it.
 * - `content` elements are never descended into.
 * - An article whose `data:mention` contains "(Supprimé)" and that ended up
 *   with no alinea receives a single "(Supprimé)" placeholder alinea.
 *
 * @param {Node} texteContentRoot - Root node to walk (e.g. the AKN `body`).
 * @returns {object[]} Divisions in document order, indexed from 0.
 */
function splitTexte(texteContentRoot) {
  const DELETED_MARKER = "(Supprimé)";
  const divisions = [];
  let divisionIndex = 0;
  // Keyed by the division object itself rather than by eId: eIds can be
  // missing (null) or duplicated, which would make unrelated articles share
  // the same mention.
  const mentionByArticle = new Map();
  // Most recently pushed article; avoids rescanning `divisions` per alinea.
  let lastArticle = null;

  const iter = (node) => {
    if (node.nodeName === "content") {
      return;
    }
    switch (node.nodeName) {
      case "tome":
      case "part":
      case "book":
      case "title":
      case "subtitle":
      case "chapter":
      case "section":
      case "subsection":
      case "paragraph":
        divisions.push(buildDivision(node, divisionIndex++));
        break;
      case "article": {
        const division = buildDivision(node, divisionIndex++);
        const mention = node.getAttribute("data:mention");
        if (mention) {
          mentionByArticle.set(division, mention);
        }
        divisions.push(division);
        lastArticle = division;
        break;
      }
      case "alinea": {
        // Find direct content children programmatically
        // because `:scope` selector does not work
        // https://github.com/jsdom/jsdom/issues/2998
        const contentNodes = Array.from(node.childNodes).filter(
          (childNode) => childNode.nodeName === "content",
        );
        for (const contentNode of contentNodes) {
          // Hypothesis: alineas should always be enclosed in articles
          if (!lastArticle) {
            lastArticle = buildEmptyArticle(divisionIndex++);
            divisions.push(lastArticle);
          }
          lastArticle.alineas.push(buildAlinea(contentNode, node));
        }
        break;
      }
      default:
        break;
    }
    if (node.hasChildNodes()) {
      node.childNodes.forEach((childNode) => iter(childNode));
    }
  };
  iter(texteContentRoot);

  // Deleted articles have no content in the source: give them an explicit
  // placeholder alinea so consumers can still render something.
  for (const [article, mention] of mentionByArticle) {
    if (mention.includes(DELETED_MARKER) && article.alineas.length === 0) {
      article.alineas.push({
        eId: "",
        heading: { text: null },
        text: DELETED_MARKER,
        html: DELETED_MARKER,
        pastille: null,
      });
    }
  }
  return divisions;
}
/**
 * Extracts the metadata and the flattened divisions of a texte from its
 * parsed XML document.
 *
 * Metadata is read from the `meta` element (FRBR aliases, dates and the
 * FRBRExpression URI, which is decomposed with
 * `AKN_IDENTIFICATION_STRUCTURE_REGEXP`), the title from `preamble docTitle`,
 * and the divisions from `body` via `splitTexte`.
 *
 * Missing or empty values are reported as `null`. A `numTexte` that is not a
 * base-10 integer and date attributes that cannot be parsed are also reported
 * as `null` rather than as `NaN` / `Invalid Date`.
 *
 * @param {Document} document - Parsed document of the texte.
 * @returns {object} Flat texte record (`titre`, `titre_court`,
 *   `signet_dossier`, `url_dossier_senat`, `url_dossier_assemblee`, `type`,
 *   `session`, `numero`, `date_presentation`, `date_depot`,
 *   `date_publication_xml`, `version`, `divisions`).
 */
export function transformTexte(document) {
  const metaElement = document.querySelector("meta");
  const preambleElement = document.querySelector("preamble");
  const bodyElement = document.querySelector("body");

  // Reads an attribute of the first element matching `selector` under `meta`;
  // absent elements, absent attributes and empty strings all yield null.
  const readMeta = (selector, attribute) => metaElement?.querySelector(selector)?.getAttribute(attribute) || null;

  // Converts a date attribute to a Date, or null when absent or unparseable.
  const toDate = (value) => {
    if (!value) {
      return null;
    }
    const date = new Date(value);
    return Number.isNaN(date.getTime()) ? null : date;
  };

  // Always parse in base 10 and never leak NaN to callers.
  const toInteger = (value) => {
    if (!value) {
      return null;
    }
    const parsed = Number.parseInt(value, 10);
    return Number.isNaN(parsed) ? null : parsed;
  };

  const identification = readMeta("FRBRExpression FRBRuri", "value") ?? "";
  const identificationParts = AKN_IDENTIFICATION_STRUCTURE_REGEXP.exec(identification)?.groups;

  return {
    titre: preambleElement?.querySelector("docTitle")?.textContent || null,
    titre_court: readMeta("FRBRalias[name='intitule-court']", "value"),
    signet_dossier: readMeta("FRBRalias[name='signet-dossier-legislatif-senat']", "value"),
    url_dossier_senat: readMeta("FRBRalias[name='url-senat']", "value"),
    url_dossier_assemblee: readMeta("FRBRalias[name='url-AN']", "value"),
    type: identificationParts?.["type"] || null,
    // Only the first year of a "YYYY-YYYY" session is kept.
    session: identificationParts?.["session"]?.split("-")[0] ?? null,
    numero: toInteger(identificationParts?.["numTexte"]),
    date_presentation: toDate(readMeta("FRBRdate[name='#presentation']", "date")),
    date_depot: toDate(readMeta("FRBRdate[name='#depot']", "date")),
    date_publication_xml: toDate(readMeta("FRBRdate[name='#publication-xml']", "date")),
    version: identificationParts?.["version"] || null,
    divisions: bodyElement ? splitTexte(bodyElement) : [],
  };
}
/**
 * Extracts the "exposé des motifs" from a parsed HTML document.
 *
 * Sections are scanned in order. A section is selected when its first or its
 * second paragraph contains the "EXPOSÉ DES MOTIFS" title (case-insensitive).
 * The title paragraph(s) are removed from the section — which mutates the
 * given document — and the remaining trimmed text and HTML are returned.
 * When the title is found in the second paragraph, both the first and the
 * second paragraphs are removed.
 *
 * @param {Document} document - Parsed HTML document.
 * @returns {{text: (string|null), html: (string|null)}|null} Content of the
 *   first matching section, or null when no section matches.
 */
export function transformExposeDesMotifs(document) {
  const sections = document.querySelectorAll("section");
  const titlePattern = new RegExp("EXPOS.{1,2}[\\n\\s]DES[\\n\\s]MOTIFS");
  const isTitle = (content) => Boolean(content) && titlePattern.test(content.toUpperCase());

  for (const section of sections) {
    const leadParagraph = section.querySelector("p:first-of-type");
    const nextParagraph = section.querySelector("p:nth-of-type(2)");
    if (!leadParagraph) {
      continue;
    }
    const leadText = leadParagraph.textContent;
    const nextText = nextParagraph?.textContent;

    if (!isTitle(leadText)) {
      if (!isTitle(nextText)) {
        continue;
      }
      // Title sits in the second paragraph: drop it along with the first one.
      nextParagraph.remove();
    }
    leadParagraph.remove();

    return {
      text: section.textContent?.trim() ?? null,
      html: section.innerHTML?.trim() ?? null,
    };
  }
  return null;
}
/**
 * Parses the XML source of a texte and converts it with `transformTexte`.
 *
 * The jsdom window is closed once the result has been extracted, so the DOM
 * is released even when the transformation throws.
 *
 * @param {string} texteXml - XML source of the texte.
 * @returns {object|null} Result of `transformTexte`, or null when parsing or
 *   transformation fails (the error is logged with `console.error`).
 */
export function parseTexte(texteXml) {
  let dom;
  try {
    dom = new JSDOM(texteXml, {
      contentType: "text/xml",
    });
    return transformTexte(dom.window.document);
  }
  catch (error) {
    console.error(`Could not parse texte with error ${error}`);
    return null;
  }
  finally {
    // `dom` is undefined when the JSDOM constructor itself threw.
    dom?.window.close();
  }
}
// Prevent from memory leak
// https://github.com/jsdom/jsdom/issues/2583#issuecomment-559520814
/**
 * Loads the XML file of a texte with `JSDOM.fromFile` and converts it with
 * `transformTexte`.
 *
 * The jsdom window is closed once the result has been extracted, so the DOM
 * is released even when the transformation throws.
 *
 * @param {string} xmlFilePath - Path of the XML file to read.
 * @returns {Promise<object|null>} Result of `transformTexte`, or null when
 *   loading, parsing or transformation fails (the error is logged with
 *   `console.error`).
 */
export async function parseTexteFromFile(xmlFilePath) {
  let dom;
  try {
    dom = await JSDOM.fromFile(xmlFilePath, { contentType: "text/xml" });
    return transformTexte(dom.window.document);
  }
  catch (error) {
    console.error(`Could not parse texte with error ${error}`);
    return null;
  }
  finally {
    // `dom` is undefined when the file could not be loaded.
    dom?.window.close();
  }
}
/**
 * Parses the HTML source of an "exposé des motifs" and extracts its content
 * with `transformExposeDesMotifs`.
 *
 * The jsdom window is closed once the result has been extracted, so the DOM
 * is released even when the transformation throws.
 *
 * @param {string} exposeDesMotifsHtml - HTML source to parse.
 * @returns {object|null} Result of `transformExposeDesMotifs`, or null when
 *   parsing or transformation fails (the error is logged with
 *   `console.error`).
 */
export function parseExposeDesMotifs(exposeDesMotifsHtml) {
  let dom;
  try {
    dom = new JSDOM(exposeDesMotifsHtml, {
      contentType: "text/html",
    });
    return transformExposeDesMotifs(dom.window.document);
  }
  catch (error) {
    console.error(`Could not parse exposé des motifs with error ${error}`);
    return null;
  }
  finally {
    // `dom` is undefined when the JSDOM constructor itself threw.
    dom?.window.close();
  }
}
// Prevent from memory leak
// https://github.com/jsdom/jsdom/issues/2583#issuecomment-559520814
/**
 * Loads the HTML file of an "exposé des motifs" with `JSDOM.fromFile` and
 * extracts its content with `transformExposeDesMotifs`.
 *
 * The jsdom window is closed once the result has been extracted, so the DOM
 * is released even when the transformation throws.
 *
 * @param {string} htmlFilePath - Path of the HTML file to read.
 * @returns {Promise<object|null>} Result of `transformExposeDesMotifs`, or
 *   null when loading, parsing or transformation fails (the error is logged
 *   with `console.error`).
 */
export async function parseExposeDesMotifsFromFile(htmlFilePath) {
  let dom;
  try {
    dom = await JSDOM.fromFile(htmlFilePath, { contentType: "text/html" });
    return transformExposeDesMotifs(dom.window.document);
  }
  catch (error) {
    console.error(`Could not parse exposé des motifs with error ${error}`);
    return null;
  }
  finally {
    // `dom` is undefined when the file could not be loaded.
    dom?.window.close();
  }
}