@tricoteuses/senat
Version:
Handle French Sénat's open data
257 lines (256 loc) • 9.55 kB
JavaScript
import { JSDOM } from "jsdom";
import { AKN_IDENTIFICATION_STRUCTURE_REGEXP, AKN_WORKFLOW_IDENTIFICATION_STRUCTURE_REGEXP } from "../scripts/datautil";
import { DivisionType, } from "../types/texte";
/**
 * Build the list of workflow steps declared under a `<meta>` element.
 *
 * Every element matching `workflow step` yields one entry. The step's `href`
 * attribute is matched against AKN_WORKFLOW_IDENTIFICATION_STRUCTURE_REGEXP
 * and its named groups (`type`, `session`, `numTexte`, `version`) populate
 * the entry; any group that is missing or empty becomes `null`.
 *
 * @param {Element} metaElement - Element whose descendants are searched for steps.
 * @returns {Array<object>} One `{ eId, date, type, session, numero, version, outcome }`
 *   object per step, in document order.
 */
function buildWorklow(metaElement) {
  return Array.from(metaElement.querySelectorAll("workflow step"), (step) => {
    const href = step.getAttribute("href") ?? "";
    const parts = AKN_WORKFLOW_IDENTIFICATION_STRUCTURE_REGEXP.exec(href)?.groups;
    const rawDate = step.getAttribute("date");
    return {
      eId: step.getAttribute("eId"),
      // A missing or empty `date` attribute is reported as null, not as an invalid Date.
      date: rawDate ? new Date(rawDate) : null,
      type: parts?.type || null,
      session: parts?.session || null,
      numero: parts?.numTexte || null,
      version: parts?.version || null,
      outcome: step.getAttribute("outcome"),
    };
  });
}
/**
 * Build the flat description of one structural division (tome, chapter,
 * article, ...).
 *
 * The headings are read from the division's own `<num>` and `<heading>`
 * elements, i.e. its DIRECT children only. A descendant lookup would pick up
 * the `<num>`/`<heading>` of the first nested division whenever the division
 * itself lacks one (for instance a chapter without `<num>` would inherit the
 * `<num>` of its first article).
 *
 * @param {Element} node - The division element.
 * @param {number} index - Position of the division in the flattened list.
 * @returns {object} `{ index, eId, tag, level, headings }`, plus an empty
 *   `alineas` array when the division is an `article`. `level` is the value
 *   found in `DivisionType` for the tag name.
 */
function buildDivision(node, index) {
  const tag = node.nodeName;
  // Direct children are located programmatically, as the `:scope` selector
  // does not work reliably in jsdom (https://github.com/jsdom/jsdom/issues/2998).
  const findDirectChild = (childName) =>
    Array.from(node.childNodes ?? []).find((child) => child.nodeName === childName) ?? null;
  const toHeading = (element) => ({
    text: element.textContent?.trim() ?? null,
    html: element.innerHTML?.trim() ?? null,
  });
  // Order matters: the number ("title") comes first, then the heading ("subtitle").
  const headings = [findDirectChild("num"), findDirectChild("heading")]
    .filter((element) => element !== null)
    .map(toHeading);
  const division = {
    index,
    eId: node.getAttribute("eId"),
    tag,
    level: DivisionType[tag],
    headings,
  };
  if (tag === "article") {
    // Only articles carry alineas; they are filled in later by the caller.
    division.alineas = [];
  }
  return division;
}
/**
 * Describe one alinea from its `<content>` child and the alinea element itself.
 *
 * @param {Element} contentNode - The content element; supplies the trimmed
 *   `text` and `html`.
 * @param {Element} alineaNode - The enclosing alinea; supplies `eId`, the
 *   `data:pastille` attribute and the text of its first `<num>` descendant
 *   (not trimmed).
 * @returns {object} `{ eId, heading: { text }, text, html, pastille }` where
 *   absent values are `null`.
 */
function buildAlinea(contentNode, alineaNode) {
  const alineaId = alineaNode.getAttribute("eId");
  const numElement = alineaNode.querySelector("num");
  return {
    eId: alineaId,
    heading: { text: numElement?.textContent ?? null },
    text: contentNode.textContent?.trim() ?? null,
    html: contentNode.innerHTML?.trim() ?? null,
    pastille: alineaNode.getAttribute("data:pastille") ?? null,
  };
}
/**
 * Create a placeholder article division with an empty `eId`, no headings and
 * no alineas.
 *
 * @param {number} index - Position of the placeholder in the flattened list.
 * @returns {object} A division object tagged `article`, whose `level` is the
 *   `DivisionType` entry for `article`.
 */
function buildEmptyArticle(index) {
  const tag = "article";
  return {
    index,
    eId: "",
    tag,
    level: DivisionType[tag],
    headings: [],
    alineas: [],
  };
}
/**
 * Walk the tree under `texteContentRoot` depth-first and return its
 * structural divisions as a flat, ordered list.
 *
 * - Each tome/part/book/title/subtitle/chapter/section/subsection/paragraph/
 *   article element becomes one division (via `buildDivision`).
 * - Each direct `<content>` child of an `<alinea>` becomes one alinea (via
 *   `buildAlinea`) appended to the most recently collected article. When no
 *   article has been collected yet, a placeholder article is created first.
 * - `<content>` elements are never descended into.
 *
 * @param {Node} texteContentRoot - Root node of the traversal.
 * @returns {Array<object>} Divisions in document order, indexed from 0.
 */
function flattenTexte(texteContentRoot) {
  const divisionTags = new Set([
    "tome",
    "part",
    "book",
    "title",
    "subtitle",
    "chapter",
    "section",
    "subsection",
    "paragraph",
    "article",
  ]);
  const divisions = [];
  let nextIndex = 0;

  const collectAlineaContents = (alineaNode) => {
    // Find direct content children programmatically
    // because the `:scope` selector does not work
    // https://github.com/jsdom/jsdom/issues/2998
    for (const child of Array.from(alineaNode.childNodes)) {
      if (child.nodeName !== "content") {
        continue;
      }
      // Hypothesis: alineas should always be enclosed in articles
      let enclosingArticle = divisions.findLast((division) => division.tag === "article");
      if (!enclosingArticle) {
        enclosingArticle = buildEmptyArticle(nextIndex++);
        divisions.push(enclosingArticle);
      }
      enclosingArticle.alineas.push(buildAlinea(child, alineaNode));
    }
  };

  const visit = (node) => {
    const name = node.nodeName;
    if (name === "content") {
      return;
    }
    if (divisionTags.has(name)) {
      divisions.push(buildDivision(node, nextIndex++));
    } else if (name === "alinea") {
      collectAlineaContents(node);
    }
    if (node.hasChildNodes()) {
      node.childNodes.forEach((childNode) => visit(childNode));
    }
  };

  visit(texteContentRoot);
  return divisions;
}
/**
 * Extract the metadata, workflow and flattened divisions of a texte from its
 * parsed XML document.
 *
 * Metadata is read from the `<meta>` element (FRBR aliases and dates, and the
 * `FRBRExpression FRBRuri` value matched against
 * AKN_IDENTIFICATION_STRUCTURE_REGEXP), the title from `<preamble> docTitle`
 * and the divisions from `<body>`. Every value that is missing, empty or
 * unparsable is reported as `null`; `workflow` and `divisions` default to
 * empty arrays.
 *
 * @param {Document} document - Parsed XML document of the texte.
 * @returns {object} `{ titre, titreCourt, signetDossier, urlDossierSenat,
 *   urlDossierAssemblee, type, session, numero, datePresentation, dateDepot,
 *   datePublicationXml, version, workflow, divisions }`.
 */
export function transformTexte(document) {
  const metaElement = document.querySelector("meta");
  const preambleElement = document.querySelector("preamble");
  const bodyElement = document.querySelector("body");

  // Value of the FRBRalias with the given name, or null when absent/empty.
  const readAlias = (name) =>
    metaElement?.querySelector(`FRBRalias[name='${name}']`)?.getAttribute("value") || null;
  // Date of the FRBRdate with the given name, or null when absent/empty.
  const readDate = (name) => {
    const rawDate = metaElement?.querySelector(`FRBRdate[name='${name}']`)?.getAttribute("date");
    return rawDate ? new Date(rawDate) : null;
  };

  const identification =
    metaElement?.querySelector("FRBRExpression FRBRuri")?.getAttribute("value") ?? "";
  const identificationParts = AKN_IDENTIFICATION_STRUCTURE_REGEXP.exec(identification)?.groups;

  // Explicit radix: the number is decimal. A non-numeric capture must yield
  // null rather than NaN.
  const parsedNumero = Number.parseInt(identificationParts?.numTexte ?? "", 10);

  return {
    titre: preambleElement?.querySelector("docTitle")?.textContent || null,
    titreCourt: readAlias("intitule-court"),
    signetDossier: readAlias("signet-dossier-legislatif-senat"),
    urlDossierSenat: readAlias("url-senat"),
    urlDossierAssemblee: readAlias("url-AN"),
    type: identificationParts?.type || null,
    // Only the part of the session before the first "-" is kept.
    session: identificationParts?.session?.split("-")[0] || null,
    numero: Number.isNaN(parsedNumero) ? null : parsedNumero,
    datePresentation: readDate("#presentation"),
    dateDepot: readDate("#depot"),
    datePublicationXml: readDate("#publication-xml"),
    version: identificationParts?.version || null,
    workflow: metaElement ? buildWorklow(metaElement) : [],
    divisions: bodyElement ? flattenTexte(bodyElement) : [],
  };
}
/**
 * Locate the "exposé des motifs" section of an HTML document and return its
 * content without the heading.
 *
 * The first `<section>` whose first or second paragraph reads
 * "EXPOSÉ DES MOTIFS" (case-insensitive) is selected. The heading words may be
 * separated by any run of whitespace, so a heading wrapped across several
 * indented source lines is still recognised. The first paragraph is removed
 * from the section, as is the second one when it is the heading, before the
 * content is captured. Note that this mutates the document.
 *
 * @param {Document} document - Parsed HTML document.
 * @returns {{text: (string|null), html: (string|null)}|null} Trimmed text and
 *   HTML of the matching section, or `null` when no section matches.
 */
export function transformExposeDesMotifs(document) {
  // `.{1,2}` stands for the accented "É", whatever its Unicode form.
  const headingPattern = /EXPOS.{1,2}\s+DES\s+MOTIFS/;
  const isHeading = (paragraph) => {
    const content = paragraph?.textContent;
    return Boolean(content) && headingPattern.test(content.toUpperCase());
  };

  for (const sectionElement of document.querySelectorAll("section")) {
    const firstParagraph = sectionElement.querySelector("p:first-of-type");
    if (!firstParagraph) {
      continue;
    }
    if (!isHeading(firstParagraph)) {
      const secondParagraph = sectionElement.querySelector("p:nth-of-type(2)");
      if (!isHeading(secondParagraph)) {
        continue;
      }
      // The heading is the second paragraph: drop it together with the
      // paragraph that precedes it (removed just below).
      secondParagraph.remove();
    }
    firstParagraph.remove();
    return {
      text: sectionElement.textContent?.trim() ?? null,
      html: sectionElement.innerHTML?.trim() ?? null,
    };
  }
  return null;
}
/**
 * Parse the XML source of a texte and transform it with `transformTexte`.
 *
 * Any error raised while parsing or transforming is logged with
 * `console.error` and swallowed.
 *
 * @param {string} texteXml - XML source, parsed with content type `text/xml`.
 * @returns {object|null} The transformed texte, or `null` when an error occurred.
 */
export function parseTexte(texteXml) {
  let texte = null;
  try {
    const dom = new JSDOM(texteXml, { contentType: "text/xml" });
    texte = transformTexte(dom.window.document);
  } catch (error) {
    console.error(`Could not parse texte with error ${error}`);
  }
  return texte;
}
/**
 * Load an XML file with `JSDOM.fromFile` (content type `text/xml`) and
 * transform it with `transformTexte`.
 *
 * Any error raised while loading or transforming is logged with
 * `console.error` and swallowed.
 *
 * NOTE(review): the original comment says this variant prevents a memory leak
 * and points to https://github.com/jsdom/jsdom/issues/2583#issuecomment-559520814
 * — the rationale is not visible in this file; confirm against that thread.
 *
 * @param {string} xmlFilePath - Path of the XML file to read.
 * @returns {Promise<object|null>} The transformed texte, or `null` when an
 *   error occurred.
 */
export async function parseTexteFromFile(xmlFilePath) {
  try {
    const dom = await JSDOM.fromFile(xmlFilePath, { contentType: "text/xml" });
    return transformTexte(dom.window.document);
  } catch (error) {
    console.error(`Could not parse texte with error ${error}`);
    return null;
  }
}
/**
 * Parse the HTML source of an "exposé des motifs" and transform it with
 * `transformExposeDesMotifs`.
 *
 * Any error raised while parsing or transforming is logged with
 * `console.error` and swallowed.
 *
 * @param {string} exposeDesMotifsHtml - HTML source, parsed with content type
 *   `text/html`.
 * @returns {object|null} The result of `transformExposeDesMotifs`, or `null`
 *   when an error occurred.
 */
export function parseExposeDesMotifs(exposeDesMotifsHtml) {
  let exposeDesMotifs = null;
  try {
    const dom = new JSDOM(exposeDesMotifsHtml, { contentType: "text/html" });
    exposeDesMotifs = transformExposeDesMotifs(dom.window.document);
  } catch (error) {
    console.error(`Could not parse exposé des motifs with error ${error}`);
  }
  return exposeDesMotifs;
}
/**
 * Load an HTML file with `JSDOM.fromFile` (content type `text/html`) and
 * transform it with `transformExposeDesMotifs`.
 *
 * Any error raised while loading or transforming is logged with
 * `console.error` and swallowed.
 *
 * NOTE(review): the original comment says this variant prevents a memory leak
 * and points to https://github.com/jsdom/jsdom/issues/2583#issuecomment-559520814
 * — the rationale is not visible in this file; confirm against that thread.
 *
 * @param {string} htmlFilePath - Path of the HTML file to read.
 * @returns {Promise<object|null>} The result of `transformExposeDesMotifs`,
 *   or `null` when an error occurred.
 */
export async function parseExposeDesMotifsFromFile(htmlFilePath) {
  try {
    const dom = await JSDOM.fromFile(htmlFilePath, { contentType: "text/html" });
    return transformExposeDesMotifs(dom.window.document);
  } catch (error) {
    console.error(`Could not parse exposé des motifs with error ${error}`);
    return null;
  }
}