// @tricoteuses/senat — Handle French Sénat's open data
import { JSDOM } from "jsdom";
import fs from "fs-extra";
import path from "path";
import { DateTime } from "luxon";
/**
 * Format an ISO 8601 date string as a French long date using the
 * "d MMMM yyyy" pattern.
 *
 * @param {string|null|undefined} isoDate - Raw value of a `date` attribute.
 * @returns {string|null} The formatted date, or null when the value is
 *   missing, empty, or cannot be parsed.
 */
function formatFrenchDate(isoDate) {
  if (!isoDate) {
    return null;
  }
  const parsed = DateTime.fromISO(isoDate);
  // Luxon does not throw on bad input: formatting an invalid DateTime yields
  // the literal string "Invalid DateTime", which must not reach the output.
  return parsed.isValid ? parsed.setLocale("fr").toFormat("d MMMM yyyy") : null;
}

/**
 * Collect the header metadata of a Sénat bill from its parsed XML document.
 *
 * Every field is null when the corresponding element or attribute is absent.
 *
 * @param {Document} xmlDoc - Parsed XML document (only `querySelector` and
 *   `querySelectorAll` are used).
 * @returns {{number: string|null, session: string|null, date: string|null,
 *   type: string|null, authors: string|null, title: string|null,
 *   commission: string|null}} The extracted metadata.
 */
export function extractMetadata(xmlDoc) {
  const metadata = {
    number: null,
    session: null,
    date: null,
    type: null,
    authors: null,
    title: xmlDoc.querySelector("docTitle")?.textContent?.trim() || null,
    commission: null,
  };

  // Number: trailing digits of the "signet-dossier-legislatif-senat" alias.
  const aliasValue = xmlDoc
    .querySelector('FRBRalias[name="signet-dossier-legislatif-senat"]')
    ?.getAttribute("value");
  metadata.number = aliasValue?.match(/\d+$/)?.[0] ?? null;

  // Session: first "YYYY-YYYY" pair found in the expression URI.
  const sessionUri = xmlDoc.querySelector("FRBRExpression > FRBRuri")?.getAttribute("value");
  metadata.session = sessionUri?.match(/\d{4}-\d{4}/)?.[0] ?? null;

  // Date: prefer the "#depot" date; fall back to the "#presentation" date when
  // the former is missing or unparsable.
  metadata.date =
    formatFrenchDate(xmlDoc.querySelector('FRBRdate[name="#depot"]')?.getAttribute("date")) ??
    formatFrenchDate(xmlDoc.querySelector('FRBRdate[name="#presentation"]')?.getAttribute("date"));

  // Type: only the two known codes are mapped; anything else stays null.
  const typeCode = xmlDoc.querySelector("bill")?.getAttribute("name");
  if (typeCode === "ppl") {
    metadata.type = "PROPOSITION DE LOI";
  } else if (typeCode === "pjl") {
    metadata.type = "PROJET DE LOI";
  }

  // Authors: the FRBRauthor href ("#id") points at a TLCPerson by eId.
  const authorRef = xmlDoc
    .querySelector('FRBRWork > FRBRauthor[as="#auteur"]')
    ?.getAttribute("href");
  if (authorRef) {
    const authorId = authorRef.replace(/^#/, "");
    // Compare the attribute value directly instead of splicing the id into a
    // selector string: an id containing `"` or `]` would otherwise make
    // querySelector throw a syntax error.
    const authorPerson = Array.from(xmlDoc.querySelectorAll("TLCPerson")).find(
      (person) => person.getAttribute("eId") === authorId,
    );
    const showAs = authorPerson?.getAttribute("showAs");
    if (showAs) {
      metadata.authors = showAs.replace(/, Sénateurs$/, ", Sénateurs et Sénatrices");
    }
  }

  // Commission: the "commission-senat" organization if present, otherwise the
  // first "commission-*" organization whose eId does not contain "assemblee".
  const commissionNode =
    xmlDoc.querySelector('TLCOrganization[eId="commission-senat"]') ||
    xmlDoc.querySelector('TLCOrganization[eId^="commission-"]:not([eId*="assemblee"])');
  if (commissionNode) {
    metadata.commission = commissionNode.getAttribute("showAs");
  }

  return metadata;
}
/**
 * Escape the characters that are significant in HTML markup so that a value
 * can be interpolated into an HTML string as plain text.
 *
 * @param {*} text - Value to escape; it is converted with `String()`.
 * @returns {string} The escaped text.
 */
function escapeHtml(text) {
  return String(text)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}

/**
 * Convert the XML text of a Sénat bill into a standalone HTML page and write
 * it to `outputFilePath`.
 *
 * The page is made of a header built from `extractMetadata()` followed by the
 * content of the XML `<body>` element, translated node by node.
 *
 * @param {string} texteXml - XML source, parsed by JSDOM as "text/xml".
 * @param {string} outputFilePath - Path of the HTML file to write.
 * @returns {Promise<void>} Resolves once the file has been written.
 * @throws Rethrows the JSDOM error when `texteXml` cannot be parsed; any file
 *   already present at `outputFilePath` is removed first.
 */
export async function convertSenatXmlToHtml(texteXml, outputFilePath) {
  let xmlDoc;
  try {
    xmlDoc = new JSDOM(texteXml, { contentType: "text/xml" }).window.document;
  } catch (err) {
    // Remove any previous output before rethrowing (presumably so that a stale
    // file is not mistaken for the result of this conversion). fs-extra's
    // remove() silently does nothing when the path does not exist.
    await fs.remove(outputFilePath);
    throw err;
  }
  const metadata = extractMetadata(xmlDoc);
  const xmlBody = xmlDoc.querySelector("body");
  const style = `
body {
font-family: "URW Bookman", "Bookman Old Style", serif;
max-width: 800px;
margin: 40px auto;
line-height: 1.5;
color: #333;
}
.header {
text-align: center;
margin-bottom: 40px;
border-bottom: 2px solid #333;
padding-bottom: 20px;
}
.header-top {
font-weight: bold;
font-size: 1.2em;
margin-bottom: 10px;
}
.header-session {
text-transform: uppercase;
font-size: 0.9em;
margin-bottom: 5px;
}
.header-date {
font-size: 0.9em;
margin-bottom: 5px;
}
.header-number {
font-weight: bold;
font-size: 1.1em;
margin-bottom: 20px;
}
.header-type {
font-weight: bold;
font-size: 1.5em;
margin-top: 20px;
}
.header-authors {
margin-top: 20px;
font-style: italic;
}
.header-commission {
margin-top: 15px;
font-size: 0.9em;
}
h1 {
text-align: center;
font-size: 1.8em;
margin-top: 10px;
}
p {
margin: 0.6em 0;
}
p.has-alinea {
position: relative;
padding-left: 2.5em;
}
.alinea {
position: absolute;
left: 0;
top: 0.15em;
display: inline-flex;
align-items: center;
justify-content: center;
min-width: 1.5em;
height: 1.5em;
padding: 0 0.3em;
margin-right: 0.3em;
font-size: 0.75em;
font-weight: bold;
color: #555;
background-color: #f0f0f0;
border: 1px solid #ccc;
border-radius: 1em;
}
.num {
font-weight: bold;
margin-right: 0.2em;
}
.article {
margin-top: 2em;
}
.article h3 {
border-bottom: 1px solid #eee;
padding-bottom: 5px;
}
`;
  // Metadata comes from the XML document, so it is escaped before being
  // spliced into the HTML template: a "<" or "&" in a title must be rendered
  // as text, not parsed as markup.
  const pageTitle = escapeHtml(metadata.title || "Document Sénat");
  const sessionText = escapeHtml(metadata.session || "....");
  const numberText = escapeHtml(metadata.number || "....");
  const typeText = escapeHtml(metadata.type || "");
  const authorsText = escapeHtml(metadata.authors || "");
  const headingText = escapeHtml(metadata.title || "");
  const dateLine = metadata.date
    ? `<div class="header-date">Enregistré à la Présidence du Sénat le ${escapeHtml(metadata.date)}</div>`
    : "";
  const commissionLine = metadata.commission
    ? `<div class="header-commission">Envoyée à la ${escapeHtml(metadata.commission.toLowerCase())}, sous réserve de la constitution éventuelle d'une commission spéciale dans les conditions prévues par le Règlement.</div>`
    : "";
  const htmlDocTemplate = `<!DOCTYPE html>
<html lang="fr">
<head>
<meta charset="utf-8">
<title>${pageTitle}</title>
<style>${style}</style>
</head>
<body>
<div class="header">
<div class="header-top">SÉNAT</div>
<div class="header-session">SESSION ORDINAIRE DE ${sessionText}</div>
${dateLine}
<div class="header-number">N° ${numberText}</div>
<div class="header-type">${typeText}</div>
<div class="header-authors">${authorsText}</div>
${commissionLine}
</div>
<h1>${headingText}</h1>
</body>
</html>`;
  const { document: htmlDoc } = new JSDOM(htmlDocTemplate).window;
  const body = htmlDoc.body;
  if (xmlBody) {
    // Collapse every run of whitespace to a single space. In JavaScript `\s`
    // already covers the no-break space (U+00A0).
    const normalizeSpaces = (s) => s.replace(/\s+/g, " ").trim();

    /**
     * Translate the children of `xmlNode` into HTML appended to `htmlParent`.
     *
     * `alineaData` carries the state of the enclosing <alinea> element (id,
     * guid, pastille, pending number text) down to the <p> elements inside it;
     * it is null outside of any <alinea>.
     */
    const processNode = (xmlNode, htmlParent, alineaData = null) => {
      const alineaChildren = [];
      const otherChildren = [];
      for (const child of Array.from(xmlNode.childNodes)) {
        if (child.nodeType === 1 && child.tagName.toLowerCase() === "alinea") {
          alineaChildren.push(child);
        } else {
          otherChildren.push(child);
        }
      }
      // Non-alinea children are emitted first, then the alinea children.
      // NOTE(review): this reorders content when a non-alinea sibling follows
      // an <alinea> in the XML — confirm this is intended.
      for (const child of otherChildren) {
        if (child.nodeType === 3) {
          // Text node: copied verbatim.
          htmlParent.appendChild(htmlDoc.createTextNode(child.textContent || ""));
        } else if (child.nodeType === 1) {
          const element = child;
          const tagName = element.tagName.toLowerCase();
          let htmlElement = null;
          switch (tagName) {
            case "article": {
              htmlElement = htmlDoc.createElement("div");
              htmlElement.className = "article";
              const artId = element.getAttribute("eId");
              if (artId) {
                htmlElement.id = artId;
              }
              const artGuid = element.getAttribute("GUID");
              if (artGuid) {
                htmlElement.setAttribute("data-guid", artGuid);
              }
              break;
            }
            case "num": {
              const parentTagName = element.parentElement?.tagName.toLowerCase();
              if (parentTagName === "alinea" && alineaData) {
                // The number of an alinea is not emitted here: it is kept and
                // injected into the first <p> of the alinea if needed.
                alineaData.numText = element.textContent?.trim();
                continue;
              }
              htmlElement = htmlDoc.createElement("span");
              htmlElement.className = "num";
              break;
            }
            case "heading":
              htmlElement = htmlDoc.createElement("h4");
              break;
            case "p": {
              htmlElement = htmlDoc.createElement("p");
              if (alineaData) {
                htmlElement.classList.add("has-alinea");
                if (alineaData.id) {
                  htmlElement.id = alineaData.id;
                }
                if (alineaData.guid) {
                  htmlElement.setAttribute("data-guid", alineaData.guid);
                }
                const pastille = alineaData.pastille;
                if (pastille) {
                  htmlElement.setAttribute("data-pastille", pastille);
                  // The visible pastille badge is only added to the first <p>
                  // of the alinea.
                  if (!alineaData.pastilleApplied) {
                    const span = htmlDoc.createElement("span");
                    span.className = "alinea";
                    span.setAttribute("data-alinea", pastille);
                    span.textContent = pastille;
                    htmlElement.appendChild(span);
                    alineaData.pastilleApplied = true;
                  }
                }
                if (alineaData.numText) {
                  // Prepend the pending alinea number unless the paragraph
                  // text already starts with it (compared modulo whitespace).
                  const normalizedNum = normalizeSpaces(alineaData.numText);
                  const normalizedP = normalizeSpaces(element.textContent || "");
                  if (normalizedNum && !normalizedP.startsWith(normalizedNum)) {
                    const numSpan = htmlDoc.createElement("span");
                    numSpan.className = "num";
                    numSpan.textContent = alineaData.numText + " ";
                    htmlElement.appendChild(numSpan);
                  }
                  alineaData.numText = null;
                }
              }
              break;
            }
            case "content":
              // <content> is a transparent wrapper: its children go straight
              // into the current HTML parent.
              processNode(element, htmlParent, alineaData);
              continue;
            case "doctitle":
              // Skipped here; the title is rendered in the page header.
              continue;
            case "i":
            case "b":
            case "u":
            case "sup":
            case "sub":
              htmlElement = htmlDoc.createElement(tagName);
              break;
            default:
              // Unknown elements are kept as <span> tagged with their XML name.
              htmlElement = htmlDoc.createElement("span");
              htmlElement.setAttribute("data-xml-tag", tagName);
              break;
          }
          if (htmlElement) {
            htmlParent.appendChild(htmlElement);
            processNode(element, htmlElement, alineaData);
          }
        }
      }
      for (const element of alineaChildren) {
        // An <alinea> produces no HTML element of its own: its attributes are
        // carried to the <p> elements it contains.
        const nextAlineaData = {
          id: element.getAttribute("eId"),
          guid: element.getAttribute("GUID"),
          pastille: element.getAttribute("data:pastille"),
          pastilleApplied: false,
          numText: null,
        };
        processNode(element, htmlParent, nextAlineaData);
      }
    };
    processNode(xmlBody, body);
  }
  const htmlContent = `<!DOCTYPE html>\n${htmlDoc.documentElement.outerHTML}`;
  // fs-extra's outputFile() creates the parent directory when it is missing.
  await fs.outputFile(outputFilePath, htmlContent);
}