UNPKG

@tricoteuses/senat

Version:

Handle French Sénat's open data

317 lines (316 loc) 11.9 kB
import { JSDOM } from "jsdom";
import fs from "fs-extra";
import path from "path";
import { DateTime } from "luxon";

/** DOM `nodeType` value for element nodes. */
const ELEMENT_NODE = 1;
/** DOM `nodeType` value for text nodes. */
const TEXT_NODE = 3;

/** Header labels keyed by the `name` attribute of the `<bill>` element. */
const BILL_TYPE_LABELS = new Map([
  ["ppl", "PROPOSITION DE LOI"],
  ["pjl", "PROJET DE LOI"],
]);

/** Inline XML tags that are emitted as the HTML element of the same name. */
const INLINE_TAGS = new Set(["i", "b", "u", "sup", "sub"]);

/** Characters that must be escaped before being interpolated into HTML markup. */
const HTML_ESCAPES = {
  "&": "&amp;",
  "<": "&lt;",
  ">": "&gt;",
  '"': "&quot;",
  "'": "&#39;",
};

// NOTE(review): the last rule targets `.article h3`, but <heading> elements are
// rendered as <h4> below, so that rule currently matches nothing — confirm
// which of the two was intended.
const PAGE_STYLE = `
  body { font-family: "URW Bookman", "Bookman Old Style", serif; max-width: 800px; margin: 40px auto; line-height: 1.5; color: #333; }
  .header { text-align: center; margin-bottom: 40px; border-bottom: 2px solid #333; padding-bottom: 20px; }
  .header-top { font-weight: bold; font-size: 1.2em; margin-bottom: 10px; }
  .header-session { text-transform: uppercase; font-size: 0.9em; margin-bottom: 5px; }
  .header-date { font-size: 0.9em; margin-bottom: 5px; }
  .header-number { font-weight: bold; font-size: 1.1em; margin-bottom: 20px; }
  .header-type { font-weight: bold; font-size: 1.5em; margin-top: 20px; }
  .header-authors { margin-top: 20px; font-style: italic; }
  .header-commission { margin-top: 15px; font-size: 0.9em; }
  h1 { text-align: center; font-size: 1.8em; margin-top: 10px; }
  p { margin: 0.6em 0; }
  p.has-alinea { position: relative; padding-left: 2.5em; }
  .alinea { position: absolute; left: 0; top: 0.15em; display: inline-flex; align-items: center; justify-content: center; min-width: 1.5em; height: 1.5em; padding: 0 0.3em; margin-right: 0.3em; font-size: 0.75em; font-weight: bold; color: #555; background-color: #f0f0f0; border: 1px solid #ccc; border-radius: 1em; }
  .num { font-weight: bold; margin-right: 0.2em; }
  .article { margin-top: 2em; }
  .article h3 { border-bottom: 1px solid #eee; padding-bottom: 5px; }
`;

/**
 * Escape a value so it can be interpolated into HTML text or attribute content.
 *
 * @param {unknown} value - Value to escape; it is converted with `String()`.
 * @returns {string} The escaped string.
 */
function escapeHtml(value) {
  return String(value).replace(/[&<>"']/g, (char) => HTML_ESCAPES[char]);
}

/**
 * Collapse every run of whitespace into a single space and trim the result.
 * In JavaScript `\s` already covers the no-break space (U+00A0), so a number
 * written with a no-break space compares equal to one written with a plain space.
 *
 * @param {string} text - Text to normalise.
 * @returns {string} The normalised text.
 */
function normalizeWhitespace(text) {
  return text.replace(/\s+/g, " ").trim();
}

/**
 * Format an ISO date as a French long date ("d MMMM yyyy").
 *
 * @param {string | null | undefined} isoDate - ISO 8601 date string.
 * @returns {string | null} The formatted date, or `null` when the input is
 *   missing or cannot be parsed (instead of the literal "Invalid DateTime").
 */
function formatFrenchDate(isoDate) {
  if (!isoDate) return null;
  const parsed = DateTime.fromISO(isoDate);
  return parsed.isValid ? parsed.setLocale("fr").toFormat("d MMMM yyyy") : null;
}

/**
 * Read the header metadata of a Sénat bill from its parsed XML document.
 *
 * @param {Document} xmlDoc - Parsed XML document of the bill.
 * @returns {{number: string|null, session: string|null, date: string|null,
 *   type: string|null, authors: string|null, title: string|null,
 *   commission: string|null}} Every field is `null` when the corresponding
 *   information is not found in the document.
 */
export function extractMetadata(xmlDoc) {
  const attributeOf = (selector, attribute) =>
    xmlDoc.querySelector(selector)?.getAttribute(attribute) ?? null;

  const metadata = {
    number: null,
    session: null,
    date: null,
    type: null,
    authors: null,
    title: xmlDoc.querySelector("docTitle")?.textContent?.trim() || null,
    commission: null,
  };

  // Number: trailing digits of the legislative-file bookmark alias.
  const dossierAlias = attributeOf('FRBRalias[name="signet-dossier-legislatif-senat"]', "value");
  metadata.number = dossierAlias?.match(/\d+$/)?.[0] ?? null;

  // Session: first "yyyy-yyyy" pattern found in the expression URI.
  const expressionUri = attributeOf("FRBRExpression > FRBRuri", "value");
  metadata.session = expressionUri?.match(/\d{4}-\d{4}/)?.[0] ?? null;

  // Date: the "#depot" date wins; the "#presentation" date is the fallback,
  // including when the "#depot" value is present but not a valid ISO date.
  metadata.date =
    formatFrenchDate(attributeOf('FRBRdate[name="#depot"]', "date")) ??
    formatFrenchDate(attributeOf('FRBRdate[name="#presentation"]', "date"));

  // Type: only the codes listed in BILL_TYPE_LABELS produce a label.
  metadata.type = BILL_TYPE_LABELS.get(attributeOf("bill", "name")) ?? null;

  // Authors: resolve the author reference to the TLCPerson it points at.
  const authorRef = attributeOf('FRBRWork > FRBRauthor[as="#auteur"]', "href");
  if (authorRef) {
    const authorId = authorRef.replace(/^#/, "");
    // Compare attribute values instead of building a selector from document
    // data: an id containing a quote or bracket would make querySelector throw.
    const authorPerson = Array.from(xmlDoc.querySelectorAll("TLCPerson")).find(
      (person) => person.getAttribute("eId") === authorId,
    );
    const showAs = authorPerson?.getAttribute("showAs");
    if (showAs) {
      metadata.authors = showAs.replace(/, Sénateurs$/, ", Sénateurs et Sénatrices");
    }
  }

  // Commission: the Sénat commission if declared, otherwise the first
  // "commission-*" organization whose id does not contain "assemblee".
  const commissionNode =
    xmlDoc.querySelector('TLCOrganization[eId="commission-senat"]') ||
    xmlDoc.querySelector('TLCOrganization[eId^="commission-"]:not([eId*="assemblee"])');
  if (commissionNode) {
    metadata.commission = commissionNode.getAttribute("showAs");
  }

  return metadata;
}

/**
 * Build the HTML page skeleton (head, styles and header block) for a bill.
 * Every metadata value is HTML-escaped because it comes from the XML document.
 *
 * @param {ReturnType<typeof extractMetadata>} metadata - Extracted metadata.
 * @returns {string} HTML source of the page, without the bill body.
 */
function buildHtmlTemplate(metadata) {
  const title = metadata.title ? escapeHtml(metadata.title) : "";
  const dateBlock = metadata.date
    ? `<div class="header-date">Enregistré à la Présidence du Sénat le ${escapeHtml(metadata.date)}</div>`
    : "";
  const commissionBlock = metadata.commission
    ? `<div class="header-commission">Envoyée à la ${escapeHtml(metadata.commission.toLowerCase())}, sous réserve de la constitution éventuelle d'une commission spéciale dans les conditions prévues par le Règlement.</div>`
    : "";

  return `<!DOCTYPE html>
<html lang="fr">
<head>
<meta charset="utf-8">
<title>${title || "Document Sénat"}</title>
<style>${PAGE_STYLE}</style>
</head>
<body>
<div class="header">
<div class="header-top">SÉNAT</div>
<div class="header-session">SESSION ORDINAIRE DE ${escapeHtml(metadata.session || "....")}</div>
${dateBlock}
<div class="header-number">N° ${escapeHtml(metadata.number || "....")}</div>
<div class="header-type">${escapeHtml(metadata.type || "")}</div>
<div class="header-authors">${escapeHtml(metadata.authors || "")}</div>
${commissionBlock}
</div>
<h1>${title}</h1>
</body>
</html>`;
}

/**
 * Create the `<p>` element for an XML `<p>`, decorated with the data of the
 * enclosing alinea when there is one.
 *
 * @param {Document} htmlDoc - Target HTML document.
 * @param {Element} xmlParagraph - Source XML `<p>` element.
 * @param {object | null} alineaData - Mutable state of the enclosing alinea;
 *   `pastilleApplied` and `numText` are updated here so that the badge and the
 *   number are emitted at most once per alinea.
 * @returns {HTMLElement} The new, still empty or badge-prefixed, paragraph.
 */
function createParagraph(htmlDoc, xmlParagraph, alineaData) {
  const paragraph = htmlDoc.createElement("p");
  if (!alineaData) return paragraph;

  paragraph.classList.add("has-alinea");
  if (alineaData.id) paragraph.id = alineaData.id;
  if (alineaData.guid) paragraph.setAttribute("data-guid", alineaData.guid);

  const { pastille } = alineaData;
  if (pastille) {
    paragraph.setAttribute("data-pastille", pastille);
    if (!alineaData.pastilleApplied) {
      const badge = htmlDoc.createElement("span");
      badge.className = "alinea";
      badge.setAttribute("data-alinea", pastille);
      badge.textContent = pastille;
      paragraph.appendChild(badge);
      alineaData.pastilleApplied = true;
    }
  }

  if (alineaData.numText) {
    const numberText = normalizeWhitespace(alineaData.numText);
    const paragraphText = normalizeWhitespace(xmlParagraph.textContent || "");
    // Only prepend the alinea number when the paragraph text does not already
    // begin with it, to avoid printing it twice.
    if (numberText && !paragraphText.startsWith(numberText)) {
      const numberSpan = htmlDoc.createElement("span");
      numberSpan.className = "num";
      numberSpan.textContent = `${alineaData.numText} `;
      paragraph.appendChild(numberSpan);
    }
    alineaData.numText = null;
  }

  return paragraph;
}

/**
 * Recursively render the children of an XML node into an HTML parent.
 *
 * Text nodes are copied, known tags are mapped to HTML elements, unknown tags
 * become `<span data-xml-tag="…">`. `<alinea>` children are rendered after all
 * their siblings, each with a fresh alinea state.
 * NOTE(review): this means an alinea interleaved with other elements is moved
 * after them in the output — confirm this ordering is intended.
 *
 * @param {Document} htmlDoc - Target HTML document (used to create nodes).
 * @param {Node} xmlNode - XML node whose children are rendered.
 * @param {Node} htmlParent - HTML node receiving the rendered children.
 * @param {object | null} [alineaData=null] - State of the enclosing alinea.
 * @returns {void}
 */
function renderChildren(htmlDoc, xmlNode, htmlParent, alineaData = null) {
  const deferredAlineas = [];

  for (const child of Array.from(xmlNode.childNodes)) {
    if (child.nodeType === TEXT_NODE) {
      htmlParent.appendChild(htmlDoc.createTextNode(child.textContent || ""));
      continue;
    }
    if (child.nodeType !== ELEMENT_NODE) continue;

    const tagName = child.tagName.toLowerCase();
    let htmlElement;

    switch (tagName) {
      case "alinea":
        deferredAlineas.push(child);
        continue;
      case "content":
        // Transparent wrapper: its children go straight into the current parent.
        renderChildren(htmlDoc, child, htmlParent, alineaData);
        continue;
      case "doctitle":
        // Not rendered in the body; the title is read separately via extractMetadata.
        continue;
      case "num":
        if (alineaData && child.parentElement?.tagName.toLowerCase() === "alinea") {
          // Remember the alinea number; the next <p> decides whether to print it.
          alineaData.numText = child.textContent?.trim();
          continue;
        }
        htmlElement = htmlDoc.createElement("span");
        htmlElement.className = "num";
        break;
      case "article": {
        htmlElement = htmlDoc.createElement("div");
        htmlElement.className = "article";
        const articleId = child.getAttribute("eId");
        if (articleId) htmlElement.id = articleId;
        const articleGuid = child.getAttribute("GUID");
        if (articleGuid) htmlElement.setAttribute("data-guid", articleGuid);
        break;
      }
      case "heading":
        htmlElement = htmlDoc.createElement("h4");
        break;
      case "p":
        htmlElement = createParagraph(htmlDoc, child, alineaData);
        break;
      default:
        if (INLINE_TAGS.has(tagName)) {
          htmlElement = htmlDoc.createElement(tagName);
        } else {
          htmlElement = htmlDoc.createElement("span");
          htmlElement.setAttribute("data-xml-tag", tagName);
        }
        break;
    }

    htmlParent.appendChild(htmlElement);
    renderChildren(htmlDoc, child, htmlElement, alineaData);
  }

  for (const alinea of deferredAlineas) {
    renderChildren(htmlDoc, alinea, htmlParent, {
      id: alinea.getAttribute("eId"),
      guid: alinea.getAttribute("GUID"),
      pastille: alinea.getAttribute("data:pastille"),
      pastilleApplied: false,
    });
  }
}

/**
 * Convert the XML source of a Sénat bill into a standalone HTML file.
 *
 * @param {string} texteXml - XML source of the bill.
 * @param {string} outputFilePath - Path of the HTML file to write; parent
 *   directories are created as needed.
 * @returns {Promise<void>} Resolves once the file has been written.
 * @throws Rethrows the parser error when `texteXml` cannot be parsed; in that
 *   case any file already present at `outputFilePath` is removed first.
 */
export async function convertSenatXmlToHtml(texteXml, outputFilePath) {
  let xmlDoc;
  try {
    xmlDoc = new JSDOM(texteXml, { contentType: "text/xml" }).window.document;
  } catch (err) {
    // Do not leave an output from a previous run next to an unparsable source.
    if (await fs.pathExists(outputFilePath)) {
      await fs.remove(outputFilePath);
    }
    throw err;
  }

  const metadata = extractMetadata(xmlDoc);
  const xmlBody = xmlDoc.querySelector("body");

  const { document: htmlDoc } = new JSDOM(buildHtmlTemplate(metadata)).window;
  if (xmlBody) {
    renderChildren(htmlDoc, xmlBody, htmlDoc.body);
  }

  const htmlContent = `<!DOCTYPE html>\n${htmlDoc.documentElement.outerHTML}`;
  await fs.ensureDir(path.dirname(outputFilePath));
  await fs.outputFile(outputFilePath, htmlContent);
}