UNPKG

@tricoteuses/senat

Version:

Handle French Sénat's open data

268 lines (267 loc) 9.99 kB
import fs from "fs";
import * as cheerio from "cheerio";
import { toCRDate } from "./util";
import { makeReunionUid } from "../utils/reunion_parsing";
import { yyyymmddFromPath } from "../utils/date";
import { decodeHtmlEntities, dedupeSpeaker, fixApostrophes, norm } from "../utils/string_cleaning";

// Heading paragraphs that may sit inside a speaker block; they are stripped
// before the speech itself is extracted.
const STRUCTURAL_TITLE_SELECTOR = [
  "p[class^='titre_S']",
  "p.mention_titre",
  "p.intitule_titre",
  "p.mention_chapitre",
  "p.intitule_chapitre",
  "p.mention_article",
  "p.intitule_article",
  "p.mention_section",
  "p.intitule_section",
].join(",");

// Leading item number of a table-of-contents line, e.g. "3. " or "12 - ".
const SOMMAIRE_NUMBER_RE = /^(\d+)\s*[.\-–—]\s*/;

/**
 * Parses one slice of a Sénat report file into a structured "compte rendu" object.
 *
 * The file is loaded with cheerio in HTML mode. Every element under `<body>` is
 * numbered in document order, and only the `div.intervenant` blocks (and summary
 * paragraphs) whose position lies in the half-open range `[startIndex, endIndex)`
 * are kept.
 *
 * @param {string} xmlFilePath - Path of the report file to read.
 * @param {number} startIndex - First document-order position to include.
 * @param {number} endIndex - Document-order position at which to stop (exclusive).
 * @param {*} agendaEventId - Agenda event identifier, embedded in `uid` and passed to `makeReunionUid`.
 * @returns {Promise<object|null>} `{ uid, seanceRef, sessionRef, metadonnees, contenu }`,
 *   or `null` when anything throws (the error is logged to the console).
 */
export async function parseCompteRenduIntervalFromFile(xmlFilePath, startIndex, endIndex, agendaEventId) {
  try {
    const raw = await fs.promises.readFile(xmlFilePath, "utf8");
    const $ = cheerio.load(raw, { xml: false });
    const metadonnees = extractMetadonnees($, xmlFilePath);

    const order = $("body *").toArray();
    const idx = new Map(order.map((el, i) => [el, i]));
    const totalNodes = order.length;
    // Intervals are half-open (see elementInAnyInterval), so the largest valid
    // bound is totalNodes itself; clamping to totalNodes - 1 would make the
    // last node unreachable.
    const clampedStart = Math.max(0, Math.min(startIndex, totalNodes));
    const clampedEnd = Math.max(0, Math.min(endIndex, totalNodes));
    const intervals = [{ start: clampedStart, end: clampedEnd }];

    metadonnees.sommaire = extractSommaireForIntervals($, idx, intervals);

    const points = [];
    let ordre = 0;
    const addPoint = (p) => points.push({ ...p, ordre_absolu_seance: String(++ordre) });

    // Interventions
    $("div.intervenant").each((_, block) => {
      if (!elementInAnyInterval(block, idx, intervals)) return;
      const $block = $(block);
      $block.find(STRUCTURAL_TITLE_SELECTOR).remove();

      const firstP = $block.find("p").first();
      if (firstP.length === 0) return;

      // The visible label must be read before extractAndRemoveLeadingQualite
      // strips the speaker spans from the first paragraph.
      const speakerLabelRaw = firstP.find(".orateur_nom").text() || firstP.find("a.lien_senfic").text() || "";
      const speakerLabel = dedupeSpeaker(speakerLabelRaw);
      const { mat, nom: nomCRI, qua: quaCRI } = readIntervenantMeta($block);
      const qualFromSpans = extractAndRemoveLeadingQualite($, $block);
      const qualite = norm(decodeHtmlEntities(quaCRI || "")) || qualFromSpans;
      const canonicalName = dedupeSpeaker(nomCRI || speakerLabel);
      const role = roleForSpeaker(speakerLabel) || roleForSpeaker(qualite) || roleForSpeaker(quaCRI || "");

      const speechHtml = sanitizeInterventionHtml($, $block);
      const speechText = norm(cheerio.load(speechHtml).text() || "");
      if (!speechText) return;

      addPoint({
        code_grammaire: "PAROLE_GENERIQUE",
        roledebat: role,
        orateurs: { orateur: { nom: canonicalName, id: mat || "", qualite } },
        texte: { _: speechHtml },
      });
    });

    const contenu = {
      quantiemes: {
        journee: metadonnees.dateSeance,
        session: metadonnees.session,
      },
      point: points,
    };

    const yyyymmdd = yyyymmddFromPath(xmlFilePath);
    const dateISO = `${yyyymmdd.slice(0, 4)}-${yyyymmdd.slice(4, 6)}-${yyyymmdd.slice(6, 8)}`;
    const seanceRef = makeReunionUid(dateISO, "SP", agendaEventId, null);

    return {
      uid: `CRSSN${yyyymmdd}E${agendaEventId}`,
      seanceRef,
      sessionRef: metadonnees.session,
      metadonnees,
      contenu,
    };
  } catch (e) {
    console.error(`[CRI] parseInterval error file=${xmlFilePath} interval=[${startIndex}..${endIndex}] event=${agendaEventId}:`, e);
    return null;
  }
}

/**
 * Returns the starting year of the parliamentary session containing `d`.
 *
 * @param {Date} d - Date to classify (local-time month and year are used).
 * @returns {number} `d`'s year from October onwards, otherwise the previous year.
 */
export function sessionStartYearFromDate(d) {
  // Session (1st Oct N → 30 Sept N+1)
  const m = d.getMonth();
  const y = d.getFullYear();
  return m >= 9 ? y : y - 1;
}

/**
 * Derives the debate role from a speaker label or a "qualité" string.
 *
 * @param {string} labelOrQualite - Text to inspect; falsy values are treated as "".
 * @returns {string} "président" when the text designates the chair, otherwise "".
 */
function roleForSpeaker(labelOrQualite) {
  const s = (labelOrQualite || "").toLowerCase();
  const isChair =
    /^(m\.|mme)?\s*(le|la)\s+pr[ée]sident(e)?\b/.test(s) || /\bpr[ée]sident[e]?\s+de\s+séance\b/.test(s);
  return isChair ? "président" : "";
}

/**
 * Reads the `mat` / `nom` / `qua` attributes describing the speaker of a block.
 *
 * Looks first for a `cri:intervenant` element, then for an HTML comment of the
 * form `<!-- cri:intervenant key="value" ... -->` in the block's markup.
 *
 * @param {object} $block - Cheerio selection for one speaker block.
 * @returns {{mat?: string, nom?: string, qua?: string}} Found attributes; `{}` when neither form is present.
 */
function readIntervenantMeta($block) {
  const int = $block.find("cri\\:intervenant").first();
  if (int.length) {
    return {
      mat: int.attr("mat") || undefined,
      nom: int.attr("nom") || undefined,
      qua: int.attr("qua") || undefined,
    };
  }

  const html = $block.html() || "";
  const m = html.match(/<!--\s*cri:intervenant\b([^>]+)-->/i);
  if (!m) return {};

  // Attribute values taken from the raw comment text still carry HTML entities.
  const out = {};
  const re = /(\w+)="([^"]*)"/g;
  let a;
  while ((a = re.exec(m[1]))) out[a[1]] = decodeHtmlEntities(a[2]);
  return { mat: out["mat"], nom: out["nom"], qua: out["qua"] };
}

/**
 * Strips the leading speaker name / "qualité" from the first paragraph of a block
 * and returns the collected "qualité" text.
 *
 * Walks the first `<p>`'s child nodes from the start and stops at the first node
 * that carries real content. Before that point it removes `.orateur_nom` and
 * `.orateur_qualite` elements and empty elements, and blanks text nodes made
 * only of whitespace or punctuation. The block is mutated in place.
 *
 * @param {Function} $ - Cheerio root function of the document.
 * @param {object} $block - Cheerio selection for one speaker block.
 * @returns {string} Normalised "qualité" text, or "" when none was found.
 */
function extractAndRemoveLeadingQualite($, $block) {
  const firstP = $block.find("p").first();
  if (firstP.length === 0) return "";

  const parts = [];
  let stop = false;
  firstP.contents().each((_, node) => {
    if (stop) return;
    if (node.type === "tag") {
      const $node = $(node);
      if ($node.hasClass("orateur_nom")) {
        $node.remove();
        return;
      }
      if ($node.hasClass("orateur_qualite")) {
        parts.push($node.text() || "");
        $node.remove();
        return;
      }
      const t = norm($node.text() || "");
      if (t) stop = true;
      else $node.remove();
    } else if (node.type === "text") {
      const t = norm(node.data || "");
      // Separator-only text between the name and the speech is dropped.
      if (!t || /^[:.,;–—-]+$/.test(t)) {
        node.data = "";
        return;
      }
      stop = true;
    }
  });
  return fixApostrophes(norm(parts.join(" ")));
}

/**
 * Builds the cleaned HTML of a speech from the paragraphs of a speaker block.
 *
 * Each `<p>` is cloned, then: speaker name/"qualité" spans are removed, links are
 * replaced by their text, `.info_entre_parentheses` elements become `<em>`, and
 * empty `<span>`s are dropped. Paragraphs left empty are skipped.
 *
 * @param {Function} $ - Cheerio root function of the document.
 * @param {object} $block - Cheerio selection for one speaker block.
 * @returns {string} `<p>…</p>` fragments joined with `<br/>`; "" when nothing remains.
 */
function sanitizeInterventionHtml($, $block) {
  const ps = $block.find("p").toArray();
  const cleaned = ps
    .map((p) => {
      const $p = $(p).clone();
      $p.find(".orateur_nom, .orateur_qualite").remove();
      $p.find("a").each((_, a) => {
        const $a = $(a);
        // Insert a real text node: passing a string to replaceWith() would have
        // cheerio parse the link text as HTML and re-interpret "<" and "&".
        $a.replaceWith($("<span/>").text($a.text()).contents());
      });
      $p.find(".info_entre_parentheses").each((_, el) => {
        const txt = $(el).text();
        $(el).replaceWith($("<em/>").text(txt));
      });
      $p.find("span").each((_, span) => {
        const $s = $(span);
        if (!$s.text().trim()) $s.remove();
      });
      const inner = ($p.html() || "").trim();
      if (!inner) return null;
      return `<p>${inner}</p>`;
    })
    .filter(Boolean);
  return cleaned.join("<br/>");
}

/**
 * Extracts the table of contents ("sommaire") restricted to the given intervals.
 *
 * @param {Function} $ - Cheerio root function of the document.
 * @param {Map<object, number>} idx - Document-order position of each element.
 * @param {Array<{start: number, end: number}>} intervals - Half-open position ranges to keep.
 * @returns {object} `{ presidentSeance, sommaire1 }`, plus `para` when `p.tm5` paragraphs were found
 *   (a single object for one paragraph, an array otherwise).
 */
function extractSommaireForIntervals($, idx, intervals) {
  const inIv = (el) => elementInAnyInterval(el, idx, intervals);
  const root = $("body");
  const sommaire = { presidentSeance: { _: "" }, sommaire1: [] };

  // (1) Chair (tm2) — first such line inside the interval
  const pres = root
    .find("p.tm2")
    .filter((_, el) => inIv(el))
    .first();
  if (pres.length) sommaire.presidentSeance = { _: norm(pres.text()) };

  // (2) tm5 paragraphs inside the interval
  const paras = [];
  root.find("p.tm5").each((_, el) => {
    if (!inIv(el)) return;
    const t = norm($(el).text());
    if (t) paras.push({ _: t });
  });
  if (paras.length) sommaire.para = paras.length === 1 ? paras[0] : paras;

  // (3) First-level items (tm3) inside the interval
  const items = [];
  root.find("p.tm3").each((_, el) => {
    if (!inIv(el)) return;
    const $p = $(el);
    const full = norm($p.text() || "");
    if (!full) return;

    const numMatch = full.match(SOMMAIRE_NUMBER_RE);
    const valeur = numMatch ? numMatch[1] : undefined;

    // Prefer the title carried by the <a> anchor when there is one
    const a = $p.find("a").first();
    const intituleRaw = a.length ? a.text() : full.replace(SOMMAIRE_NUMBER_RE, "");
    const intitule = norm(intituleRaw);

    // id_syceron comes from href="#Niv1_SOMx"
    const href = (a.attr("href") || "").trim();
    const idSyceron = href.startsWith("#") ? href.slice(1) : href;

    const titreStruct = { id_syceron: idSyceron || "", intitule };
    items.push({ valeur_pts_odj: valeur, titreStruct });
  });
  if (items.length) sommaire.sommaire1 = items;

  return sommaire;
}

/**
 * Builds the metadata record of a report.
 *
 * The sitting date is searched in the first `h1` / `h2` / `.page-title` (or, failing
 * that, the first `<p>`), with a fallback on a `dYYYYMMDD.xml` file name; the result
 * is passed through `toCRDate`. The session is taken from the first
 * "session YYYY-YYYY" occurrence in the body text.
 *
 * @param {Function} $ - Cheerio root function of the document.
 * @param {string} filePath - Path of the source file, used for the date fallback.
 * @returns {object} Metadata record; most fields are empty strings and
 *   `heureGeneration` is the current `Date`.
 */
function extractMetadonnees($, filePath) {
  let dateText = norm($("h1, h2, .page-title").first().text() || "");
  if (!dateText) dateText = norm($("p").first().text() || "");

  // NOTE(review): `\w` is ASCII-only in JavaScript, so accented month names
  // (e.g. "février") do not match here and the file-name fallback is used.
  // Left unchanged because the inputs accepted by toCRDate are not visible here.
  const dateMatch = dateText.match(/\b(\d{1,2}\s+\w+\s+\d{4})\b/i);
  const allText = norm($("body").text() || "");
  const sessionMatch = allText.match(/\bsession\s+(\d{4}-\d{4})\b/i);

  let dateSeance = dateMatch?.[1] || "";
  if (!dateSeance) {
    const m = filePath.match(/d(\d{4})(\d{2})(\d{2})\.xml$/i);
    if (m) dateSeance = `${m[1]}-${m[2]}-${m[3]}`;
  }
  dateSeance = toCRDate(dateSeance, null);

  return {
    dateSeance,
    dateSeanceJour: dateSeance,
    numSeanceJour: "",
    numSeance: "",
    typeAssemblee: "SN",
    legislature: "",
    session: sessionMatch?.[1] || "",
    nomFichierJo: "",
    validite: "",
    etat: "",
    diffusion: "",
    version: "1.0",
    environnement: "",
    heureGeneration: new Date(),
  };
}

/**
 * Tells whether an element's document-order position falls in one of the intervals.
 *
 * @param {object} el - DOM node to test.
 * @param {Map<object, number>} idx - Document-order position of each element.
 * @param {Array<{start: number, end: number}>} intervals - Ranges, `start` inclusive and `end` exclusive.
 * @returns {boolean} `false` when the element is unknown to `idx` or outside every interval.
 */
function elementInAnyInterval(el, idx, intervals) {
  const p = idx.get(el);
  if (p == null) return false;
  return intervals.some((iv) => p >= iv.start && p < iv.end);
}