@tricoteuses/senat
Version:
Handle French Sénat's open data
268 lines (267 loc) • 9.99 kB
JavaScript
import fs from "fs";
import * as cheerio from "cheerio";
import { toCRDate } from "./util";
import { makeReunionUid } from "../utils/reunion_parsing";
import { yyyymmddFromPath } from "../utils/date";
import { decodeHtmlEntities, dedupeSpeaker, fixApostrophes, norm } from "../utils/string_cleaning";
/**
 * Build the structured "compte rendu" of one agenda event from a slice of a
 * Sénat report file.
 *
 * Every element under <body> is numbered in document order. Summary lines and
 * `div.intervenant` blocks are kept only when their position p satisfies
 * startIndex <= p < endIndex (the end bound is exclusive).
 *
 * @param {string} xmlFilePath - File to read (as UTF-8) and parse with cheerio in HTML mode.
 * @param {number} startIndex - Position of the first element to keep.
 * @param {number} endIndex - Position at which to stop (exclusive); a value at or past
 *   the number of elements means "up to and including the last element".
 * @param {string|number} agendaEventId - Agenda event identifier, embedded in `uid`
 *   and passed to `makeReunionUid`.
 * @returns {Promise<object|null>} `{ uid, seanceRef, sessionRef, metadonnees, contenu }`,
 *   or `null` when anything throws (the error is logged with `console.error`).
 */
export async function parseCompteRenduIntervalFromFile(xmlFilePath, startIndex, endIndex, agendaEventId) {
    try {
        const raw = fs.readFileSync(xmlFilePath, "utf8");
        const $ = cheerio.load(raw, { xml: false });
        const metadonnees = extractMetadonnees($, xmlFilePath);
        // Position of every element in document order; computed before any DOM
        // mutation below so that interval membership stays stable.
        const order = $("body *").toArray();
        const idx = new Map(order.map((el, i) => [el, i]));
        const totalNodes = order.length;
        // Membership is tested as start <= position < end, so the bounds must be
        // allowed to reach totalNodes: clamping to totalNodes - 1 would make the
        // last element of the document impossible to select.
        const clampToDocument = (position) => Math.max(0, Math.min(position, totalNodes));
        const intervals = [
            {
                start: clampToDocument(startIndex),
                end: clampToDocument(endIndex),
            },
        ];
        metadonnees.sommaire = extractSommaireForIntervals($, idx, intervals);
        const points = [];
        let ordre = 0;
        // Each point receives a 1-based running number, stored as a string.
        const addPoint = (p) => points.push({ ...p, ordre_absolu_seance: String(++ordre) });
        // Heading paragraphs that may sit inside a speaker block but are not speech.
        const headingSelector = [
            "p[class^='titre_S']",
            "p.mention_titre",
            "p.intitule_titre",
            "p.mention_chapitre",
            "p.intitule_chapitre",
            "p.mention_article",
            "p.intitule_article",
            "p.mention_section",
            "p.intitule_section",
        ].join(",");
        // Interventions
        $("div.intervenant").each((_, block) => {
            if (!elementInAnyInterval(block, idx, intervals)) {
                return;
            }
            const $block = $(block);
            $block.find(headingSelector).remove();
            const firstP = $block.find("p").first();
            if (firstP.length === 0) {
                return;
            }
            const speakerLabelRaw = firstP.find(".orateur_nom").text() || firstP.find("a.lien_senfic").text() || "";
            const speakerLabel = dedupeSpeaker(speakerLabelRaw);
            const { mat, nom: nomCRI, qua: quaCRI } = readIntervenantMeta($block);
            // This call strips the leading name/quality spans from the block, so it
            // has to run after the speaker label was read and before sanitizing.
            const qualFromSpans = extractAndRemoveLeadingQualite($, $block);
            const qualite = norm(decodeHtmlEntities(quaCRI || "")) || qualFromSpans;
            const canonicalName = dedupeSpeaker(nomCRI || speakerLabel);
            const role = roleForSpeaker(speakerLabel) || roleForSpeaker(qualite) || roleForSpeaker(quaCRI || "");
            const speechHtml = sanitizeInterventionHtml($, $block);
            const speechText = norm(cheerio.load(speechHtml).text() || "");
            // Blocks with no visible text once cleaned are skipped.
            if (!speechText) {
                return;
            }
            addPoint({
                code_grammaire: "PAROLE_GENERIQUE",
                roledebat: role,
                orateurs: { orateur: { nom: canonicalName, id: mat || "", qualite } },
                texte: { _: speechHtml },
            });
        });
        const contenu = {
            quantiemes: {
                journee: metadonnees.dateSeance,
                session: metadonnees.session,
            },
            point: points,
        };
        const yyyymmdd = yyyymmddFromPath(xmlFilePath);
        const dateISO = `${yyyymmdd.slice(0, 4)}-${yyyymmdd.slice(4, 6)}-${yyyymmdd.slice(6, 8)}`;
        const seanceRef = makeReunionUid(dateISO, "SP", agendaEventId, null);
        return {
            uid: `CRSSN${yyyymmdd}E${agendaEventId}`,
            seanceRef,
            sessionRef: metadonnees.session,
            metadonnees,
            contenu,
        };
    }
    catch (e) {
        // Best-effort parsing: report the failure and let the caller carry on.
        console.error(`[CRI] parseInterval error file=${xmlFilePath} interval=[${startIndex}..${endIndex}] event=${agendaEventId}:`, e);
        return null;
    }
}
/**
 * Return the calendar year in which the parliamentary session containing `d` began.
 *
 * A session runs from 1 October of year N to 30 September of year N+1, so
 * dates from October onwards belong to the session that started the same year.
 *
 * @param {Date} d - Date to classify (local time is used).
 * @returns {number} Starting year of the session.
 */
export function sessionStartYearFromDate(d) {
    const OCTOBER = 9; // Date#getMonth() is zero-based
    const month = d.getMonth();
    const year = d.getFullYear();
    if (month >= OCTOBER) {
        return year;
    }
    return year - 1;
}
/**
 * Detect whether a speaker label or quality designates the chair of the sitting.
 *
 * @param {string|null|undefined} labelOrQualite - Speaker label or quality text.
 * @returns {string} "président" when the lower-cased text matches one of the
 *   chair patterns, otherwise the empty string.
 */
function roleForSpeaker(labelOrQualite) {
    const lowered = (labelOrQualite || "").toLowerCase();
    const chairPatterns = [
        // Text starting with an optional civility followed by "le/la président(e)".
        /^(m\.|mme)?\s*(le|la)\s+pr[ée]sident(e)?\b/,
        // "président(e) de séance" anywhere in the text.
        /\bpr[ée]sident[e]?\s+de\s+séance\b/,
    ];
    return chairPatterns.some((pattern) => pattern.test(lowered)) ? "président" : "";
}
/**
 * Read the speaker metadata (`mat`, `nom`, `qua`) attached to a speaker block.
 *
 * Two sources are tried in order: a `cri:intervenant` element inside the
 * block, then an HTML comment of the form `<!-- cri:intervenant key="value" … -->`
 * found in the block's serialized markup.
 *
 * @param {object} $block - Cheerio selection for the speaker block.
 * @returns {{mat?: string, nom?: string, qua?: string}} Metadata found; an
 *   empty object when neither source is present.
 */
function readIntervenantMeta($block) {
    const tag = $block.find("cri\\:intervenant").first();
    if (tag.length) {
        // Empty or missing attributes are reported as undefined.
        const attrOrUndefined = (name) => tag.attr(name) || undefined;
        return {
            mat: attrOrUndefined("mat"),
            nom: attrOrUndefined("nom"),
            qua: attrOrUndefined("qua"),
        };
    }
    const markup = $block.html() || "";
    const comment = markup.match(/<!--\s*cri:intervenant\b([^>]+)-->/i);
    if (!comment) {
        return {};
    }
    // Collect every key="value" pair of the comment, decoding entities in the values.
    const attrs = {};
    for (const [, key, value] of comment[1].matchAll(/(\w+)="([^"]*)"/g)) {
        attrs[key] = decodeHtmlEntities(value);
    }
    const { mat, nom, qua } = attrs;
    return { mat, nom, qua };
}
/**
 * Strip the leading speaker markup from the first paragraph of a block and
 * return the speaker's quality text.
 *
 * The paragraph's child nodes are walked from the start:
 *  - `.orateur_nom` elements are removed;
 *  - `.orateur_qualite` elements are removed and their text collected;
 *  - other elements with no text are removed;
 *  - text nodes that are blank or made only of punctuation are emptied;
 *  - the walk ends at the first element or text node carrying real content.
 *
 * The DOM under `$block` is modified in place.
 *
 * @param {Function} $ - Cheerio instance used to wrap nodes.
 * @param {object} $block - Cheerio selection for the speaker block.
 * @returns {string} The collected quality text, normalized; "" when there is
 *   no paragraph or no quality span.
 */
function extractAndRemoveLeadingQualite($, $block) {
    const leadParagraph = $block.find("p").first();
    if (leadParagraph.length === 0) {
        return "";
    }
    const PUNCTUATION_ONLY = /^[:.,;–—-]+$/;
    const qualiteChunks = [];
    // Returning false from the callback ends the iteration early.
    leadParagraph.contents().each((_, child) => {
        if (child.type === "tag") {
            const $child = $(child);
            if ($child.hasClass("orateur_nom")) {
                $child.remove();
                return undefined;
            }
            if ($child.hasClass("orateur_qualite")) {
                qualiteChunks.push($child.text() || "");
                $child.remove();
                return undefined;
            }
            // First element with visible text: the speech itself begins here.
            if (norm($child.text() || "")) {
                return false;
            }
            $child.remove();
            return undefined;
        }
        if (child.type === "text") {
            const content = norm(child.data || "");
            const isSeparator = !content || PUNCTUATION_ONLY.test(content);
            if (!isSeparator) {
                return false;
            }
            // Blank or separator-only text between the leading spans is discarded.
            child.data = "";
        }
        return undefined;
    });
    return fixApostrophes(norm(qualiteChunks.join(" ")));
}
/**
 * Produce a simplified HTML rendering of the paragraphs of a speaker block.
 *
 * Each `<p>` is processed on a clone, so the block itself is left untouched:
 * speaker name/quality spans are dropped, links are replaced by their text,
 * `.info_entre_parentheses` elements become `<em>` elements holding their
 * text, and spans without visible text are removed. Paragraphs that end up
 * empty are skipped.
 *
 * @param {Function} $ - Cheerio instance used to wrap nodes and create elements.
 * @param {object} $block - Cheerio selection for the speaker block.
 * @returns {string} The cleaned paragraphs, each wrapped in `<p>…</p>` and
 *   joined with `<br/>`; "" when nothing remains.
 */
function sanitizeInterventionHtml($, $block) {
    const paragraphs = [];
    for (const original of $block.find("p").toArray()) {
        const $copy = $(original).clone();
        $copy.find(".orateur_nom, .orateur_qualite").remove();
        // Links: keep only their text.
        $copy.find("a").each((_, anchor) => {
            const $anchor = $(anchor);
            $anchor.replaceWith($anchor.text());
        });
        // Parenthetical asides are rendered as emphasized text.
        $copy.find(".info_entre_parentheses").each((_, aside) => {
            const asideText = $(aside).text();
            $(aside).replaceWith($("<em/>").text(asideText));
        });
        // Drop spans left without any visible text.
        $copy.find("span").each((_, span) => {
            const $span = $(span);
            if ($span.text().trim() === "") {
                $span.remove();
            }
        });
        const inner = ($copy.html() || "").trim();
        if (inner) {
            paragraphs.push(`<p>${inner}</p>`);
        }
    }
    return paragraphs.join("<br/>");
}
/**
 * Build the summary ("sommaire") restricted to the elements that fall inside
 * the given intervals.
 *
 * Three kinds of paragraphs are read from <body>:
 *  - `p.tm2`: the first one in range gives `presidentSeance`;
 *  - `p.tm5`: every non-empty one in range goes to `para` (a single object
 *    when there is exactly one, an array otherwise; absent when there is none);
 *  - `p.tm3`: every non-empty one in range becomes a first-level item of `sommaire1`.
 *
 * @param {Function} $ - Cheerio instance for the document.
 * @param {Map<object, number>} idx - Element -> document-order position.
 * @param {Array<{start: number, end: number}>} intervals - Position ranges to keep.
 * @returns {object} `{ presidentSeance, sommaire1, para? }`.
 */
function extractSommaireForIntervals($, idx, intervals) {
    const isInRange = (el) => elementInAnyInterval(el, idx, intervals);
    const $body = $("body");
    const result = { presidentSeance: { _: "" }, sommaire1: [] };

    // (1) Chair of the sitting (tm2): first such line inside the interval.
    const $chairLine = $body
        .find("p.tm2")
        .filter((_, el) => isInRange(el))
        .first();
    if ($chairLine.length) {
        result.presidentSeance = { _: norm($chairLine.text()) };
    }

    // (2) tm5 paragraphs found inside the interval.
    const paragraphs = $body
        .find("p.tm5")
        .toArray()
        .filter((el) => isInRange(el))
        .map((el) => norm($(el).text()))
        .filter(Boolean)
        .map((text) => ({ _: text }));
    if (paragraphs.length) {
        result.para = paragraphs.length === 1 ? paragraphs[0] : paragraphs;
    }

    // (3) First-level items (tm3) found inside the interval.
    // Leading item number followed by a dot or dash, e.g. "3. " or "3 - ".
    const NUMBER_PREFIX = /^(\d+)\s*[.\-–—]\s*/;
    const entries = [];
    for (const el of $body.find("p.tm3").toArray()) {
        if (!isInRange(el)) {
            continue;
        }
        const $line = $(el);
        const lineText = norm($line.text() || "");
        if (!lineText) {
            continue;
        }
        const numbered = lineText.match(NUMBER_PREFIX);
        // Prefer the title carried by the <a> anchor when there is one.
        const $anchor = $line.find("a").first();
        const rawTitle = $anchor.length ? $anchor.text() : lineText.replace(NUMBER_PREFIX, "");
        // id_syceron comes from the anchor target, e.g. href="#Niv1_SOMx".
        const target = ($anchor.attr("href") || "").trim();
        const anchorId = target.startsWith("#") ? target.slice(1) : target;
        entries.push({
            valeur_pts_odj: numbered ? numbered[1] : undefined,
            titreStruct: { id_syceron: anchorId || "", intitule: norm(rawTitle) },
        });
    }
    if (entries.length) {
        result.sommaire1 = entries;
    }
    return result;
}
/**
 * Assemble the metadata record of a report.
 *
 * The sitting date is looked for as "<day> <word> <year>" in the first
 * `h1`/`h2`/`.page-title` element, or in the first `<p>` when that heading is
 * empty. When no such date is found, it is rebuilt as YYYY-MM-DD from a file
 * name ending in `dYYYYMMDD.xml`. Either way the value is passed through
 * `toCRDate`. The session ("YYYY-YYYY") is searched in the whole body text.
 *
 * @param {Function} $ - Cheerio instance for the document.
 * @param {string} filePath - Path of the source file, used as date fallback.
 * @returns {object} Metadata record; fields that are not derived from the
 *   document are left as empty strings, and `heureGeneration` is the current time.
 */
function extractMetadonnees($, filePath) {
    const headingText = norm($("h1, h2, .page-title").first().text() || "");
    const dateSource = headingText || norm($("p").first().text() || "");
    // `\w` only covers ASCII word characters, so a month written with an
    // accented letter does not match here and the file-name fallback is used.
    const writtenDate = dateSource.match(/\b(\d{1,2}\s+\w+\s+\d{4})\b/i)?.[1] || "";
    const bodyText = norm($("body").text() || "");
    const session = bodyText.match(/\bsession\s+(\d{4}-\d{4})\b/i)?.[1] || "";

    let rawDate = writtenDate;
    if (!rawDate) {
        const fromFileName = filePath.match(/d(\d{4})(\d{2})(\d{2})\.xml$/i);
        if (fromFileName) {
            const [, year, month, day] = fromFileName;
            rawDate = `${year}-${month}-${day}`;
        }
    }
    const dateSeance = toCRDate(rawDate, null);

    return {
        dateSeance,
        dateSeanceJour: dateSeance,
        numSeanceJour: "",
        numSeance: "",
        typeAssemblee: "SN",
        legislature: "",
        session,
        nomFichierJo: "",
        validite: "",
        etat: "",
        diffusion: "",
        version: "1.0",
        environnement: "",
        heureGeneration: new Date(),
    };
}
/**
 * Tell whether an element's document-order position lies in one of the intervals.
 *
 * Intervals are half-open: a position p belongs to an interval when
 * start <= p < end.
 *
 * @param {object} el - Element to look up.
 * @param {Map<object, number>} idx - Element -> position map.
 * @param {Array<{start: number, end: number}>} intervals - Ranges to test.
 * @returns {boolean} True when the element is indexed and falls inside at
 *   least one interval; false otherwise (including unknown elements).
 */
function elementInAnyInterval(el, idx, intervals) {
    const position = idx.get(el);
    if (position === undefined || position === null) {
        return false;
    }
    return intervals.some(({ start, end }) => position >= start && position < end);
}