/*
 * @tricoteuses/senat
 * Version:
 * Handle French Sénat's open data
 * 270 lines (269 loc) • 10.3 kB
 * JavaScript
 */
import * as cheerio from "cheerio";
import path from "path";
import { makeReunionUid } from "../utils/reunion_parsing";
import { norm } from "../utils/string_cleaning";
import { frDateToISO, hourShortToStartTime } from "../utils/date";
import { toCRDate } from "./util";
// CSS selector for the nodes collected as the paragraphs of a section (it is
// passed to .is() and .find() in parseCommissionCRSectionFromDom): <p> elements
// with class sh_justify, sh_center or sh_marge, <p> elements carrying an
// `align` attribute, <li> elements and <h3> headings.
const PARA_h3_SEL = "p.sh_justify, p.sh_center, p.sh_marge, p[align], li, h3";
/**
 * Find the <h2> heading that opens the requested day.
 *
 * Every <h2> is inspected: its normalized text must contain a French weekday
 * name followed by a date, and that date (converted with frDateToISO) must
 * equal `targetISO`. The first heading that qualifies wins.
 *
 * @param $ - wrapping/query function for the document.
 * @param targetISO - date to look for, in the format returned by frDateToISO.
 * @returns the wrapped matching <h2>, or the empty selection `$()` when none matches.
 */
function findDayRoot($, targetISO) {
  const weekdayThenDate = /(?:Lundi|Mardi|Mercredi|Jeudi|Vendredi|Samedi|Dimanche)\s+(.+)$/i;
  let $found = $();
  for (const heading of $("h2").toArray()) {
    const label = norm($(heading).text());
    const captured = label.match(weekdayThenDate);
    const headingISO = captured ? frDateToISO(captured[1]) : undefined;
    // Keep only the first hit; later headings are still visited but ignored.
    if ($found.length === 0 && headingISO === targetISO) {
      $found = $(heading);
    }
  }
  return $found;
}
/**
 * Replace no-break space (U+00A0), narrow no-break space (U+202F) and thin
 * space (U+2009) with a plain ASCII space. Other characters are left untouched.
 *
 * @param s - input string.
 * @returns the string with those three space variants turned into " ".
 */
function normalizeSpaces(s) {
  const typographicSpaces = /[\u00A0\u202F\u2009]/g;
  return s.replace(typographicSpaces, " ");
}
/**
 * Drop the separator that introduces a speech: optional leading whitespace,
 * at most one of ". : ;", then one or more dashes (en dash, em dash or
 * hyphen), each possibly followed by whitespace. Nothing is removed unless at
 * least one dash is present at the start.
 *
 * @param s - input string.
 * @returns the string without that leading separator.
 */
function stripIntroPunct(s) {
  const leadingSeparator = /^[\s]*[.:;]?\s*(?:[–—-]\s*)+/u;
  return s.replace(leadingSeparator, "");
}
/**
 * Collect the bold elements (<strong>/<b>) that open a paragraph.
 *
 * Child nodes of `$clone` are scanned from the start:
 *  - whitespace-only text nodes are skipped, any other text node ends the scan;
 *  - a <strong>/<b> element is collected;
 *  - an <a> with direct <strong>/<b> children contributes those children;
 *  - any other element ends the scan;
 *  - nodes that are neither text nor tag are skipped.
 *
 * The result holds raw DOM nodes only (never wrapped selections), so callers
 * can treat every entry the same way, e.g. `$(el).remove()`.
 *
 * @param $ - wrapping function used to query each node.
 * @param $clone - wrapped paragraph whose leading children are inspected.
 * @returns array of raw nodes, in document order.
 */
function collectLeadingHeaderStrongEls($, $clone) {
  const els = [];
  for (const node of $clone.contents().toArray()) {
    if (node.type === "text") {
      // Real text means the header part of the paragraph is over.
      if (norm(node.data || "")) {
        break;
      }
      continue;
    }
    if (node.type !== "tag") {
      continue;
    }
    const $node = $(node);
    if ($node.is("strong, b")) {
      els.push(node);
      continue;
    }
    if ($node.is("a")) {
      const $boldChildren = $node.children("strong, b");
      if ($boldChildren.length) {
        // Push the raw child node, matching what the <strong>/<b> branch pushes.
        $boldChildren.each((_, el) => {
          els.push(el);
        });
        continue;
      }
    }
    break;
  }
  return els;
}
/**
 * Return the text of a paragraph once the speaker header has been removed.
 *
 * Works on a clone of `$p`, so the original paragraph is not modified. The
 * leading bold elements found by collectLeadingHeaderStrongEls are removed,
 * the remaining markup is reduced to text, special spaces are normalized and
 * the introductory punctuation/dash is stripped.
 *
 * @param $ - wrapping function for the document.
 * @param $p - wrapped paragraph element.
 * @returns the cleaned remaining text (possibly empty).
 */
export function getRemainingTextAfterSpeakerHeader($, $p) {
  const $working = $p.clone();
  // 1) drop the leading bold header elements
  collectLeadingHeaderStrongEls($, $working).forEach((headerEl) => {
    $(headerEl).remove();
  });
  // 2) markup -> text, then normalize spaces and strip the intro separator
  const leftoverHtml = $working.html() || "";
  const leftoverText = normalizeSpaces(cheerio.load(leftoverHtml).text());
  const withoutIntro = stripIntroPunct(leftoverText);
  return norm(withoutIntro || "");
}
/**
 * Convert a sequence of paragraph elements into an ordered list of "point" objects.
 *
 * The paragraphs are walked in order and three kinds of points are produced:
 *  - a title point for every <h3>;
 *  - an "Info Italiques" point for purely italic paragraphs, and for paragraphs
 *    without bold text seen while no speaker is active;
 *  - a speech point per speaker turn: consecutive paragraphs of the same speaker
 *    are buffered, joined with "<br/><br/>", and emitted by flush().
 * Every emitted point receives `ordre_absolu_seance`, a 1-based counter stored as a string.
 *
 * @param $ - wrapping function for nodes (presumably the cheerio API of the parsed page - confirm against callers).
 * @param paras - iterable of wrapped elements; closest/prop/text/html/find are called on each.
 * @returns the array of points, in emission order.
 */
function buildPointsFromParagraphs($, paras) {
const points = [];
// Running counter, incremented right before each point is pushed.
let ordreAbsoluSeance = 0;
// Speaker name cleanup: NFKC normalization, whitespace runs collapsed to one space,
// one trailing ":" or "." (plus trailing whitespace) removed.
const normSpeaker = (s) => s
.normalize("NFKC")
.replace(/\s+/g, " ")
.replace(/[:\.]\s*$/, "")
.trim();
// Quality cleanup: NFKC normalization, whitespace collapsed, leading comma and
// trailing whitespace removed, then trailing punctuation/dashes removed.
const normQual = (s) => s
.normalize("NFKC")
.replace(/\s+/g, " ")
.replace(/^\s*,\s*|\s+$/g, "")
.replace(/[\s\u00A0]*[.,;:–—-]+$/u, "")
.trim();
// Buffer for the speaker turn currently being accumulated.
let currentOrateur = null;
let currentQualite = "";
let currentTexte = "";
// True when the quality contains the word "président" or "présidente" (case-insensitive).
function isPresidentQual(qual) {
return /\bprésident(e)?\b/i.test(qual);
}
// Emit the buffered speaker turn as a point, then reset the buffer.
// Does nothing - and leaves the buffer untouched - when there is no current
// speaker or when the buffered text is blank.
function flush() {
if (!currentOrateur || !currentTexte.trim())
return;
ordreAbsoluSeance++;
points.push({
code_grammaire: "PAROLE_GENERIQUE",
roledebat: isPresidentQual(currentQualite) ? "président" : "",
ordre_absolu_seance: String(ordreAbsoluSeance),
orateurs: { orateur: { nom: currentOrateur, id: "", qualite: currentQualite || "" } },
texte: { _: currentTexte.trim() },
});
currentOrateur = null;
currentQualite = "";
currentTexte = "";
}
// Push a non-speech point (title or italic info) with the next order number.
function addPoint(payload) {
ordreAbsoluSeance++;
points.push({ ...payload, ordre_absolu_seance: String(ordreAbsoluSeance) });
}
for (const $p of paras) {
// Paragraphs located inside a table are ignored.
if ($p.closest("table").length)
continue;
const tagName = ($p.prop("tagName") || "").toString().toLowerCase();
const rawText = ($p.text() || "").replace(/\u00a0/g, " ").trim();
const text = norm(rawText);
// Ignore empty paragraphs and those of 3 characters or fewer.
if (!text || text.length <= 3)
continue;
const html = ($p.html() || "").trim();
// A paragraph is "pure italic" when it holds at least one italic element, every
// span/i/em it contains is one of those italic elements, and nothing precedes
// the first italic element in the paragraph's inner HTML.
const italicSpans = $p.find("i, em, span[style*='italic']");
const firstItalicOuter = italicSpans.length ? $(italicSpans[0]).prop("outerHTML") || "" : "";
const htmlBeforeFirstItalic = firstItalicOuter ? html.split(firstItalicOuter)[0].trim() : "";
const isPureItalic = italicSpans.length > 0 && italicSpans.length === $p.find("span,i,em").length && htmlBeforeFirstItalic === "";
// Section title: close the pending speaker turn, then emit the title.
if (tagName === "h3") {
flush();
addPoint({
code_style: "Titre",
code_grammaire: "TITRE_TEXTE_DISCUSSION",
texte: { _: text },
});
continue;
}
// Concatenate the text of every bold element, then split it on commas followed by
// whitespace: first piece = speaker name, second piece = quality (further pieces are dropped).
const boldSpans = $p.find("strong, b");
const joinedBold = norm(boldSpans
.map((_, el) => $(el).text() || "")
.get()
.join(""));
const [namePartRaw, qualPartRaw] = joinedBold.split(/\s*,\s+/, 2);
const namePart = namePartRaw ? normSpeaker(namePartRaw) : "";
const qualPart = qualPartRaw ? normQual(qualPartRaw) : "";
// A speaker header is bold text longer than 3 characters, starting with "M." or "Mme"
// followed by whitespace, that also opens the paragraph text.
const looksLikeName = namePart.length > 3 && /^(M\.|Mme)[\s\u00A0\u202F]+/i.test(namePart);
const startsWithName = namePart && text.startsWith(namePart);
// A header naming the speaker already being buffered does not start a new turn;
// such a paragraph falls through to the concatenation branch below.
const isNewSpeaker = looksLikeName && startsWithName && namePart !== currentOrateur;
if (isNewSpeaker) {
flush();
currentOrateur = namePart;
currentQualite = qualPart;
const remainingText = getRemainingTextAfterSpeakerHeader($, $p);
currentTexte = remainingText;
continue;
}
// Italic-only paragraphs, and paragraphs without bold text while no speaker is
// active, are emitted as italic info wrapped in <i>...</i>.
if (isPureItalic || (!joinedBold && !currentOrateur && text)) {
flush();
addPoint({
code_style: "Info Italiques",
code_grammaire: "PAROLE_GENERIQUE",
texte: { _: "<i>" + text + "</i>" },
});
continue;
}
// Same speaker continues: append the paragraph (header stripped) to the buffer,
// separated from the previous text by "<br/><br/>".
// A paragraph that reaches this point with no active speaker is dropped.
if (currentOrateur) {
const removeOrateurFromText = getRemainingTextAfterSpeakerHeader($, $p);
currentTexte += (currentTexte ? "<br/><br/>" : "") + removeOrateurFromText;
continue;
}
}
// Emit whatever is still buffered after the last paragraph.
flush();
return points;
}
// Matches a French clock time such as "14h30", "14 h 30", "à 9 heures" or
// "14 heures 30". Group 1 holds the hour digits, group 2 the optional minutes.
// "heures?" is tried before the bare "h": with "h" first, the match on
// "14 heures 30" stopped right after the "h" and the minutes were lost.
// The (?!\p{L}) lookahead rejects an "h" that only starts another word
// (e.g. "3 hommes"), which previously produced a bogus time.
const TIME_RE = /(?:\b[àa]\s*)?(\d{1,2})\s*(?:heures?|h)(?!\p{L})\s*(?:([0-5]\d))?/iu;
/**
 * Tidy a title: every run of whitespace becomes a single space and the result
 * is trimmed. A falsy input (undefined, null, "") yields "".
 *
 * @param t - raw title text.
 * @returns the cleaned title.
 */
export function cleanTitle(t) {
  const raw = t || "";
  const singleSpaced = raw.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
/**
 * Extract the first clock time matched by TIME_RE and format it as "HH:mm".
 *
 * Special spaces are normalized before matching. Missing minutes default to
 * "00"; the hour is left-padded to two digits.
 *
 * @param text - text that may contain a time.
 * @returns "HH:mm", or undefined when nothing matches or the hour is not within 0-23.
 */
function parseTimeToHHmm(text) {
  const found = TIME_RE.exec(normalizeSpaces(text));
  if (found === null) {
    return undefined;
  }
  const [, hourDigits, minuteDigits] = found;
  const hours = hourDigits?.padStart(2, "0");
  const minutes = (minuteDigits ?? "00").padStart(2, "0");
  const hourValue = Number(hours);
  const isValidHour = hourValue >= 0 && hourValue <= 23;
  return isValidHour ? `${hours}:${minutes}` : undefined;
}
/**
 * Look for a time in the siblings just before a heading.
 *
 * Up to three preceding siblings are examined, nearest first. For each one
 * the whole text is tried first, then the text of its first <i>/<em>
 * descendant. The first time found is returned.
 *
 * @param $ - unused; kept for signature compatibility.
 * @param $h3 - wrapped heading element.
 * @returns "HH:mm", or undefined when no time is found.
 */
function findNearbyTime($, $h3) {
  const maxLookback = 3;
  let $sibling = $h3.prev();
  let visited = 0;
  while (visited < maxLookback && $sibling.length) {
    const time = parseTimeToHHmm($sibling.text())
      || parseTimeToHHmm($sibling.find("i, em").first().text());
    if (time) {
      return time;
    }
    $sibling = $sibling.prev();
    visited += 1;
  }
  return undefined;
}
/**
 * List the <h3> sections that belong to a given day.
 *
 * The day starts at the <h2> returned by findDayRoot and extends over its
 * following siblings up to the next <h2>. Every <h3> in that range (either a
 * sibling itself or nested inside one) with a non-empty cleaned title becomes
 * a section.
 *
 * @param $ - wrapping/query function for the document.
 * @param dateISO - day to extract, passed to findDayRoot.
 * @returns array of `{ title, $start, time }`; empty when the day is not found.
 */
export function extractDayH3Sections($, dateISO) {
  const $day = findDayRoot($, dateISO);
  if ($day.length === 0) {
    return [];
  }
  const $daySiblings = $day.nextUntil("h2");
  const headings = $daySiblings.filter("h3").add($daySiblings.find("h3")).toArray();
  const result = [];
  for (const node of headings) {
    const $heading = $(node);
    const title = cleanTitle($heading.text());
    if (!title) {
      continue;
    }
    const time = findNearbyTime($, $heading);
    result.push({ title, $start: $heading, time });
  }
  return result;
}
/**
 * Build the record of one commission section from a parsed HTML page.
 *
 * Steps: derive the identifiers, check that the page contains the heading of
 * the requested day, gather the paragraphs that follow the section heading up
 * to the next <h2>/<h3>, turn them into points, then assemble metadata.
 *
 * @param $ - wrapping/query function for the parsed page.
 * @param htmlFilePath - path of the source file; only its basename is used (logs and `nomFichierJo`).
 * @param opts - `{ dateISO, hourShort, organe, section, matched }`; `section.$start`
 *   is the wrapped heading of the section, `matched` an optional matched reunion.
 * @returns `{ uid, seanceRef, sessionRef, metadonnees, contenu }`, or null when the
 *   day heading is missing, when fewer than 4 points were produced, when no
 *   "PAROLE_GENERIQUE" point carries `orateurs`, or when any error is thrown
 *   (errors are logged, never propagated).
 */
export function parseCommissionCRSectionFromDom($, htmlFilePath, opts) {
  try {
    const { dateISO, hourShort, organe, section, matched } = opts;
    // Every hop on `matched` is optional: a match without `uid` and without
    // events must fall back to `hourShort` rather than throw and abort the section.
    const seanceRef = matched?.uid
      ?? makeReunionUid(dateISO, "COM", matched?.events?.[0]?.id ?? hourShort ?? "", organe ?? undefined);
    const uid = seanceRef.replace(/^RU/, "CRC");
    const dateSeance = toCRDate(dateISO, matched?.startTime ?? hourShortToStartTime(hourShort));
    // The day heading is only used as a sanity check that the page covers this date.
    const $dayRoot = findDayRoot($, dateISO);
    if ($dayRoot.length === 0) {
      console.warn(`[COM-CR][parse] day root not found for ${dateISO} in ${path.basename(htmlFilePath)}`);
      return null;
    }
    const paras = [];
    // Start right after the section heading so the title itself is not collected.
    let $cursor = section.$start.next();
    while ($cursor.length && !$cursor.is("h2") && !$cursor.is("h3")) {
      if ($cursor.is(PARA_h3_SEL)) {
        paras.push($cursor);
      }
      else {
        // Container element: collect the matching descendants instead.
        $cursor.find(PARA_h3_SEL).each((_, p) => {
          paras.push($(p));
        });
      }
      $cursor = $cursor.next();
    }
    const points = buildPointsFromParagraphs($, paras);
    const hasIntervention = points.some((pt) => pt.code_grammaire === "PAROLE_GENERIQUE" && pt.orateurs);
    if (points.length < 4 || !hasIntervention) {
      console.warn(`[COM-CR][parse] Insufficient points or no interventions found for a section in ${path.basename(htmlFilePath)}`);
      return null;
    }
    // Months 10-12 belong to the session named after the current year,
    // months 01-09 to the one named after the previous year.
    const year = dateISO.slice(0, 4);
    const session = dateISO.slice(5, 7) >= "10" ? year : String(Number(year) - 1);
    const contenu = {
      quantiemes: { journee: dateISO, session },
      point: points,
    };
    const metadonnees = {
      dateSeance,
      dateSeanceJour: dateISO,
      numSeanceJour: "",
      numSeance: "",
      typeAssemblee: "SN",
      legislature: "",
      session,
      nomFichierJo: path.basename(htmlFilePath),
      validite: "non-certifie",
      etat: "definitif",
      diffusion: "publique",
      version: "1",
      environnement: "prod",
      heureGeneration: new Date(),
    };
    return { uid, seanceRef, sessionRef: session, metadonnees, contenu };
  }
  catch (e) {
    console.error(`[COM-CR][parse] error section file=${path.basename(htmlFilePath)}:`, e);
    return null;
  }
}