UNPKG

@tricoteuses/senat

Version:

Handle French Sénat's open data

270 lines (269 loc) 10.3 kB
import * as cheerio from "cheerio";
import path from "path";
import { makeReunionUid } from "../utils/reunion_parsing";
import { norm } from "../utils/string_cleaning";
import { frDateToISO, hourShortToStartTime } from "../utils/date";
import { toCRDate } from "./util";

// Selector for the elements treated as "paragraphs" of a commission report,
// including nested <h3> titles.
const PARA_h3_SEL = "p.sh_justify, p.sh_center, p.sh_marge, p[align], li, h3";

/**
 * Finds the first <h2> whose text contains a French weekday name followed by
 * a date that `frDateToISO` converts to `targetISO`.
 *
 * @param {Function} $ - Cheerio instance loaded with the report document.
 * @param {string} targetISO - Date to look for, compared with `===` against
 *   the value returned by `frDateToISO`.
 * @returns {object} Cheerio selection holding the matching <h2>, or an empty
 *   selection when none matches.
 */
function findDayRoot($, targetISO) {
  let $root = $();
  $("h2").each((_, el) => {
    const txt = norm($(el).text());
    const m = txt.match(/(?:Lundi|Mardi|Mercredi|Jeudi|Vendredi|Samedi|Dimanche)\s+(.+)$/i);
    const iso = m ? frDateToISO(m[1]) : undefined;
    if (iso === targetISO) {
      $root = $(el);
      // Only the first match is wanted: returning false stops the iteration.
      return false;
    }
    return undefined;
  });
  return $root;
}

/**
 * Replaces no-break, narrow no-break and thin spaces with a plain space.
 *
 * @param {string} s
 * @returns {string}
 */
function normalizeSpaces(s) {
  return s.replace(/[\u00A0\u202F\u2009]/g, " ");
}

/**
 * Strips a leading "intro" sequence: optional whitespace, an optional
 * `.`, `:` or `;`, then one or more dashes (en dash, em dash or hyphen).
 * The string is returned unchanged when no dash follows.
 *
 * @param {string} s
 * @returns {string}
 */
function stripIntroPunct(s) {
  return s.replace(/^[\s]*[.:;]?\s*(?:[–—-]\s*)+/u, "");
}

/**
 * Collects the bold nodes (<strong>/<b>) that open a paragraph, i.e. the
 * speaker header. Scanning stops at the first non-blank text node or at the
 * first tag that is neither bold nor a link directly wrapping bold children.
 *
 * @param {Function} $ - Cheerio instance.
 * @param {object} $clone - Cheerio selection of the paragraph to scan.
 * @returns {Array<object>} Raw DOM nodes, in document order.
 */
function collectLeadingHeaderStrongEls($, $clone) {
  const els = [];
  const nodes = $clone.contents().toArray();
  for (const node of nodes) {
    if (node.type === "text") {
      // Blank text between bold runs is skipped; real text ends the header.
      if (norm(node.data || "")) break;
      continue;
    }
    if (node.type === "tag") {
      const $n = $(node);
      if ($n.is("strong, b")) {
        els.push(node);
        continue;
      }
      if ($n.is("a") && $n.children("strong, b").length) {
        // Push raw nodes here too, so the returned array is homogeneous.
        $n.children("strong, b").each((_, el) => {
          els.push(el);
        });
        continue;
      }
      break;
    }
  }
  return els;
}

/**
 * Returns the text of a paragraph once the leading bold speaker header and
 * the intro punctuation that follows it have been removed.
 * The paragraph itself is not modified: the work is done on a clone.
 *
 * @param {Function} $ - Cheerio instance.
 * @param {object} $p - Cheerio selection of the paragraph.
 * @returns {string} Remaining text, passed through `norm`.
 */
export function getRemainingTextAfterSpeakerHeader($, $p) {
  const $clone = $p.clone();

  // 1) Remove the leading <strong>/<b> header nodes.
  const headerStrongEls = collectLeadingHeaderStrongEls($, $clone);
  for (const el of headerStrongEls) $(el).remove();

  // 2) Turn the remaining markup into text, then normalize spaces and strip
  //    the intro punctuation.
  const remainingHtml = $clone.html() || "";
  const plainText = normalizeSpaces(cheerio.load(remainingHtml).text());
  const withoutIntro = stripIntroPunct(plainText);
  return norm(withoutIntro || "");
}

/**
 * Converts an ordered list of paragraph selections into "point" records:
 * titles (<h3>), italic information lines and speaker interventions.
 * Consecutive paragraphs of the same speaker are merged into one
 * intervention, joined with "<br/><br/>".
 *
 * @param {Function} $ - Cheerio instance.
 * @param {Array<object>} paras - Cheerio selections, one per paragraph.
 * @returns {Array<object>} Points, each carrying a 1-based
 *   `ordre_absolu_seance` serialized as a string.
 */
function buildPointsFromParagraphs($, paras) {
  const points = [];
  let ordreAbsoluSeance = 0;

  // Speaker name: collapse whitespace and drop one trailing ":" or ".".
  const normSpeaker = (s) =>
    s
      .normalize("NFKC")
      .replace(/\s+/g, " ")
      .replace(/[:\.]\s*$/, "")
      .trim();

  // Speaker quality: drop a leading comma and trailing punctuation/dashes.
  const normQual = (s) =>
    s
      .normalize("NFKC")
      .replace(/\s+/g, " ")
      .replace(/^\s*,\s*|\s+$/g, "")
      .replace(/[\s\u00A0]*[.,;:–—-]+$/u, "")
      .trim();

  let currentOrateur = null;
  let currentQualite = "";
  let currentTexte = "";

  function isPresidentQual(qual) {
    return /\bprésident(e)?\b/i.test(qual);
  }

  // Emits the buffered intervention, if any. When the current speaker has no
  // text yet, nothing is emitted and the speaker is kept, so that following
  // paragraphs can still be attached to it.
  function flush() {
    if (!currentOrateur || !currentTexte.trim()) return;
    ordreAbsoluSeance++;
    points.push({
      code_grammaire: "PAROLE_GENERIQUE",
      roledebat: isPresidentQual(currentQualite) ? "président" : "",
      ordre_absolu_seance: String(ordreAbsoluSeance),
      orateurs: { orateur: { nom: currentOrateur, id: "", qualite: currentQualite || "" } },
      texte: { _: currentTexte.trim() },
    });
    currentOrateur = null;
    currentQualite = "";
    currentTexte = "";
  }

  // Appends a non-intervention point (title or information line).
  function addPoint(payload) {
    ordreAbsoluSeance++;
    points.push({ ...payload, ordre_absolu_seance: String(ordreAbsoluSeance) });
  }

  for (const $p of paras) {
    // Anything located inside a table is ignored.
    if ($p.closest("table").length) continue;

    const tagName = ($p.prop("tagName") || "").toString().toLowerCase();
    const rawText = ($p.text() || "").replace(/\u00a0/g, " ").trim();
    const text = norm(rawText);
    // Paragraphs of 3 characters or fewer are skipped.
    if (!text || text.length <= 3) continue;

    const html = ($p.html() || "").trim();
    const italicSpans = $p.find("i, em, span[style*='italic']");
    const firstItalicOuter = italicSpans.length ? $(italicSpans[0]).prop("outerHTML") || "" : "";
    const htmlBeforeFirstItalic = firstItalicOuter ? html.split(firstItalicOuter)[0].trim() : "";
    // "Pure italic": every span/i/em is italic and nothing precedes the first one.
    const isPureItalic =
      italicSpans.length > 0 &&
      italicSpans.length === $p.find("span,i,em").length &&
      htmlBeforeFirstItalic === "";

    if (tagName === "h3") {
      flush();
      addPoint({
        code_style: "Titre",
        code_grammaire: "TITRE_TEXTE_DISCUSSION",
        texte: { _: text },
      });
      continue;
    }

    // The bold text is expected to read "<name>, <quality>".
    const boldSpans = $p.find("strong, b");
    const joinedBold = norm(
      boldSpans
        .map((_, el) => $(el).text() || "")
        .get()
        .join(""),
    );
    const [namePartRaw, qualPartRaw] = joinedBold.split(/\s*,\s+/, 2);
    const namePart = namePartRaw ? normSpeaker(namePartRaw) : "";
    const qualPart = qualPartRaw ? normQual(qualPartRaw) : "";

    const looksLikeName = namePart.length > 3 && /^(M\.|Mme)[\s\u00A0\u202F]+/i.test(namePart);
    const startsWithName = namePart && text.startsWith(namePart);
    const isNewSpeaker = looksLikeName && startsWithName && namePart !== currentOrateur;

    if (isNewSpeaker) {
      flush();
      currentOrateur = namePart;
      currentQualite = qualPart;
      currentTexte = getRemainingTextAfterSpeakerHeader($, $p);
      continue;
    }

    if (isPureItalic || (!joinedBold && !currentOrateur && text)) {
      flush();
      addPoint({
        code_style: "Info Italiques",
        code_grammaire: "PAROLE_GENERIQUE",
        texte: { _: `<i>${text}</i>` },
      });
      continue;
    }

    // Same speaker as before: append the paragraph to the buffered text.
    if (currentOrateur) {
      const continuation = getRemainingTextAfterSpeakerHeader($, $p);
      currentTexte += (currentTexte ? "<br/><br/>" : "") + continuation;
    }
  }

  flush();
  return points;
}

// Matches times such as "9 h 35", "14h30" or "16 heures".
// - (?<!\d) prevents matching the tail of a longer number ("115 h").
// - (?!\p{L}) prevents matching a word that merely starts with "h"
//   ("12 habitants", "15 hôpitaux").
const TIME_RE = /(?:\b[àa]\s*)?(?<!\d)(\d{1,2})\s*(?:heures?|h)(?!\p{L})\s*(?:([0-5]\d))?/iu;

/**
 * Collapses every whitespace run into a single space and trims the result.
 *
 * @param {string} [t] - Falsy values are treated as an empty string.
 * @returns {string}
 */
export function cleanTitle(t) {
  return (t || "").replace(/\s+/g, " ").trim();
}

/**
 * Extracts the first time found in `text` as "HH:mm".
 *
 * @param {string} text
 * @returns {string|undefined} Zero-padded time, or undefined when no time is
 *   found or the hour is greater than 23. Minutes default to "00".
 */
function parseTimeToHHmm(text) {
  const m = normalizeSpaces(text).match(TIME_RE);
  if (!m) return undefined;
  const hh = m[1]?.padStart(2, "0");
  const mm = (m[2] ?? "00").padStart(2, "0");
  const h = Number(hh);
  if (h >= 0 && h <= 23) return `${hh}:${mm}`;
  return undefined;
}

/**
 * Looks for a time in up to three siblings preceding `$h3`, nearest first.
 * For each sibling the whole text is tried first, then the text of its first
 * <i>/<em> descendant.
 *
 * @param {Function} $ - Cheerio instance (unused, kept for call symmetry).
 * @param {object} $h3 - Cheerio selection of the section title.
 * @returns {string|undefined} "HH:mm", or undefined when nothing is found.
 */
function findNearbyTime($, $h3) {
  let cur = $h3.prev();
  for (let i = 0; i < 3 && cur.length; i++, cur = cur.prev()) {
    const direct = parseTimeToHHmm(cur.text());
    if (direct) return direct;
    const italic = parseTimeToHHmm(cur.find("i, em").first().text());
    if (italic) return italic;
  }
  return undefined;
}

/**
 * Lists the <h3> sections that follow the <h2> of the requested day, up to
 * the next <h2>. Both sibling-level and nested <h3> elements are included;
 * titles that are empty after cleaning are skipped.
 *
 * @param {Function} $ - Cheerio instance loaded with the report document.
 * @param {string} dateISO - Day to extract, in the format `frDateToISO` returns.
 * @returns {Array<{title: string, $start: object, time: (string|undefined)}>}
 *   One entry per section; empty when the day heading is not found.
 */
export function extractDayH3Sections($, dateISO) {
  const sections = [];
  const $dayRoot = findDayRoot($, dateISO);
  if ($dayRoot.length === 0) return sections;

  const $range = $dayRoot.nextUntil("h2");
  const $h3s = $range.filter("h3").add($range.find("h3"));
  $h3s.each((_, el) => {
    const $h3 = $(el);
    const title = cleanTitle($h3.text());
    if (!title) return;
    const time = findNearbyTime($, $h3);
    sections.push({ title, $start: $h3, time });
  });
  return sections;
}

/**
 * Builds a commission report record for one <h3> section of a day.
 *
 * @param {Function} $ - Cheerio instance loaded with the report document.
 * @param {string} htmlFilePath - Source file path; only its basename is used
 *   (log messages and `nomFichierJo`).
 * @param {object} opts
 * @param {string} opts.dateISO - Day of the meeting; sliced as "YYYY-MM-…".
 * @param {string} [opts.hourShort] - Fallback hour identifier, used when
 *   `matched` provides neither a uid nor an event id.
 * @param {string} [opts.organe] - Forwarded to `makeReunionUid`.
 * @param {object} opts.section - Section as returned by `extractDayH3Sections`;
 *   only `$start` is read.
 * @param {object} [opts.matched] - Already-known meeting; `uid`, `events[0].id`
 *   and `startTime` are read when present.
 * @returns {object|null} `{ uid, seanceRef, sessionRef, metadonnees, contenu }`,
 *   or null when the day heading is missing, when fewer than 4 points or no
 *   speaker intervention were found, or when an error was caught (errors are
 *   logged, never rethrown).
 */
export function parseCommissionCRSectionFromDom($, htmlFilePath, opts) {
  try {
    const { dateISO, hourShort, organe, section, matched } = opts;

    // `matched.events` may be missing or empty: chain optionally all the way
    // down so the `hourShort` fallback is actually reachable.
    const seanceRef =
      matched?.uid ??
      makeReunionUid(dateISO, "COM", matched?.events?.[0]?.id ?? hourShort ?? "", organe ?? undefined);
    const uid = seanceRef.replace(/^RU/, "CRC");
    const dateSeance = toCRDate(dateISO, matched?.startTime ?? hourShortToStartTime(hourShort));

    const $dayRoot = findDayRoot($, dateISO);
    if ($dayRoot.length === 0) {
      console.warn(`[COM-CR][parse] day root not found for ${dateISO} in ${path.basename(htmlFilePath)}`);
      return null;
    }

    const paras = [];
    // Start right after the section title: the title itself is not a point.
    let $cursor = section.$start.next();
    while ($cursor.length && !$cursor.is("h2") && !$cursor.is("h3")) {
      if ($cursor.is(PARA_h3_SEL)) {
        paras.push($cursor);
      } else {
        // Container element: collect the paragraphs it holds.
        const $ps = $cursor.find(PARA_h3_SEL);
        if ($ps.length) {
          $ps.each((_, p) => {
            paras.push($(p));
          });
        }
      }
      $cursor = $cursor.next();
    }

    const points = buildPointsFromParagraphs($, paras);
    if (points.length < 4 || !points.some((pt) => pt.code_grammaire === "PAROLE_GENERIQUE" && pt.orateurs)) {
      console.warn(
        `[COM-CR][parse] Insufficient points or no interventions found for a section in ${path.basename(htmlFilePath)}`,
      );
      return null;
    }

    // Months 10-12 belong to the session named after the current year,
    // earlier months to the one named after the previous year.
    const session = dateISO.slice(5, 7) >= "10" ? `${dateISO.slice(0, 4)}` : `${Number(dateISO.slice(0, 4)) - 1}`;

    const contenu = {
      quantiemes: { journee: dateISO, session },
      point: points,
    };
    const metadonnees = {
      dateSeance,
      dateSeanceJour: dateISO,
      numSeanceJour: "",
      numSeance: "",
      typeAssemblee: "SN",
      legislature: "",
      session,
      nomFichierJo: path.basename(htmlFilePath),
      validite: "non-certifie",
      etat: "definitif",
      diffusion: "publique",
      version: "1",
      environnement: "prod",
      heureGeneration: new Date(),
    };
    return { uid, seanceRef, sessionRef: session, metadonnees, contenu };
  } catch (e) {
    console.error(`[COM-CR][parse] error section file=${path.basename(htmlFilePath)}:`, e);
    return null;
  }
}