UNPKG

@tricoteuses/senat

Version:

Handle French Sénat's open data

251 lines (250 loc) 8.81 kB
import { parseISO } from "./reunion_parsing";
import { normalizeText } from "./string_cleaning";

/** Words dropped by `tokens`; built once rather than on every call. */
const STOP_WORDS = new Set(["de", "du", "des", "la", "le", "les", "et", "au", "aux", "sur", "en", "d", "l", "un", "une"]);

/** Patterns tested by `isNoiseBlock` against the normalized block text. */
const NOISE_PATTERNS = [
    /^article\s+\d+/,
    /\b(organisation des travaux|ordre du jour|suspension|reprise de la seance)\b/,
    /\b(vice presidente|president|secretaire|ministre|rapporteur)\b/,
];

/**
 * Ordered `[key, substrings]` rules used by `getOrgKey`.
 * The first rule with at least one matching substring wins, so order matters.
 */
const ORG_KEY_RULES = [
    ["seance_publique", ["seance publique"]],
    ["culture", ["culture"]],
    ["finances", ["finances"]],
    ["affaires_sociales", ["sociales"]],
    ["affaires_economiques", ["economiques"]],
    ["affaires_europeennes", ["europeennes"]],
    ["affaires_etrangeres_defense", ["etrangeres", "forces armees", "defense"]],
    ["amenagement_territoire_dd", ["territoire", "durable"]],
    ["lois", ["commission des lois"]],
    ["delegation_collectivites", ["delegation aux collectivites territoriales", "delegation a la decentralisation"]],
    ["delegation_droits_femmes", ["delegation aux droits des femmes", "egalite des chances entre les hommes et les femmes"]],
    ["delegation_entreprises", ["delegation aux entreprises"]],
    ["delegation_outre_mer", ["delegation senatoriale aux outre mer", "delegation aux outre mer"]],
    ["delegation_prospective", ["delegation a la prospective"]],
    ["opecst", ["office parlementaire d evaluation des choix scientifiques et technologiques", "opecst"]],
];

/**
 * Jaccard index of two sets: |a ∩ b| / |a ∪ b|.
 *
 * @param {Set<string>} a
 * @param {Set<string>} b
 * @returns {number} Value in [0, 1]; 0 when either set is empty.
 */
export function jaccard(a, b) {
    if (!a.size || !b.size) {
        return 0;
    }
    const shared = intersectionSize(a, b);
    return shared / (a.size + b.size - shared);
}

/**
 * Jaccard index of the space-separated tokens of two texts, each passed
 * through `normalizeText` first. No stop-word or length filtering is applied.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Value in [0, 1]; 0 when either text yields no token.
 */
export function jaccardTokenSim(a, b) {
    const left = new Set(normalizeText(a).split(" ").filter(Boolean));
    const right = new Set(normalizeText(b).split(" ").filter(Boolean));
    return jaccard(left, right);
}

/**
 * Tells whether a block of text should be ignored when matching.
 *
 * A block is noise when its normalized form is empty, shorter than
 * 6 characters, or matches one of `NOISE_PATTERNS`.
 *
 * @param {string} text
 * @returns {boolean}
 */
export function isNoiseBlock(text) {
    const t = normalizeText(text);
    if (!t || t.length < 6) {
        return true;
    }
    return NOISE_PATTERNS.some((pattern) => pattern.test(t));
}

/**
 * Robust score for "short block vs long event":
 * - block coverage = |A∩B| / |A| is the main criterion;
 * - Jaccard is secondary (useful when both texts are long);
 * - an inclusion bonus and an optional time-of-day bonus are added.
 *
 * @param {string} blockText Text of the block.
 * @param {{titre?: string, objet?: string, startTime?: string}} ev Event whose
 *   `titre` and `objet` are compared with the block; `startTime` is handed to
 *   `parseISO` for the time bonus.
 * @returns {number} Score clamped to [0, 1]; 0 when either side has no token.
 */
export function scoreSommaireBlockForEvent(blockText, ev) {
    const evText = `${ev.titre ?? ""} ${ev.objet ?? ""}`;
    const blockTokens = tokens(blockText);
    const eventTokens = tokens(evText);
    if (blockTokens.size === 0 || eventTokens.size === 0) {
        return 0;
    }
    const coverageBloc = intersectionSize(blockTokens, eventTokens) / blockTokens.size;
    const jac = jaccard(blockTokens, eventTokens);
    // Raw inclusion bonus: the whole normalized block appears verbatim in the event text.
    const bNorm = normalizeText(blockText);
    const eNorm = normalizeText(evText);
    const inclusion = bNorm.length >= 10 && eNorm.includes(bNorm) ? 0.12 : 0;
    // Time bonus, when the event carries a start time.
    const timeBonus = timeProximityBonus(ev.startTime ?? null, blockText);
    // Combination: coverage dominates.
    const score = Math.max(coverageBloc, jac) * 0.85 + Math.min(1, jac) * 0.1 + inclusion + timeBonus;
    return Math.max(0, Math.min(1, score));
}

/**
 * Counts the elements of `a` that are also in `b`.
 *
 * @param {Iterable<string>} a
 * @param {Set<string>} b
 * @returns {number}
 */
function intersectionSize(a, b) {
    let count = 0;
    for (const item of a) {
        if (b.has(item)) {
            count++;
        }
    }
    return count;
}

/**
 * Tokenizes a text: `normalizeText`, split on single spaces, then keep only
 * words of at least 3 characters that are not in `STOP_WORDS`.
 *
 * @param {string} s
 * @returns {Set<string>} A freshly allocated set of tokens.
 */
function tokens(s) {
    const words = normalizeText(s).split(" ");
    return new Set(words.filter((w) => w.length >= 3 && !STOP_WORDS.has(w)));
}

/**
 * Bonus granted when a time written in the block ("14h30", "9 h 05", ...) is
 * close to the event start time: 0.1 within 30 minutes, 0.06 within 60,
 * 0.03 within 120, otherwise 0.
 *
 * @param {string|null} eventStartISO Event start, parsed with `parseISO`.
 *   NOTE(review): the parsed value is read through `.hour` and `.minute`;
 *   confirm its exact type against `./reunion_parsing`.
 * @param {string} blockText
 * @returns {number}
 */
function timeProximityBonus(eventStartISO, blockText) {
    if (!eventStartISO) {
        return 0;
    }
    const dt = parseISO(eventStartISO);
    if (!dt) {
        return 0;
    }
    const eventMin = dt.hour * 60 + dt.minute;
    const hints = extractHourHints(blockText);
    if (!hints.length) {
        return 0;
    }
    // Smallest gap, in minutes, between any hinted time and the event start.
    let best = Infinity;
    for (const hint of hints) {
        best = Math.min(best, Math.abs(hint.h * 60 + hint.m - eventMin));
    }
    if (best <= 30) {
        return 0.1;
    }
    if (best <= 60) {
        return 0.06;
    }
    if (best <= 120) {
        return 0.03;
    }
    return 0;
}

/**
 * Extracts every "<1-2 digits> h <2 digits>" occurrence from a text.
 * The numbers are not range-checked.
 *
 * @param {string} text
 * @returns {{h: number, m: number}[]} Hour/minute pairs in order of appearance.
 */
function extractHourHints(text) {
    const lowered = (text || "").toLowerCase();
    return Array.from(lowered.matchAll(/\b(\d{1,2})\s*h\s*(\d{2})\b/g), (match) => ({
        h: Number(match[1]),
        m: Number(match[2]),
    }));
}

/**
 * Maps an organ name to a short stable key by substring matching against
 * `ORG_KEY_RULES`, in order.
 *
 * @param {string} norm Organ name. The rules are lowercase and accent-free, so
 *   the input is presumably already normalized that way — verify against callers.
 * @returns {string} The key of the first matching rule, or "autre" when `norm`
 *   is empty or nothing matches.
 */
export function getOrgKey(norm) {
    if (!norm) {
        return "autre";
    }
    for (const [key, needles] of ORG_KEY_RULES) {
        if (needles.some((needle) => norm.includes(needle))) {
            return key;
        }
    }
    return "autre";
}

/**
 * Splits a text into non-empty words after applying `normalize`.
 *
 * @param {string} s
 * @returns {string[]}
 */
function tokensDice(s) {
    return normalize(s).split(" ").filter(Boolean);
}

/**
 * Sørensen–Dice coefficient over the unfiltered word sets of two texts:
 * 2·|A∩B| / (|A| + |B|).
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Value in [0, 1]; 0 when either text has no word.
 */
export function dice(a, b) {
    const left = new Set(tokensDice(a));
    const right = new Set(tokensDice(b));
    if (!left.size || !right.size) {
        return 0;
    }
    return (2 * intersectionSize(left, right)) / (left.size + right.size);
}

/**
 * Null-safe wrapper around `tokens`.
 *
 * @param {string|null|undefined} s
 * @returns {Set<string>} Empty set for a falsy input.
 */
function tokenSet(s) {
    // `tokens` already returns a fresh Set, so no defensive copy is needed.
    return s ? tokens(s) : new Set();
}

/**
 * Share of the reference's filtered tokens that also occur in the candidate.
 *
 * @param {string} reference
 * @param {string} candidate
 * @returns {number} |A∩B| / |A|, or 0 when either side has no token.
 */
export function coverage(reference, candidate) {
    const refTokens = tokenSet(reference);
    const candTokens = tokenSet(candidate);
    if (!refTokens.size || !candTokens.size) {
        return 0;
    }
    return intersectionSize(refTokens, candTokens) / refTokens.size;
}

/**
 * Sørensen–Dice coefficient over the filtered token sets (see `tokens`).
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Value in [0, 1]; 0 when either side has no token.
 */
export function diceFiltered(a, b) {
    const left = tokenSet(a);
    const right = tokenSet(b);
    if (!left.size || !right.size) {
        return 0;
    }
    return (2 * intersectionSize(left, right)) / (left.size + right.size);
}

/**
 * Blended similarity: 0.7 × coverage(a, b) + 0.3 × diceFiltered(a, b).
 *
 * Both components use the same token sets, so they are computed here from a
 * single tokenization pass instead of calling the two functions.
 *
 * @param {string} a Reference text (denominator of the coverage part).
 * @param {string} b Candidate text.
 * @returns {number} Value in [0, 1]; 0 when either side has no token.
 */
export function similarityScore(a, b) {
    const refTokens = tokenSet(a);
    const candTokens = tokenSet(b);
    if (!refTokens.size || !candTokens.size) {
        return 0;
    }
    const shared = intersectionSize(refTokens, candTokens);
    const cov = shared / refTokens.size;
    const d = (2 * shared) / (refTokens.size + candTokens.size);
    return 0.7 * cov + 0.3 * d;
}

/**
 * Lowercases, strips combining diacritics, replaces every character that is
 * not a letter, digit, whitespace or hyphen with a space, then collapses
 * whitespace runs and trims.
 *
 * @param {string|null|undefined} s
 * @returns {string}
 */
export function normalize(s) {
    return (s ?? "")
        .toLowerCase()
        .normalize("NFD")
        .replace(/[\u0300-\u036f]/g, "")
        .replace(/[^\p{L}\p{N}\s-]/gu, " ")
        .replace(/\s+/g, " ")
        .trim();
}

/**
 * Normalizes a room label for comparison.
 *
 * @param {string|null|undefined} s
 * @returns {string|null} The first code made of one uppercase ASCII letter
 *   followed by 2-4 digits (e.g. "A263") when present, otherwise the trimmed
 *   lowercased label, or null for an empty label.
 */
export function normalizeSalle(s) {
    const label = (s ?? "").trim();
    if (!label) {
        return null;
    }
    const code = label.match(/\b([A-Z]\d{2,4})\b/);
    return code ? code[1] : label.toLowerCase();
}

/**
 * Scores how well a video matches an agenda entry.
 *
 * Signals:
 * - title: best `similarityScore` of the agenda `objet`/`titre` against the
 *   video title and every chapter label;
 * - time: exp(-Δ/60), where Δ is |videoEpoch - agendaTs| / 60;
 * - organ: best `similarityScore` of `agenda.organe` against `videoOrganes`;
 * - room: 1 when both normalized rooms are known and equal, else 0.
 *
 * @param {{objet?: string, titre?: string, organe?: string, lieu?: string}} agenda
 * @param {number} agendaTs Agenda timestamp — presumably epoch seconds, since
 *   the difference is divided by 60 to get minutes; TODO confirm.
 * @param {boolean} sameOrg When true, `sameOrgBonus` replaces the weighted organ score.
 * @param {{titleMin?: number, wTitle: number, wOrg: number, wSalle: number,
 *   wTime: number, sameOrgBonus: number}} w Weights.
 * @param {string} videoTitle
 * @param {number} videoEpoch Same unit as `agendaTs`.
 * @param {string[]} [videoOrganes]
 * @param {boolean} [timeAmbigious=false] When true and the room does not
 *   match, the time part is left out of the score.
 * @param {string} [salle] Room of the video.
 * @param {{label: string}[]} [chapterTitles]
 * @returns {{score: number, signals: object}} `score` is 0 when `w.titleMin`
 *   is set and the title score is below it.
 */
export function scoreVideo(agenda, agendaTs, sameOrg, w, videoTitle, videoEpoch, videoOrganes, timeAmbigious = false, salle, chapterTitles) {
    const weights = w;
    const agendaObjet = agenda.objet || "";
    const agendaTitre = agenda.titre || "";

    let titleScore = Math.max(similarityScore(agendaObjet, videoTitle || ""), similarityScore(agendaTitre, videoTitle || ""));
    for (const chapter of chapterTitles || []) {
        const chapterScore = Math.max(similarityScore(agendaObjet, chapter.label), similarityScore(agendaTitre, chapter.label));
        titleScore = Math.max(titleScore, chapterScore);
    }

    let timeScore = 0;
    if (agendaTs && videoEpoch) {
        const deltaMin = Math.abs(videoEpoch - agendaTs) / 60;
        timeScore = Math.exp(-deltaMin / 60);
    }

    let orgScore = 0;
    if (agenda.organe && videoOrganes?.length) {
        orgScore = Math.max(...videoOrganes.map((organe) => similarityScore(agenda.organe, organe)));
    }

    // Room: compare normalized labels such as "A263".
    const agendaSalle = normalizeSalle(agenda.lieu);
    const videoSalle = normalizeSalle(salle);
    const salleScore = agendaSalle && videoSalle && agendaSalle === videoSalle ? 1 : 0;

    const signals = { titleScore, orgScore, salleScore, timeScore, sameOrg, timeAmbigious };

    // Gate: reject outright when the title similarity is below the minimum.
    if (weights.titleMin && titleScore < weights.titleMin) {
        return { score: 0, signals };
    }

    const titleAndOrg = weights.wTitle * titleScore + (sameOrg ? weights.sameOrgBonus : weights.wOrg * orgScore);
    // Ambiguous time and no room match: ignore the time part entirely.
    const score = timeAmbigious && salleScore === 0
        ? titleAndOrg
        : titleAndOrg + weights.wSalle * salleScore + weights.wTime * timeScore;
    return { score, signals };
}