@tricoteuses/senat
Version:
Handle French Sénat's open data
251 lines (250 loc) • 8.81 kB
JavaScript
import { parseISO } from "./reunion_parsing";
import { normalizeText } from "./string_cleaning";
/**
 * Jaccard similarity of two sets: |a ∩ b| / |a ∪ b|.
 *
 * @param {Set<*>} a - First set; its members are the ones iterated.
 * @param {Set<*>} b - Second set; only queried through `has`.
 * @returns {number} Ratio in [0, 1]; 0 as soon as either set has a falsy size.
 */
export function jaccard(a, b) {
  if (!(a.size && b.size)) {
    return 0;
  }
  const shared = [...a].filter((item) => b.has(item)).length;
  // Union size by inclusion-exclusion: |a| + |b| - |a ∩ b|.
  const unionSize = a.size + b.size - shared;
  return shared / unionSize;
}
/**
 * Jaccard similarity between the word sets of two strings.
 *
 * Each string goes through `normalizeText`, is split on single spaces and
 * has its empty fragments dropped before the sets are compared.
 *
 * @param {string} a - First text.
 * @param {string} b - Second text.
 * @returns {number} |A ∩ B| / |A ∪ B|, or 0 when either word set is empty.
 */
export function jaccardTokenSim(a, b) {
  const toWordSet = (text) => new Set(normalizeText(text).split(" ").filter((word) => word !== ""));
  const left = toWordSet(a);
  const right = toWordSet(b);
  if (!left.size || !right.size) {
    return 0;
  }
  let shared = 0;
  left.forEach((word) => {
    if (right.has(word)) {
      shared += 1;
    }
  });
  return shared / (left.size + right.size - shared);
}
/**
 * Heuristic filter that flags a text block as noise.
 *
 * A block is noise when its normalized text is empty, shorter than 6
 * characters, starts with "article <number>", or contains one of the
 * procedural phrases or role words listed below.
 * NOTE(review): the patterns are lowercase and unaccented, so this presumably
 * relies on `normalizeText` lowercasing and stripping accents — confirm.
 *
 * @param {string} text - Raw block text.
 * @returns {boolean} True when the block should be ignored.
 */
export function isNoiseBlock(text) {
  const cleaned = normalizeText(text);
  if (!cleaned || cleaned.length < 6) {
    return true;
  }
  const noisePatterns = [
    // Article headings.
    /^article\s+\d+/,
    // Procedural markers.
    /\b(organisation des travaux|ordre du jour|suspension|reprise de la seance)\b/,
    // Role words.
    /\b(vice presidente|president|secretaire|ministre|rapporteur)\b/,
  ];
  return noisePatterns.some((pattern) => pattern.test(cleaned));
}
/**
 * Robust score for matching a short summary block against a long event.
 *
 * - coverageBloc = |A ∩ B| / |A| is the main criterion (A = block tokens,
 *   B = event tokens).
 * - Jaccard is secondary (useful when both texts are long).
 * - A fixed bonus is added when the normalized block text (at least 10
 *   characters) appears verbatim inside the normalized event text.
 * - An optional time bonus comes from `timeProximityBonus`.
 *
 * @param {string} blockText - Text of the summary block.
 * @param {{titre?: string, objet?: string, startTime?: string}} ev - Event;
 *   `titre` and `objet` are concatenated to form the event text.
 * @returns {number} Score clamped to [0, 1]; 0 when either token set is empty.
 */
export function scoreSommaireBlockForEvent(blockText, ev) {
  const eventText = `${ev.titre ?? ""} ${ev.objet ?? ""}`;
  const blockTokens = tokens(blockText);
  const eventTokens = tokens(eventText);
  if (blockTokens.size === 0 || eventTokens.size === 0) {
    return 0;
  }
  // Share of the block's tokens that also occur in the event: the key signal.
  const coverageBloc = intersectionSize(blockTokens, eventTokens) / blockTokens.size;
  const jac = jaccard(blockTokens, eventTokens);
  // Raw inclusion bonus (useful for exact expressions).
  const normalizedBlock = normalizeText(blockText);
  const normalizedEvent = normalizeText(eventText);
  let inclusion = 0;
  if (normalizedBlock.length >= 10 && normalizedEvent.includes(normalizedBlock)) {
    inclusion = 0.12;
  }
  // Time bonus (only when the event carries a start time).
  const timeBonus = timeProximityBonus(ev.startTime ?? null, blockText);
  // Combination: coverage dominates.
  const score = Math.max(coverageBloc, jac) * 0.85 + Math.min(1, jac) * 0.1 + inclusion + timeBonus;
  return Math.max(0, Math.min(1, score));
}
/**
 * Counts how many items yielded by `a` are members of `b`.
 *
 * @param {Iterable<*>} a - Items to check (iterated once).
 * @param {{has: function(*): boolean}} b - Membership container, e.g. a Set.
 * @returns {number} Number of items of `a` for which `b.has(item)` is truthy.
 */
function intersectionSize(a, b) {
  return [...a].reduce((count, item) => (b.has(item) ? count + 1 : count), 0);
}
/**
 * Builds the set of significant words of a string.
 *
 * The string goes through `normalizeText` and is split on single spaces;
 * words shorter than 3 characters and listed stop words are discarded.
 *
 * @param {string} s - Text to tokenize.
 * @returns {Set<string>} Distinct kept words, in order of first appearance.
 */
function tokens(s) {
  const stopWords = new Set(["de", "du", "des", "la", "le", "les", "et", "au", "aux", "sur", "en", "d", "l", "un", "une"]);
  const kept = new Set();
  for (const word of normalizeText(s).split(" ")) {
    if (word.length < 3 || stopWords.has(word)) {
      continue;
    }
    kept.add(word);
  }
  return kept;
}
/**
 * Bonus awarded when a clock time mentioned in the block text is close to
 * the event start time.
 *
 * @param {string|null} eventStartISO - Event start, handed to `parseISO`.
 *   NOTE(review): the parsed value is read through `.hour` and `.minute`;
 *   confirm what `parseISO` returns and in which time zone.
 * @param {string} blockText - Text scanned for "HHhMM" hints.
 * @returns {number} 0.1 within 30 minutes, 0.06 within 60, 0.03 within 120,
 *   otherwise 0 (also 0 when the start time or the hints are missing).
 */
function timeProximityBonus(eventStartISO, blockText) {
  if (!eventStartISO) {
    return 0;
  }
  const start = parseISO(eventStartISO);
  if (!start) {
    return 0;
  }
  const eventMinutes = start.hour * 60 + start.minute;
  const hints = extractHourHints(blockText);
  if (hints.length === 0) {
    return 0;
  }
  // Smallest gap, in minutes, between any hinted time and the event start.
  const closest = Math.min(...hints.map(({ h, m }) => Math.abs(h * 60 + m - eventMinutes)));
  const tiers = [
    [30, 0.1],
    [60, 0.06],
    [120, 0.03],
  ];
  for (const [maxGap, bonus] of tiers) {
    if (closest <= maxGap) {
      return bonus;
    }
  }
  return 0;
}
/**
 * Extracts clock-time mentions written as "14h30", "14 h 30" or "9H05".
 *
 * Matches whose hour exceeds 23 or whose minutes exceed 59 are discarded:
 * they cannot be times of day, and keeping them could place a bogus hint
 * close to a late-evening event.
 *
 * @param {string|null|undefined} text - Text to scan; falsy input yields [].
 * @returns {Array<{h: number, m: number}>} Valid hints in order of appearance.
 */
function extractHourHints(text) {
  const source = (text || "").toLowerCase();
  const hints = [];
  for (const match of source.matchAll(/\b(\d{1,2})\s*h\s*(\d{2})\b/g)) {
    const h = Number(match[1]);
    const m = Number(match[2]);
    // Skip values that are not a valid time of day.
    if (h > 23 || m > 59) {
      continue;
    }
    hints.push({ h, m });
  }
  return hints;
}
/**
 * Maps a normalized body name to a short key.
 *
 * Rules are tried in order and the first one with a matching substring wins,
 * so earlier rules take priority (e.g. any name containing "culture" maps to
 * "culture", including words such as "agriculture").
 *
 * @param {string} norm - Body name; the needles are lowercase and unaccented,
 *   so the caller is expected to pass text normalized the same way.
 * @returns {string} The matching key, or "autre" for empty or unmatched input.
 */
export function getOrgKey(norm) {
  if (!norm) {
    return "autre";
  }
  // Ordered [key, needles] pairs; the order is significant.
  const rules = [
    ["seance_publique", ["seance publique"]],
    ["culture", ["culture"]],
    ["finances", ["finances"]],
    ["affaires_sociales", ["sociales"]],
    ["affaires_economiques", ["economiques"]],
    ["affaires_europeennes", ["europeennes"]],
    ["affaires_etrangeres_defense", ["etrangeres", "forces armees", "defense"]],
    ["amenagement_territoire_dd", ["territoire", "durable"]],
    ["lois", ["commission des lois"]],
    ["delegation_collectivites", ["delegation aux collectivites territoriales", "delegation a la decentralisation"]],
    ["delegation_droits_femmes", ["delegation aux droits des femmes", "egalite des chances entre les hommes et les femmes"]],
    ["delegation_entreprises", ["delegation aux entreprises"]],
    ["delegation_outre_mer", ["delegation senatoriale aux outre mer", "delegation aux outre mer"]],
    ["delegation_prospective", ["delegation a la prospective"]],
    ["opecst", ["office parlementaire d evaluation des choix scientifiques et technologiques", "opecst"]],
  ];
  const hit = rules.find(([, needles]) => needles.some((needle) => norm.includes(needle)));
  return hit ? hit[0] : "autre";
}
/**
 * Splits a string into its normalized words, duplicates included.
 *
 * @param {string|null|undefined} s - Text to split (passed to `normalize`).
 * @returns {string[]} Non-empty words in their original order.
 */
function tokensDice(s) {
  const words = normalize(s).split(" ");
  // An empty normalized string splits into [""], hence the filter.
  return words.filter((word) => word.length > 0);
}
/**
 * Sørensen–Dice coefficient over the distinct words of two strings:
 * 2·|A ∩ B| / (|A| + |B|). Words come from `tokensDice`, so no stop-word or
 * length filtering is applied here.
 *
 * @param {string} a - First text.
 * @param {string} b - Second text.
 * @returns {number} Coefficient in [0, 1]; 0 when either word set is empty.
 */
export function dice(a, b) {
  const left = new Set(tokensDice(a));
  const right = new Set(tokensDice(b));
  if (left.size === 0 || right.size === 0) {
    return 0;
  }
  let shared = 0;
  left.forEach((word) => {
    if (right.has(word)) {
      shared += 1;
    }
  });
  return (2 * shared) / (left.size + right.size);
}
/**
 * Null-safe wrapper around `tokens`.
 *
 * @param {string|null|undefined} s - Text to tokenize.
 * @returns {Set<string>} A fresh set of filtered tokens; empty for falsy input.
 */
function tokenSet(s) {
  return s ? new Set(tokens(s)) : new Set();
}
/**
 * Share of the reference's filtered tokens that also occur in the candidate:
 * |A ∩ B| / |A|. The measure is asymmetric: only the reference size is the
 * denominator.
 *
 * @param {string} reference - Text whose tokens must be covered.
 * @param {string} candidate - Text searched for those tokens.
 * @returns {number} Ratio in [0, 1]; 0 when either token set is empty.
 */
export function coverage(reference, candidate) {
  const wanted = tokenSet(reference);
  const available = tokenSet(candidate);
  if (wanted.size === 0 || available.size === 0) {
    return 0;
  }
  const found = [...wanted].filter((token) => available.has(token)).length;
  return found / wanted.size;
}
/**
 * Sørensen–Dice coefficient computed on filtered tokens (see `tokenSet`):
 * 2·|A ∩ B| / (|A| + |B|).
 *
 * @param {string} a - First text.
 * @param {string} b - Second text.
 * @returns {number} Coefficient in [0, 1]; 0 when either token set is empty.
 */
export function diceFiltered(a, b) {
  const left = tokenSet(a);
  const right = tokenSet(b);
  if (left.size === 0 || right.size === 0) {
    return 0;
  }
  const shared = [...left].reduce((count, token) => (right.has(token) ? count + 1 : count), 0);
  return (2 * shared) / (left.size + right.size);
}
/**
 * Blended text similarity: 70% coverage of `a` by `b`, 30% filtered Dice.
 *
 * Because coverage is asymmetric, swapping the arguments can change the
 * result.
 *
 * @param {string} a - Reference text.
 * @param {string} b - Candidate text.
 * @returns {number} Weighted sum of the two measures.
 */
export function similarityScore(a, b) {
  const coverageWeight = 0.7;
  const diceWeight = 0.3;
  return coverageWeight * coverage(a, b) + diceWeight * diceFiltered(a, b);
}
/**
 * Canonical form of a string for comparison purposes.
 *
 * Steps: lowercase, NFD-decompose and drop combining marks (removes accents),
 * replace everything except letters, digits, whitespace and hyphens with a
 * space, collapse whitespace runs, then trim.
 *
 * @param {string|null|undefined} s - Input text; nullish is treated as "".
 * @returns {string} The normalized text.
 */
export function normalize(s) {
  const lowered = (s ?? "").toLowerCase();
  // Decompose so that accents become separate combining marks, then drop them.
  const unaccented = lowered.normalize("NFD").replace(/[\u0300-\u036f]/g, "");
  const lettersOnly = unaccented.replace(/[^\p{L}\p{N}\s-]/gu, " ");
  return lettersOnly.replace(/\s+/g, " ").trim();
}
/**
 * Reduces a room label to a comparable form.
 *
 * When the label contains a room code (one letter followed by 2 to 4 digits,
 * e.g. "A263"), only that code is returned, uppercased. The code is matched
 * case-insensitively so that "salle a263" and "Salle A263" normalize to the
 * same value. Otherwise the trimmed label is returned lowercased.
 *
 * @param {string|null|undefined} s - Raw room label.
 * @returns {string|null} Room code, lowercased label, or null for empty input.
 */
export function normalizeSalle(s) {
  const label = (s ?? "").trim();
  if (!label) {
    return null;
  }
  const code = label.match(/\b([A-Z]\d{2,4})\b/i);
  return code ? code[1].toUpperCase() : label.toLowerCase();
}
/**
 * Scores how well a video matches an agenda entry.
 *
 * Four signals are computed: title similarity (best of agenda `objet`/`titre`
 * against the video title and every chapter label), time proximity, body
 * similarity and room equality. They are combined with the supplied weights.
 *
 * @param {{objet?: string, titre?: string, organe?: string, lieu?: string}} agenda
 *   Agenda entry.
 * @param {number} agendaTs - Agenda timestamp. NOTE(review): the difference
 *   with `videoEpoch` is divided by 60 to get minutes, so both are presumably
 *   epoch seconds — confirm.
 * @param {boolean} sameOrg - When truthy, `sameOrgBonus` replaces the weighted
 *   body similarity.
 * @param {{titleMin?: number, wTitle: number, wOrg: number, wSalle: number,
 *   wTime: number, sameOrgBonus: number}} w - Weights and title threshold.
 * @param {string} videoTitle - Video title.
 * @param {number} videoEpoch - Video timestamp (same unit as `agendaTs`).
 * @param {string[]} videoOrganes - Body names attached to the video.
 * @param {boolean} [timeAmbigious=false] - When truthy and the rooms do not
 *   match, the room and time terms are left out of the score.
 * @param {string} [salle] - Room label of the video.
 * @param {Array<{label: string}>} [chapterTitles] - Video chapters.
 * @returns {{score: number, signals: object}} Final score and raw signals;
 *   the score is 0 when the title similarity is below `titleMin`.
 */
export function scoreVideo(agenda, agendaTs, sameOrg, w, videoTitle, videoEpoch, videoOrganes, timeAmbigious = false, salle, chapterTitles) {
  const weights = w;
  const objet = agenda.objet || "";
  const titre = agenda.titre || "";
  // Best similarity of a label against either agenda field.
  const bestAgainst = (label) => Math.max(similarityScore(objet, label), similarityScore(titre, label));

  // Title: the video title or any chapter label may carry the match.
  let titleScore = bestAgainst(videoTitle || "");
  for (const chapter of chapterTitles || []) {
    titleScore = Math.max(titleScore, bestAgainst(chapter.label));
  }

  // Time: exponential decay, dividing the score by e for every hour apart.
  let timeScore = 0;
  if (agendaTs && videoEpoch) {
    const deltaMin = Math.abs(videoEpoch - agendaTs) / 60;
    timeScore = Math.exp(-deltaMin / 60);
  }

  // Body: best similarity between the agenda body and any video body.
  const orgScore = agenda.organe && videoOrganes?.length
    ? Math.max(...videoOrganes.map((organe) => similarityScore(agenda.organe, organe)))
    : 0;

  // Room: exact match on the normalized labels (e.g. "A263").
  const agendaSalle = normalizeSalle(agenda.lieu);
  const videoSalle = normalizeSalle(salle);
  const salleScore = agendaSalle && videoSalle && agendaSalle === videoSalle ? 1 : 0;

  const signals = { titleScore, orgScore, salleScore, timeScore, sameOrg, timeAmbigious };

  // Gate: reject outright when the title similarity is below the minimum.
  if (weights.titleMin && titleScore < weights.titleMin) {
    return { score: 0, signals };
  }

  const orgPart = sameOrg ? weights.sameOrgBonus : weights.wOrg * orgScore;
  const core = weights.wTitle * titleScore + orgPart;
  // Ambiguous time and no room match: keep only the title and body parts.
  const score = timeAmbigious && salleScore === 0
    ? core
    : core + weights.wSalle * salleScore + weights.wTime * timeScore;
  return { score, signals };
}