UNPKG

@tricoteuses/senat

Version:

Handle French Sénat's open data

357 lines (356 loc) 13.6 kB
// File: tests/videoMatching.test.ts
// Video matching benchmark test (LIVE data) using a gold set of expected matches
// to tune weights for the matching algorithm.
// The test performs a grid search over a range of weight configurations,
// aiming to find configurations that yield zero WRONG matches while maximizing HITs.
// The gold set is stored in tests/fixtures/data/expected-video-matching.json
import { describe, it, expect } from "vitest";
import * as fs from "node:fs/promises";
import * as path from "node:path";
import { isAmbiguousTimeOriginal, toFRDate } from "../src/utils/date";
import { dice, getOrgKey, normalize, scoreVideo } from "../src/utils/scoring";
import { buildSenatVodMasterM3u8FromNvs, getLevel1Chapters, parseDataNvs } from "../src/utils/nvs-parsing";
import {
  extractCandidatesFromSearchHtml,
  fetchAllSearchPages,
  fetchBuffer,
  getAgendaType,
  SENAT_DATAS_ROOT,
} from "../src/videos";

// On-disk cache for live search results and data.nvs payloads, so reruns avoid refetching.
const LIVE_CACHE_DIR = path.join(process.cwd(), "tests", ".cache", "video-matching-live");
// Directory holding the agenda fixtures (one `<uid>.json` per reunion) and the gold set.
const FIXTURES_ROOT = path.join(process.cwd(), "tests", "fixtures", "data");
const GOLD_PATH = path.join(FIXTURES_ROOT, "expected-video-matching.json");

/**
 * Reads a file and returns its content as a Buffer, or `null` when the read fails.
 * Any read error (not only "file missing") is treated as a cache miss.
 */
async function readFileIfExists(p) {
  try {
    return await fs.readFile(p);
  } catch {
    return null;
  }
}

/**
 * Writes `buf` to `p` through a temporary sibling file followed by a rename,
 * creating the parent directory first.
 */
async function writeFileAtomic(p, buf) {
  await fs.mkdir(path.dirname(p), { recursive: true });
  const tmp = `${p}.tmp-${Date.now()}`;
  await fs.writeFile(tmp, buf);
  await fs.rename(tmp, p);
}

/** Reads `p` as UTF-8 and parses it as JSON. Rejects if the file is missing or malformed. */
async function readJson(p) {
  return JSON.parse(await fs.readFile(p, "utf-8"));
}

/** Loads the agenda fixture `<uid>.json` from FIXTURES_ROOT. */
async function loadAgenda(uid) {
  const p = path.join(FIXTURES_ROOT, uid + ".json");
  return readJson(p);
}

/**
 * Combines a date string and a start-time string as `${date}T${time}` and returns
 * the parsed timestamp in whole seconds, or `null` when either part is missing
 * or the combined string cannot be parsed by `Date.parse`.
 */
function computeAgendaTsSeconds(dateStr, startTimeStr) {
  if (!dateStr || !startTimeStr) return null;
  // NOTE(review): no timezone is appended here, so the result presumably depends on
  // the machine's local timezone — confirm this matches the production computation.
  const iso = `${dateStr}T${startTimeStr}`;
  const ms = Date.parse(iso);
  if (Number.isNaN(ms)) return null;
  return Math.floor(ms / 1000);
}

/**
 * Classifies a pick against the expected match.
 *
 * @returns "MISS" when nothing was picked; "WRONG" when the m3u8 differs or, if
 *   `expected.startSec` is a number, when `picked.startSec` differs; "HIT" otherwise.
 */
function verdict(picked, expected) {
  if (!picked) return "MISS";
  if (picked.m3u8 !== expected.m3u8) return "WRONG";
  if (typeof expected.startSec === "number" && picked.startSec !== expected.startSec) return "WRONG";
  return "HIT";
}

/**
 * Rounds to 6 decimals to strip binary floating-point noise from grid values
 * (e.g. 0.6000000000000001 -> 0.6). The trailing `+ 0` turns -0 into +0.
 */
function roundWeight(x) {
  return Math.round(x * 1e6) / 1e6 + 0;
}

/**
 * Yields min, min + step, ... up to and including max (when max lies on the grid).
 * Values are derived from an integer index rather than by repeated addition, so
 * rounding error does not accumulate and the upper bound is not skipped.
 */
function* gridSteps(min, max, step) {
  // Small epsilon so a quotient like 6.999999999999999 still counts as 7 steps.
  const count = Math.floor((max - min) / step + 1e-9);
  for (let i = 0; i <= count; i++) {
    yield roundWeight(min + i * step);
  }
}

/**
 * Enumerates weight ratios such that wTitle + wOrg + wSalle + wTime = 1, with
 * wTitle in [0.45, 0.8], wOrg in [0.1, 0.35], wSalle in [0, 0.25] and the
 * remainder wTime restricted to [0, 0.2] (wTime and wSalle may be small).
 *
 * @param step grid spacing; must be a positive number.
 * @throws RangeError when `step` is not positive.
 */
function* ratioWeights(step = 0.05) {
  if (!(step > 0)) {
    throw new RangeError(`ratioWeights: step must be > 0 (got ${step})`);
  }
  for (const wTitle of gridSteps(0.45, 0.8, step)) {
    for (const wOrg of gridSteps(0.1, 0.35, step)) {
      for (const wSalle of gridSteps(0, 0.25, step)) {
        // Rounded so that combinations summing to exactly 1 give wTime === 0
        // instead of a tiny negative value that would be rejected below.
        const wTime = roundWeight(1 - wTitle - wOrg - wSalle);
        if (wTime < 0 || wTime > 0.2) continue;
        yield { wTitle, wOrg, wSalle, wTime };
      }
    }
  }
}

/**
 * Enumerates every configuration of the grid search: the cartesian product of the
 * threshold lists below with every ratio produced by `ratioWeights(0.05)`.
 */
function* weightGrid() {
  const minAcceptList = [0.2, 0.4, 0.5, 0.6, 0.7];
  const marginList = [0.04, 0.06, 0.08, 0.1];
  const titleMinList = [0.2, 0.3, 0.4];
  const titleDominanceList = [0, 0.5];
  const orgPenaltyList = [0.6, 0.8, 0.9];
  const sameOrgBonusList = [0.1, 0.2, 0.3];
  for (const minAccept of minAcceptList) {
    for (const margin of marginList) {
      for (const titleMin of titleMinList) {
        for (const titleDominance of titleDominanceList) {
          for (const orgUncertainPenalty of orgPenaltyList) {
            for (const sameOrgBonus of sameOrgBonusList) {
              for (const { wTitle, wOrg, wSalle, wTime } of ratioWeights(0.05)) {
                yield {
                  minAccept,
                  margin,
                  titleMin,
                  titleDominance,
                  orgUncertainPenalty,
                  sameOrgBonus,
                  wTitle,
                  wOrg,
                  wSalle,
                  wTime,
                };
              }
            }
          }
        }
      }
    }
  }
}

/** Key identifying a candidate video: `<id>_<hash>`. */
function candKey(c) {
  return `${c.id}_${c.hash}`;
}

/** Cache location of the candidate list for a reunion uid. */
function candidatesCachePath(uid) {
  return path.join(LIVE_CACHE_DIR, uid, "candidates.json");
}

/** Cache location of a candidate's data.nvs payload for a reunion uid. */
function nvsCachePath(uid, c) {
  return path.join(LIVE_CACHE_DIR, uid, "nvs", `${candKey(c)}.data.nvs`);
}

/**
 * Gathers everything needed to score one agenda: the candidate videos returned by the
 * search and the data.nvs text of each candidate. Both are read from the on-disk cache
 * when present, otherwise fetched and then cached.
 *
 * @returns `{ uid, agenda, agendaTs, timeAmbigious, candidates, nvsByKey }` where
 *   `nvsByKey` maps `candKey(candidate)` to the data.nvs content as a UTF-8 string.
 */
async function prefetchForUid(uid, agenda) {
  const agendaTs = computeAgendaTsSeconds(agenda.date, agenda.startTime);
  const timeAmbigious = isAmbiguousTimeOriginal(agenda.events?.[0]?.timeOriginal);

  // 1) candidates (cached)
  let candidates = [];
  const candCache = await readFileIfExists(candidatesCachePath(uid));
  if (candCache) {
    candidates = JSON.parse(candCache.toString("utf-8"));
  } else {
    const searchParams = {
      search: "true",
      videotype: getAgendaType(agenda),
    };
    if (agenda.date) {
      // Restrict the search to the agenda's own day.
      const fr = toFRDate(agenda.date);
      searchParams.period = "custom";
      searchParams.begin = fr;
      searchParams.end = fr;
    }
    if (agenda.organe) searchParams.organe = agenda.organe;
    // NOTE(review): the second argument is presumably a page limit — confirm in ../src/videos.
    const pages = await fetchAllSearchPages(searchParams, 3);
    candidates = pages.length ? extractCandidatesFromSearchHtml(pages.join("\n")) : [];
    await writeFileAtomic(candidatesCachePath(uid), Buffer.from(JSON.stringify(candidates, null, 2), "utf-8"));
  }

  // 2) data.nvs for each candidate (cached)
  const nvsByKey = new Map();
  for (const c of candidates) {
    const key = candKey(c);
    const p = nvsCachePath(uid, c);
    const cached = await readFileIfExists(p);
    if (cached) {
      nvsByKey.set(key, cached.toString("utf-8"));
      continue;
    }
    const url = `${SENAT_DATAS_ROOT}/${c.id}_${c.hash}/content/data.nvs`;
    const buf = await fetchBuffer(url);
    // A candidate whose data.nvs could not be fetched is left out of the map.
    if (!buf) continue;
    await writeFileAtomic(p, buf);
    nvsByKey.set(key, buf.toString("utf-8"));
  }

  return { uid, agenda, agendaTs, timeAmbigious, candidates, nvsByKey };
}

/**
 * Precomputes, once per candidate, the features that do not depend on the weights
 * (m3u8 URL, title, epoch, organes, salle, chapter titles, organe similarity), so the
 * grid search only has to re-run the scoring.
 *
 * Candidates without data.nvs content, or for which no m3u8 can be built, get no entry.
 *
 * @returns the input `pref` extended with `feats`, a Map keyed by `candKey(candidate)`.
 */
async function addFeatures(pref) {
  const feats = new Map();
  const agendaOrgNorm = pref.agenda.organe ? normalize(pref.agenda.organe) : null;
  const agendaKey = agendaOrgNorm ? getOrgKey(agendaOrgNorm) : null;

  for (const c of pref.candidates) {
    const dataStr = pref.nvsByKey.get(candKey(c));
    if (!dataStr) continue;
    const m3u8 = buildSenatVodMasterM3u8FromNvs(dataStr);
    if (!m3u8) continue;
    const meta = parseDataNvs(dataStr);
    const chapterTitles = getLevel1Chapters(dataStr);

    // same as prod: SP title override
    let videoTitle = c.title;
    if (c.isSeancePublique && meta.firstChapterLabel) {
      videoTitle = meta.firstChapterLabel;
    }

    // precompute org similarity (like prod loop)
    let bestDice = 0;
    let hasSameKey = false;
    if (agendaOrgNorm && agendaKey && meta.organes?.length) {
      for (const vo of meta.organes) {
        const videoOrgNorm = normalize(vo);
        const videoKey = getOrgKey(videoOrgNorm);
        const d = dice(agendaOrgNorm, videoOrgNorm);
        // The "autre" key never counts as a same-organe match.
        if (videoKey === agendaKey && videoKey !== "autre") hasSameKey = true;
        if (d > bestDice) bestDice = d;
      }
    }

    feats.set(candKey(c), {
      m3u8,
      videoTitle,
      videoEpoch: meta.epoch,
      videoOrganes: meta.organes,
      salle: meta.salle,
      chapterTitles,
      hasSameKey,
      bestDice,
    });
  }

  return { ...pref, feats };
}

/** True when the agenda's `type` contains "séance publique" (case-insensitive). */
function isSeancePublique(agenda) {
  return (agenda.type ?? "").toLowerCase().includes("séance publique");
}

/**
 * Scores one candidate's precomputed features `f` under the weight configuration `w`.
 *
 * @returns the final score, or `null` when the candidate is rejected by the organe
 *   gate or by the title-dominance gate.
 */
function scoreCandidateWithProd(pref, f, w) {
  const agenda = pref.agenda;
  const isSP = isSeancePublique(agenda);
  const orgSkipDice = w.orgSkipDice ?? 0.8;
  const titleDominance = w.titleDominance ?? 0;
  const orgUncertainPenalty = w.orgUncertainPenalty ?? 1;

  // ---- same org gate (same as prod)
  let sameOrg = false;
  if (agenda.organe && f.videoOrganes?.length) {
    if (f.hasSameKey) sameOrg = true;
    else if (f.bestDice < orgSkipDice) return null; // skipped like prod
  }

  const vw = {
    wTitle: w.wTitle,
    wOrg: w.wOrg,
    wSalle: w.wSalle,
    wTime: w.wTime,
    sameOrgBonus: w.sameOrgBonus,
    // Séance publique may use its own title threshold when one is configured.
    titleMin: isSP ? (w.spTitleMin ?? w.titleMin) : w.titleMin,
  };

  const { score: rawScore, signals } = scoreVideo(
    agenda,
    pref.agendaTs,
    sameOrg,
    vw,
    f.videoTitle,
    f.videoEpoch,
    f.videoOrganes,
    pref.timeAmbigious,
    f.salle,
    f.chapterTitles,
  );

  // org uncertainty penalty (like prod matchOneReunion)
  const s = sameOrg ? rawScore : rawScore * orgUncertainPenalty;

  // title dominance gate
  if (titleDominance > 0 && signals.titleScore < titleDominance * s) return null;

  return s;
}

/**
 * Picks the best-scoring candidate for one agenda under configuration `w`.
 *
 * @returns `{ m3u8, score }` of the winner, or `null` when no candidate scores, when the
 *   best score is below the acceptance threshold, or when the runner-up is within `margin`.
 */
function pickBestFromPref(pref, w) {
  let best = null;
  let secondScore = -Infinity;

  for (const c of pref.candidates) {
    const f = pref.feats.get(candKey(c));
    if (!f) continue;
    const s = scoreCandidateWithProd(pref, f, w);
    if (s == null) continue;
    if (!best || s > best.score) {
      // The previous best becomes the runner-up.
      if (best) secondScore = best.score;
      best = { m3u8: f.m3u8, score: s };
    } else if (s > secondScore) {
      secondScore = s;
    }
  }
  if (!best) return null;

  // The acceptance threshold is raised when there are many candidates to choose from.
  const minAcceptBase = w.minAccept ?? 0.75;
  const minAccept =
    minAcceptBase + (pref.candidates.length >= 20 ? 0.08 : pref.candidates.length >= 10 ? 0.05 : 0);
  if (best.score < minAccept) return null;

  const marginBase = w.margin ?? 0.08;
  const margin = isSeancePublique(pref.agenda) ? (w.spMargin ?? Math.max(0.12, marginBase)) : marginBase;
  if (secondScore !== -Infinity && best.score - secondScore < margin) return null;

  return best;
}

/** Returns `v` when it is a number, otherwise the fallback `def`. */
function n(v, def = 0) {
  return typeof v === "number" ? v : def;
}

/**
 * Ordering for configurations that produced no WRONG:
 * HIT desc, then MISS asc, then minAccept desc, then margin desc.
 */
function cmpZeroWrong(a, b) {
  // 1) HIT maximal
  if (a.hit !== b.hit) return b.hit - a.hit;
  // 2) MISS minimal
  if (a.miss !== b.miss) return a.miss - b.miss;
  // 3) minAccept maximal
  const aMin = n(a.minAccept, 0);
  const bMin = n(b.minAccept, 0);
  if (aMin !== bMin) return bMin - aMin;
  // 4) margin maximal
  const aMar = n(a.margin, 0);
  const bMar = n(b.margin, 0);
  if (aMar !== bMar) return bMar - aMar;
  return 0;
}

/**
 * Ordering across all configurations: WRONG asc first, then the same criteria
 * as `cmpZeroWrong` (HIT desc, MISS asc, minAccept desc, margin desc).
 */
function cmpBestOverall(a, b) {
  // 1) WRONG minimal
  if (a.wrong !== b.wrong) return a.wrong - b.wrong;
  // 2..5) identical tie-breakers
  return cmpZeroWrong(a, b);
}

describe("video matching LIVE benchmark (gold) — HIT/WRONG/MISS", () => {
  it(
    "grid search (target WRONG=0, maximize HIT)",
    async () => {
      const gold = await readJson(GOLD_PATH);

      // Prefetch + features
      const prefByUid = new Map();
      for (const item of gold) {
        const agenda = await loadAgenda(item.reunionUid);
        const pref = await prefetchForUid(item.reunionUid, agenda);
        const withF = await addFeatures(pref);
        prefByUid.set(item.reunionUid, withF);
        if (withF.candidates.length === 0) {
          console.log(`[MISS-CANDIDATES] ${item.reunionUid} → no candidates from search`);
        } else {
          console.log(`[CANDIDATES] ${item.reunionUid} → ${withF.candidates.length}`);
        }
      }

      const rows = [];
      for (const w of weightGrid()) {
        let hit = 0,
          wrong = 0,
          miss = 0;
        // EARLY STOP: target WRONG=0
        // Because of the break, rows with wrong > 0 carry only partial hit/miss counts
        // (wrong is at most 1); only rows with wrong === 0 were evaluated on the full gold set.
        for (const item of gold) {
          const expected = { m3u8: item.expected.m3u8 };
          const pref = prefByUid.get(item.reunionUid);
          const best = pickBestFromPref(pref, w);
          const v = verdict(best ? { m3u8: best.m3u8 } : null, expected);
          if (v === "HIT") hit++;
          else if (v === "WRONG") {
            wrong++;
            break;
          } else miss++;
        }
        rows.push({ ...w, hit, wrong, miss });
      }

      // STRICT: WRONG = 0
      const zeroWrong = rows.filter((r) => r.wrong === 0).sort(cmpZeroWrong);
      console.log("Top configs (WRONG=0) — sorted by: HIT desc, MISS asc, minAccept desc, margin desc");
      console.table(zeroWrong.slice(0, 15));

      const bestOverall = rows.slice().sort(cmpBestOverall)[0];
      console.log("Best overall (min WRONG, max HIT, min MISS, max minAccept, max margin):", bestOverall);

      // you want at least one config with WRONG=0
      expect(zeroWrong.length).toBeGreaterThan(0);
      expect(zeroWrong[0].wrong).toBe(0);
    },
    20 * 60_000,
  );
});