@tricoteuses/senat
Version:
Handle French Sénat's open data
357 lines (356 loc) • 13.6 kB
JavaScript
// File: tests/videoMatching.test.ts
// Video matching benchmark test (LIVE data) using a gold set of expected matches
// to tune weights for the matching algorithm.
// The test performs a grid search over a range of weight configurations,
// aiming to find configurations that yield zero WRONG matches while maximizing HITs.
// The gold set is stored in tests/fixtures/data/expected-video-matching.json
import { describe, it, expect } from "vitest";
import * as fs from "node:fs/promises";
import * as path from "node:path";
import { isAmbiguousTimeOriginal, toFRDate } from "../src/utils/date";
import { dice, getOrgKey, normalize, scoreVideo } from "../src/utils/scoring";
import { buildSenatVodMasterM3u8FromNvs, getLevel1Chapters, parseDataNvs } from "../src/utils/nvs-parsing";
import { extractCandidatesFromSearchHtml, fetchAllSearchPages, fetchBuffer, getAgendaType, SENAT_DATAS_ROOT, } from "../src/videos";
const LIVE_CACHE_DIR = path.join(process.cwd(), "tests", ".cache", "video-matching-live");
const FIXTURES_ROOT = path.join(process.cwd(), "tests", "fixtures", "data");
const GOLD_PATH = path.join(FIXTURES_ROOT, "expected-video-matching.json");
/**
 * Best-effort file read used for the on-disk cache.
 *
 * @param p Path of the file to read.
 * @returns The raw file contents, or `null` when the read fails for any
 *          reason (the error is intentionally discarded).
 */
async function readFileIfExists(p) {
    let contents = null;
    try {
        contents = await fs.readFile(p);
    }
    catch {
        // Any failure is treated as "nothing cached": keep the null default.
    }
    return contents;
}
/**
 * Writes `buf` to `p` by first writing a sibling temporary file and then
 * renaming it over the destination, so readers never observe a partially
 * written file at `p`.
 *
 * The parent directory is created if needed. The temporary name combines the
 * current timestamp with a random suffix so that two writes to the same path
 * within one millisecond do not share a temp file. If writing or renaming
 * fails, the temporary file is removed before the error is re-thrown.
 *
 * @param p   Destination path.
 * @param buf Data to write (anything accepted by `fs.writeFile`).
 * @throws Re-throws whatever `mkdir`, `writeFile` or `rename` rejected with.
 */
async function writeFileAtomic(p, buf) {
    await fs.mkdir(path.dirname(p), { recursive: true });
    const unique = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
    const tmp = `${p}.tmp-${unique}`;
    try {
        await fs.writeFile(tmp, buf);
        await fs.rename(tmp, p);
    }
    catch (err) {
        // Do not leave stray temp files behind; cleanup itself is best-effort.
        await fs.rm(tmp, { force: true }).catch(() => { });
        throw err;
    }
}
/**
 * Reads the file at `p` as UTF-8 text and parses it as JSON.
 *
 * @param p Path of the JSON file.
 * @returns The parsed value. Rejects if the file cannot be read or the text
 *          is not valid JSON.
 */
async function readJson(p) {
    const text = await fs.readFile(p, "utf-8");
    return JSON.parse(text);
}
/**
 * Loads the agenda fixture stored as `<uid>.json` under FIXTURES_ROOT.
 *
 * @param uid Identifier used as the fixture file's base name.
 * @returns The parsed JSON content of the fixture.
 */
async function loadAgenda(uid) {
    const fixtureFile = path.join(FIXTURES_ROOT, `${uid}.json`);
    return readJson(fixtureFile);
}
/**
 * Combines a date string and a time string into `<date>T<time>` and converts
 * the result to whole seconds since the Unix epoch via `Date.parse`.
 *
 * @param dateStr      Date part (left of the `T`).
 * @param startTimeStr Time part (right of the `T`).
 * @returns Epoch seconds (floored), or `null` when either part is falsy or
 *          the combined string cannot be parsed.
 */
function computeAgendaTsSeconds(dateStr, startTimeStr) {
    const hasBothParts = Boolean(dateStr) && Boolean(startTimeStr);
    if (!hasBothParts) {
        return null;
    }
    const millis = Date.parse(`${dateStr}T${startTimeStr}`);
    return Number.isNaN(millis) ? null : Math.floor(millis / 1000);
}
/**
 * Classifies a matcher result against the expected gold entry.
 *
 * @param picked   The chosen match (`{ m3u8, startSec? }`) or a falsy value
 *                 when nothing was picked.
 * @param expected The gold entry (`{ m3u8, startSec? }`).
 * @returns "MISS" when nothing was picked, "WRONG" when the m3u8 differs or
 *          when `expected.startSec` is a number and differs from
 *          `picked.startSec`, otherwise "HIT".
 */
function verdict(picked, expected) {
    if (!picked) {
        return "MISS";
    }
    const sameStream = picked.m3u8 === expected.m3u8;
    // The start offset is only checked when the gold entry specifies one.
    const startIsConstrained = typeof expected.startSec === "number";
    const startMatches = !startIsConstrained || picked.startSec === expected.startSec;
    return sameStream && startMatches ? "HIT" : "WRONG";
}
/**
 * Enumerates weight mixes `{ wTitle, wOrg, wSalle, wTime }` that sum to 1.
 *
 * Ranges explored (inclusive, in increments of `step`):
 *   - wTitle in [0.45, 0.80]
 *   - wOrg   in [0.10, 0.35]
 *   - wSalle in [0.00, 0.25]
 *   - wTime  = 1 - wTitle - wOrg - wSalle, kept only when in [0, 0.20]
 * (wTime and wSalle are allowed to be small.)
 *
 * Each axis value is derived from an integer index (`lo + i * step`) and
 * snapped to 12 decimals instead of being accumulated with `+= step`.
 * Repeated floating-point addition drifts (0.45 + 7 * 0.05 accumulates to
 * 0.8000000000000003), which silently dropped the upper endpoints and every
 * combination whose wTime was exactly 0 or 0.2.
 *
 * @param step Grid increment; must be a positive finite number.
 * @throws RangeError when `step` is not a positive finite number (raised on
 *         the first `next()` call, since this is a generator).
 */
function* ratioWeights(step = 0.05) {
    if (!Number.isFinite(step) || !(step > 0)) {
        throw new RangeError(`ratioWeights: step must be a positive finite number, got ${step}`);
    }
    const EPS = 1e-9;
    // Snap to 12 decimals to erase binary noise; "+ 0" turns -0 into 0.
    const snap = (x) => Math.round(x * 1e12) / 1e12 + 0;
    // Inclusive list of grid points lo, lo+step, ... not exceeding hi.
    const axis = (lo, hi) => {
        const lastIndex = Math.floor((hi - lo) / step + EPS);
        const values = [];
        for (let i = 0; i <= lastIndex; i++) {
            values.push(snap(lo + i * step));
        }
        return values;
    };
    const titleValues = axis(0.45, 0.8);
    const orgValues = axis(0.1, 0.35);
    const salleValues = axis(0.0, 0.25);
    for (const wTitle of titleValues) {
        for (const wOrg of orgValues) {
            for (const wSalle of salleValues) {
                // wTitle + wOrg + wSalle + wTime = 1
                const wTime = snap(1 - wTitle - wOrg - wSalle);
                if (wTime < 0 || wTime > 0.2)
                    continue;
                yield { wTitle, wOrg, wSalle, wTime };
            }
        }
    }
}
/**
 * Yields every configuration of the grid search: the cartesian product of
 * the threshold axes below, each combined with every weight mix produced by
 * `ratioWeights(0.05)`.
 *
 * Iteration order: `minAccept` varies slowest, then `margin`, `titleMin`,
 * `titleDominance`, `orgUncertainPenalty`, `sameOrgBonus`, and finally the
 * weight mix varies fastest.
 */
function* weightGrid() {
    // Declared from slowest-varying to fastest-varying axis.
    const axes = [
        { name: "minAccept", values: [0.2, 0.4, 0.5, 0.6, 0.7] },
        { name: "margin", values: [0.04, 0.06, 0.08, 0.1] },
        { name: "titleMin", values: [0.2, 0.3, 0.4] },
        { name: "titleDominance", values: [0, 0.5] },
        { name: "orgUncertainPenalty", values: [0.6, 0.8, 0.9] },
        { name: "sameOrgBonus", values: [0.1, 0.2, 0.3] },
    ];
    // Expand one axis at a time; earlier axes therefore change least often.
    const thresholdCombos = axes.reduce((partials, axis) => partials.flatMap((partial) => axis.values.map((value) => ({ ...partial, [axis.name]: value }))), [{}]);
    for (const thresholds of thresholdCombos) {
        for (const { wTitle, wOrg, wSalle, wTime } of ratioWeights(0.05)) {
            yield { ...thresholds, wTitle, wOrg, wSalle, wTime };
        }
    }
}
/**
 * Builds the lookup key of a candidate as `<id>_<hash>`.
 *
 * @param c Candidate object exposing `id` and `hash`.
 */
function candKey(c) {
    const { id, hash } = c;
    return `${id}_${hash}`;
}
/**
 * Location of the cached candidate list for one agenda:
 * `<LIVE_CACHE_DIR>/<uid>/candidates.json`.
 */
function candidatesCachePath(uid) {
    const segments = [LIVE_CACHE_DIR, uid, "candidates.json"];
    return path.join(...segments);
}
/**
 * Location of the cached `data.nvs` payload of candidate `c` for agenda `uid`:
 * `<LIVE_CACHE_DIR>/<uid>/nvs/<id>_<hash>.data.nvs`.
 */
function nvsCachePath(uid, c) {
    const fileName = `${c.id}_${c.hash}.data.nvs`;
    const nvsDir = path.join(LIVE_CACHE_DIR, uid, "nvs");
    return path.join(nvsDir, fileName);
}
/**
 * Gathers everything needed to score one agenda offline: the list of
 * candidate videos and the raw `data.nvs` text of each candidate.
 *
 * Both the candidate list and each `data.nvs` file are read from the cache
 * under LIVE_CACHE_DIR when present; otherwise they are fetched and then
 * written to the cache.
 *
 * @param uid    Agenda identifier, used as the cache sub-directory name.
 * @param agenda Agenda object; reads `date`, `startTime`, `events`, `organe`.
 * @returns `{ uid, agenda, agendaTs, timeAmbigious, candidates, nvsByKey }`
 *          where `nvsByKey` maps `candKey(candidate)` to the `data.nvs` text.
 *          Candidates whose `data.nvs` could not be fetched have no entry.
 */
async function prefetchForUid(uid, agenda) {
    const agendaTs = computeAgendaTsSeconds(agenda.date, agenda.startTime);
    const timeAmbigious = isAmbiguousTimeOriginal(agenda.events?.[0]?.timeOriginal);

    // Step 1: candidate list, from cache when available.
    const candidatesFile = candidatesCachePath(uid);
    const cachedList = await readFileIfExists(candidatesFile);
    let candidates;
    if (cachedList) {
        candidates = JSON.parse(cachedList.toString("utf-8"));
    }
    else {
        const searchParams = { search: "true", videotype: getAgendaType(agenda) };
        if (agenda.date) {
            // Restrict the search to the agenda's day.
            const frDate = toFRDate(agenda.date);
            Object.assign(searchParams, { period: "custom", begin: frDate, end: frDate });
        }
        if (agenda.organe) {
            Object.assign(searchParams, { organe: agenda.organe });
        }
        // The literal 3 is passed through as-is (presumably a page cap — confirm in fetchAllSearchPages).
        const pages = await fetchAllSearchPages(searchParams, 3);
        candidates = !pages.length ? [] : extractCandidatesFromSearchHtml(pages.join("\n"));
        const serialized = JSON.stringify(candidates, null, 2);
        await writeFileAtomic(candidatesFile, Buffer.from(serialized, "utf-8"));
    }

    // Step 2: data.nvs of every candidate, from cache when available.
    const nvsByKey = new Map();
    for (const candidate of candidates) {
        const key = candKey(candidate);
        const nvsFile = nvsCachePath(uid, candidate);
        let raw = await readFileIfExists(nvsFile);
        if (!raw) {
            raw = await fetchBuffer(`${SENAT_DATAS_ROOT}/${candidate.id}_${candidate.hash}/content/data.nvs`);
            if (!raw) {
                // Nothing fetched: leave this candidate without an entry.
                continue;
            }
            await writeFileAtomic(nvsFile, raw);
        }
        nvsByKey.set(key, raw.toString("utf-8"));
    }
    return { uid, agenda, agendaTs, timeAmbigious, candidates, nvsByKey };
}
/**
 * Derives per-candidate features from the prefetched `data.nvs` texts so the
 * grid search does not have to re-parse them for every configuration.
 *
 * Candidates without `data.nvs` text, or for which no m3u8 URL can be built,
 * get no feature entry.
 *
 * @param pref Result of `prefetchForUid`.
 * @returns A copy of `pref` with an extra `feats` Map keyed by
 *          `candKey(candidate)`. Each entry holds `m3u8`, `videoTitle`,
 *          `videoEpoch`, `videoOrganes`, `salle`, `chapterTitles`,
 *          `hasSameKey` and `bestDice`.
 */
async function addFeatures(pref) {
    const { agenda, candidates, nvsByKey } = pref;
    const agendaOrgNorm = agenda.organe ? normalize(agenda.organe) : null;
    const agendaKey = agendaOrgNorm ? getOrgKey(agendaOrgNorm) : null;
    const feats = new Map();
    for (const candidate of candidates) {
        const key = candKey(candidate);
        const dataStr = nvsByKey.get(key);
        if (!dataStr) {
            continue;
        }
        const m3u8 = buildSenatVodMasterM3u8FromNvs(dataStr);
        if (!m3u8) {
            continue;
        }
        const meta = parseDataNvs(dataStr);
        const chapterTitles = getLevel1Chapters(dataStr);
        // Mirrors production: for "séance publique" videos the first chapter label,
        // when present, replaces the search-result title.
        const useChapterLabel = candidate.isSeancePublique && meta.firstChapterLabel;
        const videoTitle = useChapterLabel ? meta.firstChapterLabel : candidate.title;
        // Organ similarity is precomputed here (as in the production loop):
        // best Dice score over all video organs, and whether any organ shares
        // the agenda's org key (the catch-all "autre" key never counts).
        let bestDice = 0;
        let hasSameKey = false;
        if (agendaOrgNorm && agendaKey && meta.organes?.length) {
            for (const organe of meta.organes) {
                const videoOrgNorm = normalize(organe);
                const videoKey = getOrgKey(videoOrgNorm);
                const similarity = dice(agendaOrgNorm, videoOrgNorm);
                if (videoKey !== "autre" && videoKey === agendaKey) {
                    hasSameKey = true;
                }
                bestDice = similarity > bestDice ? similarity : bestDice;
            }
        }
        feats.set(key, {
            m3u8,
            videoTitle,
            videoEpoch: meta.epoch,
            videoOrganes: meta.organes,
            salle: meta.salle,
            chapterTitles,
            hasSameKey,
            bestDice,
        });
    }
    return { ...pref, feats };
}
/**
 * Scores one candidate for one agenda under a given weight configuration,
 * delegating the raw score to `scoreVideo`.
 *
 * @param pref Prefetched agenda context (reads `agenda`, `agendaTs`, `timeAmbigious`).
 * @param f    Feature entry of the candidate, as built by `addFeatures`.
 * @param w    Weight configuration. Optional fields fall back to:
 *             `orgSkipDice` 0.8, `titleDominance` 0, `orgUncertainPenalty` 1,
 *             `spTitleMin` → `titleMin`.
 * @returns The (possibly penalised) score, or `null` when the candidate is
 *          rejected by the organ gate or the title-dominance gate.
 */
function scoreCandidateWithProd(pref, f, w) {
    const { agenda } = pref;
    const isSP = (agenda.type ?? "").toLowerCase().includes("séance publique");
    const orgSkipDice = w.orgSkipDice ?? 0.8;
    const titleDominance = w.titleDominance ?? 0;
    const orgUncertainPenalty = w.orgUncertainPenalty ?? 1;

    // Organ gate (as in production): only applies when both sides name an organ.
    const organsComparable = Boolean(agenda.organe && f.videoOrganes?.length);
    const sameOrg = organsComparable && Boolean(f.hasSameKey);
    if (organsComparable && !sameOrg && f.bestDice < orgSkipDice) {
        return null; // skipped, like production does
    }

    const scoringWeights = {
        wTitle: w.wTitle,
        wOrg: w.wOrg,
        wSalle: w.wSalle,
        wTime: w.wTime,
        sameOrgBonus: w.sameOrgBonus,
        // "Séance publique" agendas may use a dedicated title threshold.
        titleMin: (isSP ? w.spTitleMin : undefined) ?? w.titleMin,
    };
    const result = scoreVideo(agenda, pref.agendaTs, sameOrg, scoringWeights, f.videoTitle, f.videoEpoch, f.videoOrganes, pref.timeAmbigious, f.salle, f.chapterTitles);

    // Organ-uncertainty penalty (like production's matchOneReunion):
    // scale the score down when the organ key did not match.
    const finalScore = sameOrg ? result.score : result.score * orgUncertainPenalty;

    // Title-dominance gate: the title signal must reach the configured share of the score.
    const gateActive = titleDominance > 0;
    if (gateActive && result.signals.titleScore < titleDominance * finalScore) {
        return null;
    }
    return finalScore;
}
/**
 * Picks the best-scoring candidate of an agenda under configuration `w`, or
 * `null` when no candidate is convincing enough.
 *
 * A pick is rejected when:
 *   - no candidate could be scored;
 *   - the top score is below `minAccept` (default 0.75), raised by 0.05 when
 *     there are at least 10 candidates and by 0.08 when there are at least 20;
 *   - a runner-up exists and the gap to it is below the margin (default 0.08;
 *     for "séance publique" agendas `w.spMargin`, else at least 0.12).
 *
 * @returns `{ m3u8, score }` of the winner, or `null`.
 */
function pickBestFromPref(pref, w) {
    let top = null;
    let runnerUpScore = -Infinity;
    for (const candidate of pref.candidates) {
        const features = pref.feats.get(candKey(candidate));
        if (!features) {
            continue;
        }
        const score = scoreCandidateWithProd(pref, features, w);
        if (score == null) {
            continue;
        }
        if (top === null || score > top.score) {
            // The previous leader, if any, becomes the runner-up.
            if (top !== null) {
                runnerUpScore = top.score;
            }
            top = { m3u8: features.m3u8, score };
        }
        else if (score > runnerUpScore) {
            runnerUpScore = score;
        }
    }
    if (top === null) {
        return null;
    }

    // Acceptance threshold grows with the size of the candidate pool.
    const poolSize = pref.candidates.length;
    const crowdBump = poolSize >= 20 ? 0.08 : poolSize >= 10 ? 0.05 : 0;
    const minAccept = (w.minAccept ?? 0.75) + crowdBump;
    if (top.score < minAccept) {
        return null;
    }

    // Required lead over the runner-up.
    const marginBase = w.margin ?? 0.08;
    const isSP = (pref.agenda.type ?? "").toLowerCase().includes("séance publique");
    const margin = isSP ? (w.spMargin ?? Math.max(0.12, marginBase)) : marginBase;
    const hasRunnerUp = runnerUpScore !== -Infinity;
    if (hasRunnerUp && top.score - runnerUpScore < margin) {
        return null;
    }
    return top;
}
/**
 * Returns `v` when it is of type number (NaN included), otherwise `def`.
 *
 * @param v   Value to check.
 * @param def Fallback, 0 by default.
 */
function n(v, def = 0) {
    if (typeof v !== "number") {
        return def;
    }
    return v;
}
/**
 * Sort comparator for result rows that have no WRONG verdict.
 *
 * Criteria, applied in order until one differs:
 *   1. `hit` — higher first
 *   2. `miss` — lower first
 *   3. `minAccept` — higher first (non-numbers count as 0)
 *   4. `margin` — higher first (non-numbers count as 0)
 *
 * @returns Negative when `a` sorts before `b`, positive when after, 0 on a tie.
 */
function cmpZeroWrong(a, b) {
    const DESC = -1;
    const ASC = 1;
    const criteria = [
        [a.hit, b.hit, DESC],
        [a.miss, b.miss, ASC],
        [n(a.minAccept, 0), n(b.minAccept, 0), DESC],
        [n(a.margin, 0), n(b.margin, 0), DESC],
    ];
    for (const [left, right, direction] of criteria) {
        if (left !== right) {
            return direction === ASC ? left - right : right - left;
        }
    }
    return 0;
}
/**
 * Sort comparator ranking every result row, WRONG verdicts included.
 *
 * Criteria, applied in order until one differs:
 *   1. `wrong` — lower first
 *   2. `hit` — higher first
 *   3. `miss` — lower first
 *   4. `minAccept` — higher first (non-numbers count as 0)
 *   5. `margin` — higher first (non-numbers count as 0)
 *
 * @returns Negative when `a` sorts before `b`, positive when after, 0 on a tie.
 */
function cmpBestOverall(a, b) {
    const DESC = -1;
    const ASC = 1;
    const criteria = [
        [a.wrong, b.wrong, ASC],
        [a.hit, b.hit, DESC],
        [a.miss, b.miss, ASC],
        [n(a.minAccept, 0), n(b.minAccept, 0), DESC],
        [n(a.margin, 0), n(b.margin, 0), DESC],
    ];
    for (const [left, right, direction] of criteria) {
        if (left !== right) {
            return direction === ASC ? left - right : right - left;
        }
    }
    return 0;
}
describe("video matching LIVE benchmark (gold) — HIT/WRONG/MISS", () => {
    it("grid search (target WRONG=0, maximize HIT)", async () => {
        const gold = await readJson(GOLD_PATH);

        // Phase 1: prefetch candidates and compute features once per gold entry.
        const prefByUid = new Map();
        for (const { reunionUid } of gold) {
            const agenda = await loadAgenda(reunionUid);
            const prefetched = await prefetchForUid(reunionUid, agenda);
            const enriched = await addFeatures(prefetched);
            prefByUid.set(reunionUid, enriched);
            const candidateCount = enriched.candidates.length;
            console.log(candidateCount === 0
                ? `[MISS-CANDIDATES] ${reunionUid} → no candidates from search`
                : `[CANDIDATES] ${reunionUid} → ${candidateCount}`);
        }

        // Phase 2: evaluate every configuration of the grid against the gold set.
        const rows = [];
        for (const w of weightGrid()) {
            const tally = { hit: 0, wrong: 0, miss: 0 };
            for (const item of gold) {
                // Only the m3u8 is compared here; start offsets are not checked.
                const expected = { m3u8: item.expected.m3u8 };
                const picked = pickBestFromPref(prefByUid.get(item.reunionUid), w);
                const outcome = verdict(picked ? { m3u8: picked.m3u8 } : null, expected);
                if (outcome === "WRONG") {
                    // Early stop: the target is WRONG=0, so one WRONG disqualifies the
                    // configuration. Its hit/miss counts are therefore partial and
                    // `wrong` never exceeds 1.
                    tally.wrong++;
                    break;
                }
                if (outcome === "HIT") {
                    tally.hit++;
                }
                else {
                    tally.miss++;
                }
            }
            rows.push({ ...w, ...tally });
        }

        // Strict selection: configurations without any WRONG verdict.
        const zeroWrong = rows.filter((row) => row.wrong === 0).sort(cmpZeroWrong);
        console.log("Top configs (WRONG=0) — sorted by: HIT desc, MISS asc, minAccept desc, margin desc");
        console.table(zeroWrong.slice(0, 15));

        // Sort a copy so `rows` keeps its grid order.
        const [bestOverall] = [...rows].sort(cmpBestOverall);
        console.log("Best overall (min WRONG, max HIT, min MISS, max minAccept, max margin):", bestOverall);

        // At least one configuration must produce no WRONG verdict.
        expect(zeroWrong.length).toBeGreaterThan(0);
        expect(zeroWrong[0].wrong).toBe(0);
    }, 20 * 60 * 1000);
});