// Package: @tricoteuses/senat
// Version: (not captured in this listing)
// Description: Handle French Sénat's open data
// Listing size: 202 lines (201 loc) • 8.13 kB
// Language: JavaScript
// scripts/retrieve_senat_videos_from_agendas.ts
import assert from "assert";
import commandLineArgs from "command-line-args";
import fs from "fs-extra";
import fsp from "fs/promises";
import path from "path";
import * as git from "../git.js";
import { AGENDA_FOLDER, iterLoadSenatAgendas } from "../loaders";
import { getSessionsFromStart } from "../types/sessions";
import { commonOptions } from "./shared/cli_helpers";
import { getAgendaSegmentTimecodes, buildSenatVodMasterM3u8FromNvs } from "../utils/nvs-parsing";
import { epochToParisDateTime, isAmbiguousTimeOriginal, toTargetEpoch } from "../utils/date";
import { pathToFileURL } from "url";
import { fetchCandidatesForAgenda, fetchText } from "../videos/search";
import { matchAgendaToVideo } from "../videos/match";
import { SENAT_DATAS_ROOT, STATS, VIDEOS_ROOT_FOLDER, weights } from "../videos/config";
import { processBisIfNeeded, processOneReunionMatch, writeIfChanged } from "../videos";
// CLI option definitions: this script only declares the options shared by all scripts.
const optionsDefinitions = [...commonOptions];
// Parsed command-line options. The functions below read the keys
// "only-recent", "silent", "commit", "remote", "dataDir" and "fromSession".
const options = commandLineArgs(optionsDefinitions);
function shouldSkipAgenda(agenda) {
if (!agenda.captationVideo)
return true;
if (!agenda.date || !agenda.startTime)
return true;
if (agenda.uid.endsWith("Bis"))
return true; // Don't reprocess bis reunions
const agendaTs = toTargetEpoch(agenda.startTime, agenda.date);
const now = Date.now();
if (agendaTs && agendaTs * 1000 > now)
return true;
return false;
}
/**
 * Build the per-reunion processing context and make sure its output folder exists.
 *
 * The folder is `<dataDir>/<VIDEOS_ROOT_FOLDER>/<session>/<agenda.uid>`.
 *
 * @param agenda Agenda entry; reads `uid`, `startTime` and `date`.
 * @param session Session identifier, used as a folder name.
 * @param dataDir Root data directory.
 * @returns `{ session, dataDir, baseDir, reunionUid, agendaTs }`, where `agendaTs`
 *          is `null` when the agenda has no start time or no date.
 */
async function computeContext(agenda, session, dataDir) {
  const { uid: reunionUid, startTime, date } = agenda;
  let agendaTs = null;
  if (startTime && date) {
    agendaTs = toTargetEpoch(startTime, date);
  }
  const baseDir = path.join(dataDir, VIDEOS_ROOT_FOLDER, String(session), reunionUid);
  // Create the folder up front so later steps can write into it unconditionally.
  await fs.ensureDir(baseDir);
  return { session, dataDir, baseDir, reunionUid, agendaTs };
}
function shouldSkipDownload(agenda, baseDir) {
if (!options["only-recent"])
return false;
const now = Date.now();
const cutoff = now - options["only-recent"] * 24 * 3600 * 1000;
const reunionTs = Date.parse(agenda.date);
if (reunionTs >= cutoff)
return false;
const dataNvsPath = path.join(baseDir, "data.nvs");
const finalplayerNvsPath = path.join(baseDir, "finalplayer.nvs");
return fs.existsSync(dataNvsPath) && fs.existsSync(finalplayerNvsPath);
}
/**
 * Persist the result of an accepted agenda/video match in the reunion folder.
 *
 * Writes `metadata.json` (through `writeIfChanged`), then fetches the matched
 * video's `data.nvs` and `finalplayer.nvs` and saves each one that came back
 * non-empty.
 *
 * @param args `{ agenda, ctx, best, secondBest }`; `secondBest` may be null.
 */
async function writeMatchArtifacts(args) {
  const { agenda, ctx, best, secondBest } = args;
  // Flatten a matched candidate into the shape stored in metadata.json.
  const describeCandidate = (candidate) => {
    const parisDt = candidate.epoch ? epochToParisDateTime(candidate.epoch) : null;
    return {
      id: candidate.id,
      hash: candidate.hash,
      pageUrl: candidate.pageUrl,
      epoch: candidate.epoch ?? null,
      date: parisDt?.date ?? null,
      startTime: parisDt?.startTime ?? null,
      title: candidate.vtitle ?? null,
      score: candidate.score,
    };
  };
  const bestSummary = describeCandidate(best);
  const secondBestSummary = secondBest ? describeCandidate(secondBest) : null;
  const metadata = {
    reunionUid: ctx.reunionUid,
    session: ctx.session,
    accepted: true,
    threshold: weights.minAccept,
    agenda: {
      date: agenda.date,
      startTime: agenda.startTime,
      titre: agenda.titre,
      // An undefined value makes JSON.stringify omit the key entirely.
      organe: agenda.organe ?? undefined,
      uid: agenda.uid,
    },
    best: bestSummary,
    secondBest: secondBestSummary,
  };
  const metadataPath = path.join(ctx.baseDir, "metadata.json");
  await writeIfChanged(metadataPath, JSON.stringify(metadata, null, 2));
  // Both NVS files live under the same remote content folder of the matched video.
  const contentRoot = `${SENAT_DATAS_ROOT}/${best.id}_${best.hash}/content`;
  const dataTxt = await fetchText(`${contentRoot}/data.nvs`);
  const finalTxt = await fetchText(`${contentRoot}/finalplayer.nvs`);
  const downloads = [
    ["data.nvs", dataTxt],
    ["finalplayer.nvs", finalTxt],
  ];
  for (const [fileName, content] of downloads) {
    // An empty or missing response never overwrites an existing local file.
    if (content) {
      await fsp.writeFile(path.join(ctx.baseDir, fileName), content, "utf-8");
    }
  }
}
/**
 * Run the whole pipeline for one agenda entry (reunion).
 *
 * Steps: apply the skip rules, optionally match the agenda with a video and
 * store the match artifacts, then always refresh the agenda data from the
 * local NVS files, and finally hand over to the "bis" processing.
 *
 * @param agenda Agenda entry to process.
 * @param session Session the agenda belongs to.
 * @param dataDir Root data directory.
 * @param lastByVideo Map shared across the reunions of a session, passed on to the downstream helpers.
 */
async function processGroupedReunion(agenda, session, dataDir, lastByVideo) {
  // Step 1: guard rails.
  if (shouldSkipAgenda(agenda)) {
    return;
  }
  const ctx = await computeContext(agenda, session, dataDir);
  const skipDownload = shouldSkipDownload(agenda, ctx.baseDir);
  let best = null;
  let secondBest = null;
  // Step 2: match and download artifacts, unless the download is skipped.
  if (!skipDownload) {
    STATS.total++;
    const candidates = await fetchCandidatesForAgenda(agenda, options);
    if (!candidates) {
      console.log(`[warn] ${agenda.uid} No candidate found for this reunion. Probably VOD not published yet.`);
      return;
    }
    const match = await matchAgendaToVideo({ agenda, agendaTs: ctx.agendaTs, candidates, options });
    if (!match) {
      console.log(`[miss] ${agenda.uid} No match found for this reunion`);
      return;
    }
    best = match.best;
    secondBest = match.secondBest;
    STATS.accepted++;
    await writeMatchArtifacts({ agenda, ctx, best, secondBest });
  }
  // When the agenda's original time is ambiguous, use the matched video's start time instead.
  let effectiveAgenda = agenda;
  if (best && isAmbiguousTimeOriginal(agenda.events[0].timeOriginal)) {
    if (!options["silent"]) {
      console.log("If the time is ambiguous, update agenda startTime from matched video");
    }
    const videoStartTime = epochToParisDateTime(best.epoch)?.startTime;
    effectiveAgenda = { ...agenda, startTime: videoStartTime ?? agenda.startTime };
  }
  // Step 3: always update the agenda JSON from local NVS, even when `best` is null.
  await processOneReunionMatch({
    agenda: effectiveAgenda,
    best,
    baseDir: ctx.baseDir,
    dataDir: ctx.dataDir,
    session: ctx.session,
    options,
    writeIfChanged,
    lastByVideo,
    getAgendaSegmentTimecodes,
    buildSenatVodMasterM3u8FromNvs,
  });
  // Step 4: let the "bis" handler decide whether the second-best video needs processing.
  await processBisIfNeeded({
    agenda: effectiveAgenda,
    secondBest,
    ctx,
    skipDownload,
    options,
    lastByVideo,
    writeIfChanged,
    processOneReunionMatch,
    getAgendaSegmentTimecodes,
    buildSenatVodMasterM3u8FromNvs,
  });
}
/**
 * Process every agenda of every requested session, then commit the agenda folder.
 *
 * A failure on one agenda is logged and does not stop the remaining ones.
 *
 * @param dataDir Root data directory.
 * @param sessions Iterable of session identifiers to process.
 */
async function processAll(dataDir, sessions) {
  console.log("Process all Agendas and fetch video's url");
  for (const session of sessions) {
    // A fresh map per session, shared by all reunions of that session.
    const lastByVideo = new Map();
    for (const entry of iterLoadSenatAgendas(dataDir, session)) {
      const agenda = entry.item;
      try {
        await processGroupedReunion(agenda, session, dataDir, lastByVideo);
      } catch (err) {
        console.error(`[error] ${agenda?.uid ?? "unknown-uid"}:`, err?.message || err);
      }
    }
  }
  const agendaDir = path.join(dataDir, AGENDA_FOLDER);
  commitAndPushGit(agendaDir, options);
}
function commitAndPushGit(datasetDir, options) {
let exitCode = 10; // 0: some data changed, 10: no modification
if (options.commit) {
const errorCode = git.commitAndPush(datasetDir, "Nouvelle moisson", options.remote);
if ((exitCode === 10 && errorCode !== 10) || (exitCode === 0 && errorCode !== 0 && errorCode !== 10)) {
exitCode = errorCode;
}
}
}
/**
 * Script entry point: validate the CLI arguments, process all sessions from
 * the requested starting session, and print timing and acceptance statistics.
 */
async function main() {
  const { dataDir, fromSession } = options;
  assert(dataDir, "Missing argument: data directory");
  const sessions = getSessionsFromStart(fromSession);
  const timerLabel = "senat-agendas→videos processing time";
  console.time(timerLabel);
  await processAll(dataDir, sessions);
  console.timeEnd(timerLabel);
  const { total, accepted } = STATS;
  // Avoid a division by zero when no agenda went through the matching step.
  let ratio = "0.0";
  if (total) {
    ratio = ((accepted / total) * 100).toFixed(1);
  }
  console.log(`[summary] accepted=${accepted} / total=${total} (${ratio}%)`);
}
// Run main() only when this module is the script passed to node, not when it is imported.
if (pathToFileURL(process.argv[1]).href === import.meta.url) {
  main().catch((error) => {
    // Any unhandled failure is reported and turned into a non-zero exit status.
    console.error(error);
    process.exit(1);
  });
}